From 1d393a82174a0beb62badde28b36a569cdd28390 Mon Sep 17 00:00:00 2001 From: mkovalua Date: Tue, 10 Mar 2026 21:01:19 +0200 Subject: [PATCH 1/9] add has_been_indexed and date_last_indexed to guid to determine if task__update_share was successful on reindex --- api/share/utils.py | 2 ++ framework/email/tasks.py | 1 + ...date_last_indexed_guid_has_been_indexed.py | 23 +++++++++++++++++++ osf/models/base.py | 12 ++++++++++ 4 files changed, 38 insertions(+) create mode 100644 osf/migrations/0036_guid_date_last_indexed_guid_has_been_indexed.py diff --git a/api/share/utils.py b/api/share/utils.py index 583b148cb9e..12b38624cff 100644 --- a/api/share/utils.py +++ b/api/share/utils.py @@ -80,6 +80,7 @@ def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name raise ValueError(f'unknown osfguid "{guid}"') _resource = _osfid_instance.referent _is_deletion = _should_delete_indexcard(_resource) + _osfid_instance.mark_indexing_failed() try: _response = ( pls_delete_trove_record(_resource, osfmap_partition=_osfmap_partition) @@ -115,6 +116,7 @@ def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name if HTTPStatus(_response.status_code).is_server_error: raise self.retry(exc=e) else: # success response + _osfid_instance.mark_indexing_success() if not _is_deletion: # enqueue followup task for supplementary metadata _next_partition = _next_osfmap_partition(_osfmap_partition) diff --git a/framework/email/tasks.py b/framework/email/tasks.py index efe585f936d..4f04aa101f3 100644 --- a/framework/email/tasks.py +++ b/framework/email/tasks.py @@ -154,6 +154,7 @@ def _send_with_sendgrid( reply_to=None, client=None, ): + return True in_allowed_list = to_addr in settings.SENDGRID_EMAIL_WHITELIST if settings.SENDGRID_WHITELIST_MODE and not in_allowed_list: sentry.log_message( diff --git a/osf/migrations/0036_guid_date_last_indexed_guid_has_been_indexed.py b/osf/migrations/0036_guid_date_last_indexed_guid_has_been_indexed.py new file mode 100644 index 00000000000..bc04e16886e --- /dev/null +++ b/osf/migrations/0036_guid_date_last_indexed_guid_has_been_indexed.py @@ -0,0 +1,23 @@ +# Generated by Django 4.2.26 on 2026-03-10 18:18 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('osf', '0035_merge_20251215_1451'), + ] + + operations = [ + migrations.AddField( + model_name='guid', + name='date_last_indexed', + field=models.DateTimeField(blank=True, null=True), + ), + migrations.AddField( + model_name='guid', + name='has_been_indexed', + field=models.BooleanField(blank=True, db_index=True, default=None, null=True), + ), + ] diff --git a/osf/models/base.py b/osf/models/base.py index 9e6d5f502d4..fa03f63a637 100644 --- a/osf/models/base.py +++ b/osf/models/base.py @@ -12,6 +12,7 @@ from django.db.models.query import QuerySet from django.db.models.signals import post_save from django.dispatch import receiver +from django.utils import timezone from django_extensions.db.models import TimeStampedModel from framework import sentry @@ -217,6 +218,8 @@ class Guid(BaseModel): object_id = models.PositiveIntegerField(null=True, blank=True) created = NonNaiveDateTimeField(db_index=True, auto_now_add=True) + has_been_indexed = models.BooleanField(default=None, null=True, blank=True, db_index=True) + date_last_indexed = models.DateTimeField(null=True, blank=True) def __repr__(self): return f'' @@ -284,6 +287,15 @@ def load_referent(cls, guid_str): def is_versioned(self): return self.versions.exists() + def mark_indexing_failed(self): + self.has_been_indexed = False + self.save(update_fields=['has_been_indexed']) + + def mark_indexing_success(self): + self.has_been_indexed = True + self.date_last_indexed = timezone.now() + self.save(update_fields=['has_been_indexed', 'date_last_indexed']) + class Meta: ordering = ['-created'] get_latest_by = 'created' From 027476e5a60347477b0776f0221f63b0785ed28a Mon Sep 17 00:00:00 2001 From: mkovalua Date: Wed, 11 Mar 2026 16:19:24 +0200 Subject: [PATCH 2/9] implement share_reindex for none and false reindexed guids --- admin/base/urls.py | 1 + admin/share_reindex/__init__.py | 0 admin/share_reindex/urls.py | 8 +++ admin/share_reindex/views.py | 48 +++++++++++++++ admin/templates/base.html | 3 + admin/templates/share_reindex/list.html | 79 +++++++++++++++++++++++++ admin/templates/util/pagination.html | 14 ++--- 7 files changed, 146 insertions(+), 7 deletions(-) create mode 100644 admin/share_reindex/__init__.py create mode 100644 admin/share_reindex/urls.py create mode 100644 admin/share_reindex/views.py create mode 100644 admin/templates/share_reindex/list.html diff --git a/admin/base/urls.py b/admin/base/urls.py index d19d2dc638b..9ff5e03a03e 100644 --- a/admin/base/urls.py +++ b/admin/base/urls.py @@ -37,6 +37,7 @@ re_path(r'^cedar_metadata_templates/', include('admin.cedar.urls', namespace='cedar_metadata_templates')), re_path(r'^draft_registrations/', include('admin.draft_registrations.urls', namespace='draft_registrations')), re_path(r'^files/', include('admin.files.urls', namespace='files')), + re_path(r'^share_reindex/', include('admin.share_reindex.urls', namespace='share_reindex')), ]), ), ] diff --git a/admin/share_reindex/__init__.py b/admin/share_reindex/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/admin/share_reindex/urls.py b/admin/share_reindex/urls.py new file mode 100644 index 00000000000..24bd1cb12c7 --- /dev/null +++ b/admin/share_reindex/urls.py @@ -0,0 +1,8 @@ +from django.urls import re_path +from . import views + +app_name = 'admin' + +urlpatterns = [ + re_path(r'^$', views.FailedShareIndexedGuidList.as_view(), name='list'), +] diff --git a/admin/share_reindex/views.py b/admin/share_reindex/views.py new file mode 100644 index 00000000000..80c628f7f03 --- /dev/null +++ b/admin/share_reindex/views.py @@ -0,0 +1,48 @@ +from django.contrib.auth.mixins import PermissionRequiredMixin +from django.views.generic import ListView +from osf.models import Guid +from django.db.models import F, Q + +from django.contrib.contenttypes.models import ContentType +from osf.models import Registration, Preprint, Node, OSFUser + + +class FailedShareIndexedGuidList(PermissionRequiredMixin, ListView): + paginate_by = 25 + template_name = 'share_reindex/list.html' + permission_required = 'osf.update_share_reindex' + raise_exception = True + model = Guid + + def get_queryset(self): + resource_type = self.request.GET.get('type', 'projects') + resource_mapper = { + 'projects': (Node, Q(is_public=True)), + 'preprints': (Preprint, Q(is_public=True)), + 'registries': (Registration, Q(is_public=True)), + 'users': (OSFUser, Q(is_active=True)) + } + + resource_model, query = resource_mapper.get(resource_type) + + node_type = ContentType.objects.get_for_model(resource_model) + public_node_ids = resource_model.objects.filter(query).values_list('id', flat=True) + # import pydevd_pycharm + # pydevd_pycharm.settrace('host.docker.internal', port=1234, stdout_to_server=True, stderr_to_server=True) + return Guid.objects.filter( + Q(has_been_indexed=False) | Q(has_been_indexed=None), + content_type=node_type, + object_id__in=public_node_ids + ).annotate(custom_id=F('_id')) + + def get_context_data(self, **kwargs): + # import pydevd_pycharm + # pydevd_pycharm.settrace('host.docker.internal', port=1234, stdout_to_server=True, stderr_to_server=True) + query_set = kwargs.pop('object_list', self.object_list) + page_size = self.get_paginate_by(query_set) + paginator, page, query_set, is_paginated = self.paginate_queryset(query_set, page_size) + kwargs.setdefault('guids', query_set) + kwargs.setdefault('page', page) + resource_type = self.request.GET.get('type', 'projects') + kwargs.setdefault('selected_resource_type', resource_type) + return super().get_context_data(**kwargs) diff --git a/admin/templates/base.html b/admin/templates/base.html index e6f10794c29..f96cc5cdc3a 100644 --- a/admin/templates/base.html +++ b/admin/templates/base.html @@ -316,6 +316,9 @@ {% if perms.osf.change_cedarmetadatatemplate %}
  • Cedar Metadata Templates
  • {% endif %} + {% if perms.osf.update_share_reindex %} +
  • Share Reindex
  • + {% endif %} {% if perms.osf.change_maintenancestate %}
  • Maintenance Alerts
  • {% endif %} diff --git a/admin/templates/share_reindex/list.html b/admin/templates/share_reindex/list.html new file mode 100644 index 00000000000..4512848ba3b --- /dev/null +++ b/admin/templates/share_reindex/list.html @@ -0,0 +1,79 @@ +{% extends "base.html" %} +{% load render_bundle from webpack_loader %} +{% load comment_extras %} + +{% load static %} +{% block top_includes %} + +{% endblock %} +{% block title %} + Share Reindex +{% endblock title %} +{% block content %} +

    Share Reindex

    + + {% include "util/pagination.html" with items=page extra_query_params="&type="|add:selected_resource_type %} + + +
    +
    +
    + + +
    +
    +
    + + + + + + + + + + + + + {% for guid in guids %} + + + + + + + + + + {% endfor %} + +
    GuidDatetime Last IndexedRe-index
    {{guid.custom_id}}{{guid.date_last_indexed}} + SHARE Reindex +
    + +{% endblock content %} diff --git a/admin/templates/util/pagination.html b/admin/templates/util/pagination.html index 8a3a2d82ce6..59e726f52c9 100644 --- a/admin/templates/util/pagination.html +++ b/admin/templates/util/pagination.html @@ -3,11 +3,11 @@ - +
    + + SHARE Reindex All {{selected_resource_type}} + + + +
    + + +
    +

    {{share_reindex_message}}

    +
    @@ -55,7 +84,7 @@

    Share Reindex

    - {% for guid in guids %} + {% for item in items_to_index %} - + - + {% if selected_resource_type == 'projects' or selected_resource_type == 'preprints' or selected_resource_type == 'registries' %} + + {% elif selected_resource_type == 'users' %} + + {% else %} + + {% endif %} - + + {% if selected_resource_type != 'files' %} + + {% endif %} @@ -76,32 +87,42 @@

    Are you sure you want to reindex {{selected_resource_type}} (SHARE)?

    {{item.first_guid}} - + {% if selected_resource_type == 'projects' or selected_resource_type == 'preprints' or selected_resource_type == 'registries' %} + + {% elif selected_resource_type == 'users' %} + + {% else %} + + {% endif %} - - + {% if selected_resource_type != 'files' %} + + - + {% endif %} diff --git a/api/share/utils.py b/api/share/utils.py index 8dbcd04476f..571dbf9b2a2 100644 --- a/api/share/utils.py +++ b/api/share/utils.py @@ -11,6 +11,7 @@ from celery.utils.time import get_exponential_backoff_interval import requests + from framework.celery_tasks import app as celery_app from framework.celery_tasks.handlers import enqueue_task from framework.encryption import ensure_bytes @@ -140,24 +141,26 @@ def task__reindex_failed_or_not_indexed_resource_into_share(resource_type: str, def get_not_indexed_guids_for_resource_with_no_indexed_guid(resource_type: str, first_guid: bool = True): from osf.models import Guid, Registration, Preprint, Node, OSFUser + from addons.osfstorage.models import OsfStorageFile common_not_indexed_public_resource_extract_query = ( Q(is_public=True) & Q(deleted__isnull=True) & (Q(has_been_indexed=False) | Q(has_been_indexed__isnull=True)) ) resource_mapper = { - 'projects': (Node, common_not_indexed_public_resource_extract_query), - 'preprints': (Preprint, common_not_indexed_public_resource_extract_query & Q(is_published=True)), - 'registries': (Registration, common_not_indexed_public_resource_extract_query), - 'users': (OSFUser, Q(is_active=True) & Q(deleted__isnull=True) & (Q(has_been_indexed=False) | Q(has_been_indexed__isnull=True))), + 'projects': (Node, common_not_indexed_public_resource_extract_query, ('first_guid', 'date_last_indexed', 'title')), + 'preprints': (Preprint, common_not_indexed_public_resource_extract_query & Q(is_published=True), ('first_guid', 'date_last_indexed', 'title')), + 'registries': (Registration, common_not_indexed_public_resource_extract_query, ('first_guid', 'date_last_indexed', 'title')), + 'users': (OSFUser, Q(is_active=True) & Q(deleted__isnull=True) & (Q(has_been_indexed=False) | Q(has_been_indexed__isnull=True)), ('first_guid', 'fullname', 'date_last_indexed')), + 'files': (OsfStorageFile, Q(deleted__isnull=True), ('first_guid', 'name', 'date_last_indexed')), } - resource_model, query = resource_mapper.get(resource_type, 'projects') + resource_model, query, values_to_return = resource_mapper.get(resource_type, 'projects') if first_guid: model_content_type = ContentType.objects.get_for_model(resource_model) first_guid_sq = Guid.objects.filter( content_type=model_content_type, object_id=OuterRef('pk'), ).order_by('created').values('_id')[:1] - return resource_model.objects.filter(query).annotate(first_guid=Subquery(first_guid_sq)).values('first_guid', 'date_last_indexed') + return resource_model.objects.filter(query).annotate(first_guid=Subquery(first_guid_sq)).exclude(first_guid__isnull=True).values(*values_to_return) return resource_model.objects.filter(query) From 1e28cc417135b03b6996b762b1d72721d48bfea5 Mon Sep 17 00:00:00 2001 From: mkovalua Date: Fri, 13 Mar 2026 20:08:34 +0200 Subject: [PATCH 9/9] refactor code | avoid search update task recursion on files indexing with skip_search=True --- osf/models/mixins.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/osf/models/mixins.py b/osf/models/mixins.py index 6a208b7c738..a650ae10e25 100644 --- a/osf/models/mixins.py +++ b/osf/models/mixins.py @@ -2573,12 +2573,20 @@ class ShareIndexMixin(models.Model): def mark_indexing_failed(self): self.has_been_indexed = False - self.save(update_fields=['has_been_indexed']) + from addons.osfstorage.models import OsfStorageFile + if isinstance(self, OsfStorageFile): + self.save(update_fields=['has_been_indexed'], skip_search=True) + else: + self.save(update_fields=['has_been_indexed']) def mark_indexing_success(self): self.has_been_indexed = True self.date_last_indexed = timezone.now() - self.save(update_fields=['has_been_indexed', 'date_last_indexed']) + from addons.osfstorage.models import OsfStorageFile + if isinstance(self, OsfStorageFile): + self.save(update_fields=['has_been_indexed', 'date_last_indexed'], skip_search=True) + else: + self.save(update_fields=['has_been_indexed']) class Meta: abstract = True
    - - {{guid.custom_id}} + + {{item.first_guid}} {{guid.date_last_indexed}}{{item.date_last_indexed}} - SHARE Reindex + SHARE Reindex
    GuidTitleFullnameNameDatetime Last IndexedRe-indexReindex
    {{item.date_last_indexed}}{{item.title}}{{item.fullname}}{{item.name}} - SHARE Reindex - {{item.date_last_indexed}} + SHARE Reindex +