From 029647f4c70581bbd4a47fc1dbf266a020fccb00 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 9 Apr 2026 09:40:37 -0400 Subject: [PATCH 01/22] add background_migration queue (in the osf way) --- framework/celery_tasks/routers.py | 2 ++ website/settings/defaults.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/framework/celery_tasks/routers.py b/framework/celery_tasks/routers.py index c33238780e8..d9d6e335286 100644 --- a/framework/celery_tasks/routers.py +++ b/framework/celery_tasks/routers.py @@ -11,6 +11,8 @@ def match_by_module(task_path): return CeleryConfig.task_med_queue if task_subpath in CeleryConfig.high_pri_modules: return CeleryConfig.task_high_queue + if task_subpath in CeleryConfig.background_migration_modules: + return CeleryConfig.task_background_migration_queue if task_subpath in CeleryConfig.remote_computing_modules: return CeleryConfig.task_remote_computing_queue if task_subpath in CeleryConfig.account_status_changes_modules: diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 1e8032cc95c..3053f9d1075 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -421,6 +421,7 @@ class CeleryConfig: task_account_status_changes_queue = 'account_status_changes' task_external_high_queue = 'external_high' task_external_low_queue = 'external_low' + task_background_migration_queue = 'background_migration' external_high_modules = { 'osf.tasks.log_gv_addon', @@ -487,6 +488,10 @@ class CeleryConfig: 'api.share.utils', } + background_migration_modules = { + 'osf.management.commands.metrics_es8_migration', + } + try: from kombu import Queue, Exchange except ImportError: @@ -540,6 +545,12 @@ class CeleryConfig: routing_key=task_external_low_queue, consumer_arguments={'x-priority': -2}, ), + Queue( + task_background_migration_queue, + Exchange(task_background_migration_queue), + routing_key=task_background_migration_queue, + consumer_arguments={'x-priority': -1}, + ), ) task_default_exchange_type = 'direct' From ac397e8c509df085257ba214fa621fc5b61e8c13 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 11:30:17 -0400 Subject: [PATCH 02/22] wip --- .../commands/metrics_es8_migration.py | 184 ++++++++++++++++++ osf/management/commands/sync_databases.py | 2 +- 2 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 osf/management/commands/metrics_es8_migration.py diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/metrics_es8_migration.py new file mode 100644 index 00000000000..46b187c63bf --- /dev/null +++ b/osf/management/commands/metrics_es8_migration.py @@ -0,0 +1,184 @@ +import datetime +import logging + + +from django.core.management import call_command +from django.core.management.base import BaseCommand +from elasticsearch6 import helpers as es6_helpers +from elasticsearch8 import helpers as es8_helpers +from elasticsearch_metrics.registry import djelme_registry +from elasticsearch_metrics.imps import elastic8 as djel8me +from elasticsearch_metrics.util.timeparts import format_timeparts + +from framework.celery_tasks import app as celery_app +from osf.metrics import reports as es6_reports +from osf.metrics import es8_metrics, RegistriesModerationMetrics + + +_logger = logging.getLogger(__name__) + +_UNCHANGED_RECORDTYPES = { + # reports + es6_reports.StorageAddonUsage: es8_metrics.StorageAddonUsageEs8, + es6_reports.DownloadCountReport: es8_metrics.DownloadCountReportEs8, + es6_reports.InstitutionSummaryReport: es8_metrics.InstitutionSummaryReportEs8, + es6_reports.NewUserDomainReport: es8_metrics.NewUserDomainReportEs8, + es6_reports.NodeSummaryReport: es8_metrics.NodeSummaryReportEs8, + es6_reports.OsfstorageFileCountReport: es8_metrics.OsfstorageFileCountReportEs8, + es6_reports.PreprintSummaryReport: es8_metrics.PreprintSummaryReportEs8, + es6_reports.UserSummaryReport: es8_metrics.UserSummaryReportEs8, + es6_reports.SpamSummaryReport: es8_metrics.SpamSummaryReportEs8, + es6_reports.InstitutionalUserReport: es8_metrics.InstitutionalUserReportEs8, + es6_reports.InstitutionMonthlySummaryReport: es8_metrics.InstitutionMonthlySummaryReportEs8, + es6_reports.PrivateSpamMetricsReport: es8_metrics.PrivateSpamMetricsReportEs8, + # events + RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, +} + + +def _debug_migrate(es8_client, each_new): + for _each in each_new: + print(_each) + + +def _do_migrate(es8_client, each_new): + es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) + + +def _es6_scan(es6_recordtype, from_when: str, until_when: str): + return es6_helpers.scan( + es6_client, + index=es6_recordtype._template_pattern, + query={"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + ) + + +def _cycle_coverage_daily(report_date): ... + + +def _cycle_coverage_monthly(report_yearmonth): ... + + +def _unchanged_report_kwargs(es6_recordtype, hit): + if issubclass(es6_recordtype, es6_reports.DailyReport): + _cycle_coverage = format_timeparts( + datetime.date.fromisoformat(hit.pop("report_date")), djel8me.DAILY + ) + elif issubclass(es6_recordtype, es6_reports.MonthlyReport): + _cycle_coverage = format_timeparts(hit.pop("report_yearmonth"), djel8me.MONTHLY) + return { + **hit, + 'cycle_coverage': _cycle_coverage, + } + + +@celery_app.task +def migrate_unchanged_recordtype( + es6_recordtype_name: str, +): + _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) + _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + + def _each_new(): + for _hit in _es6_scan(_es6_recordtype, from_when, until_when): + breakpoint() + yield _es8_recordtype.record( + ..., + using=False, # saved in bulk + ) + + _debug_migrate(_each_new()) + # _do_migrate(_each_new()) + + +@celery_app.task +def migrate_preprint_views(from_date, until_date): + # convert to counted-usage + ... + + +@celery_app.task +def migrate_preprint_downloads(from_date, until_date): + # convert to counted-usage + ... + + +@celery_app.task +def migrate_usage_reports(from_date, until_date): + # from PublicItemUsageReport to PublicItemUsageReportEs8 + # add cumulative count + ... + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument( + "--start", + action="store_true", + ) + parser.add_argument( + "--unchanged", + action="store_true", + ) + parser.add_argument( + "--usage-events", + action="store_true", + ) + parser.add_argument( + "--usage-reports", + action="store_true", + ) + + def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): + call_command('djelme_backend_setup') # ensure all index templates + _default_all = not any((unchanged, usage_events, usage_reports)) + + if unchanged or _default_all: + self._handle_unchanged(start=start) + if usage_events or _default_all: + self._handle_usage_events(start=start) + if usage_reports or _default_all: + self._handle_usage_reports(start=start) + + def _handle_unchanged(self, *, start: bool): + # for each (unchanged) report/event: + for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + _es6_count = _es6_cls.search().count() + _es8_count = _es8_cls.search().count() + _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) + self.stdout.write(f'{_es6_cls.__name__} (es6):\t{_es6_count}') + self.stdout.write(f'{_es8_cls.__name__}:\t{_style(_es8_count)}') + if start: + self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + # TODO: migrate_unchanged_recordtype.apply_async(...) + self.stdout.write('---') + + def _handle_usage_events(self, *, start: bool): + # for counted-usage events: + # TODO: last X months only + # get/compare/print cardinalities + # schedule (per-day?) tasks (if --start) + _es6_pview_count = PreprintView.search().count() + _es6_pdownload_count = PreprintDownload.search().count() + _es6_usage_event_count = CountedAuthUsage.search().count() + _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + _es8_count = OsfCountedUsageEvent.search().count() + _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) + self.stdout.write(f'{PreprintView.__name__} (es6):\t{_es6_pview_count}') + self.stdout.write(f'{PreprintDownload.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'{CountedAuthUsage.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'total (es6):\t{_es6_count}') + self.stdout.write(f'{OsfCountedUsageEvent.__name__}:\t{_style(_es8_count)}') + if start: + self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + # TODO: migrate_usage_events.apply_async(...) + + def _handle_usage_reports(self, *, start: bool): + _es6_count = PublicItemUsageReport.search().count() + _es8_count = PublicItemUsageReportEs8.search().count() + _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) + self.stdout.write(f'{PublicItemUsageReport.__name__} (es6):\t{_es6_count}') + self.stdout.write(f'{PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') + _item_count + # (if --start) schedule task per item (by composite agg on es6 public usage reports) + # each item-task iter thru reports oldest to newest, adding cumulative counts diff --git a/osf/management/commands/sync_databases.py b/osf/management/commands/sync_databases.py index c31d63ea16e..b5030b4bba7 100644 --- a/osf/management/commands/sync_databases.py +++ b/osf/management/commands/sync_databases.py @@ -20,7 +20,7 @@ def handle(self, *args, **options): ['migrate'], ] if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): - COMMANDS.append(['sync_metrics']) + COMMANDS.append(['djelme_backend_setup']) for check in COMMANDS: call_command(*check) From ef981e7c886b67806d60c1d37261dde4cd6e1e8d Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 08:25:11 -0400 Subject: [PATCH 03/22] wip --- .../commands/metrics_es8_migration.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/metrics_es8_migration.py index 46b187c63bf..c2f765eef72 100644 --- a/osf/management/commands/metrics_es8_migration.py +++ b/osf/management/commands/metrics_es8_migration.py @@ -11,6 +11,11 @@ from elasticsearch_metrics.util.timeparts import format_timeparts from framework.celery_tasks import app as celery_app +from osf.metrics.preprint_metrics import ( + PreprintView as PreprintViewEs6, + PreprintDownload as PreprintDownloadEs6, +) +from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics @@ -132,7 +137,6 @@ def add_arguments(self, parser): def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): call_command('djelme_backend_setup') # ensure all index templates _default_all = not any((unchanged, usage_events, usage_reports)) - if unchanged or _default_all: self._handle_unchanged(start=start) if usage_events or _default_all: @@ -143,12 +147,13 @@ def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): def _handle_unchanged(self, *, start: bool): # for each (unchanged) report/event: for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) self.stdout.write(f'{_es6_cls.__name__} (es6):\t{_es6_count}') self.stdout.write(f'{_es8_cls.__name__}:\t{_style(_es8_count)}') - if start: + if start: # schedule task self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') # TODO: migrate_unchanged_recordtype.apply_async(...) self.stdout.write('---') @@ -156,29 +161,30 @@ def _handle_unchanged(self, *, start: bool): def _handle_usage_events(self, *, start: bool): # for counted-usage events: # TODO: last X months only - # get/compare/print cardinalities - # schedule (per-day?) tasks (if --start) - _es6_pview_count = PreprintView.search().count() - _es6_pdownload_count = PreprintDownload.search().count() - _es6_usage_event_count = CountedAuthUsage.search().count() + # display counts for each view/download event type + _es6_pview_count = PreprintViewEs6.search().count() + _es6_pdownload_count = PreprintDownloadEs6.search().count() + _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count - _es8_count = OsfCountedUsageEvent.search().count() + _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{PreprintView.__name__} (es6):\t{_es6_pview_count}') - self.stdout.write(f'{PreprintDownload.__name__} (es6):\t{_es6_pdownload_count}') - self.stdout.write(f'{CountedAuthUsage.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'{PreprintViewEs6.__name__} (es6):\t{_es6_pview_count}') + self.stdout.write(f'{PreprintDownloadEs6.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'{CountedUsageEs6.__name__} (es6):\t{_es6_pdownload_count}') self.stdout.write(f'total (es6):\t{_es6_count}') - self.stdout.write(f'{OsfCountedUsageEvent.__name__}:\t{_style(_es8_count)}') - if start: + self.stdout.write(f'{es8_metrics.OsfCountedUsageRecord.__name__}:\t{_style(_es8_count)}') + if start: # schedule (per-day?) tasks (if --start) self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') # TODO: migrate_usage_events.apply_async(...) def _handle_usage_reports(self, *, start: bool): - _es6_count = PublicItemUsageReport.search().count() - _es8_count = PublicItemUsageReportEs8.search().count() + # display total report counts + _es6_count = es6_reports.PublicItemUsageReport.search().count() + _es8_count = es8_metrics.PublicItemUsageReportEs8.search().count() _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{PublicItemUsageReport.__name__} (es6):\t{_es6_count}') - self.stdout.write(f'{PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') + self.stdout.write(f'{es6_reports.PublicItemUsageReport.__name__} (es6):\t{_es6_count}') + self.stdout.write(f'{es8_metrics.PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') + # display distinct item counts _item_count # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts From 9ed70f3cda89f38b216455e3986f088f03814842 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 11:55:33 -0400 Subject: [PATCH 04/22] quieter elastic logs --- conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conftest.py b/conftest.py index 198316f1cc4..e80c4e5c566 100644 --- a/conftest.py +++ b/conftest.py @@ -43,6 +43,8 @@ def pytest_configure(config): 'transitions.core', 'MARKDOWN', 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', ] for logger_name in SILENT_LOGGERS: logging.getLogger(logger_name).setLevel(logging.CRITICAL) From be1ed2feec851748d16c0b55b20815ce3aa87917 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 11:56:05 -0400 Subject: [PATCH 05/22] wip --- .../commands/metrics_es8_migration.py | 146 +++++++++++++++--- osf/metrics/es8_metrics.py | 35 +++++ 2 files changed, 156 insertions(+), 25 deletions(-) diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/metrics_es8_migration.py index c2f765eef72..ff6cdfe8b0f 100644 --- a/osf/management/commands/metrics_es8_migration.py +++ b/osf/management/commands/metrics_es8_migration.py @@ -1,7 +1,6 @@ import datetime import logging - from django.core.management import call_command from django.core.management.base import BaseCommand from elasticsearch6 import helpers as es6_helpers @@ -22,6 +21,10 @@ _logger = logging.getLogger(__name__) +_USAGE_MONTHS_BACK = 3 + +_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control + _UNCHANGED_RECORDTYPES = { # reports es6_reports.StorageAddonUsage: es8_metrics.StorageAddonUsageEs8, @@ -40,7 +43,6 @@ RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, } - def _debug_migrate(es8_client, each_new): for _each in each_new: print(_each) @@ -49,7 +51,6 @@ def _debug_migrate(es8_client, each_new): def _do_migrate(es8_client, each_new): es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) - def _es6_scan(es6_recordtype, from_when: str, until_when: str): return es6_helpers.scan( es6_client, @@ -58,6 +59,46 @@ def _es6_scan(es6_recordtype, from_when: str, until_when: str): ) +def _es6_usage_report_counts() -> tuple[int, int]: + _search = ( + es6_reports.PublicItemUsageReport.search() + ) + _search.aggs.metric( + 'agg_item_count', + 'cardinality', + field='item_osfid', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _total_count = _response.hits.total + _item_count = ( + _response.aggregations.agg_item_count.value + if 'agg_item_count' in _response.aggregations + else 0 + ) + return (_total_count, _item_count) + + +def _es8_usage_report_counts() -> tuple[int, int]: + _search = ( + es8_metrics.PublicItemUsageReportEs8.search() + ) + _search.aggs.metric( + 'agg_item_count', + 'cardinality', + field='item_osfid', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _total_count = _response.hits.total.value + _item_count = ( + _response.aggregations.agg_item_count.value + if 'agg_item_count' in _response.aggregations + else 0 + ) + return (_total_count, _item_count) + + def _cycle_coverage_daily(report_date): ... @@ -114,7 +155,6 @@ def migrate_usage_reports(from_date, until_date): # add cumulative count ... - class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( @@ -133,9 +173,22 @@ def add_arguments(self, parser): "--usage-reports", action="store_true", ) + parser.add_argument( + "--clear-state", + action="store_true", + ) + parser.add_argument( + "--no-setup", + action="store_true", + ) - def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): - call_command('djelme_backend_setup') # ensure all index templates + def handle(self, *, start, unchanged, usage_events, usage_reports, clear_state, no_setup, **kwargs): + self._quiet_chatty_loggers() + if not no_setup: + call_command('djelme_backend_setup') + if clear_state: + self._clear_state() + self._display_started_at(start=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: self._handle_unchanged(start=start) @@ -150,11 +203,11 @@ def _handle_unchanged(self, *, start: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{_es6_cls.__name__} (es6):\t{_es6_count}') - self.stdout.write(f'{_es8_cls.__name__}:\t{_style(_es8_count)}') + #_es8_count = _es8_cls.search().count() + self._write_tabbed('es6', _es6_cls, _es6_count) + self._write_tabbed('es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count)) if start: # schedule task - self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + self._write_tabbed('starting', _es6_cls, '=>', _es8_cls) # TODO: migrate_unchanged_recordtype.apply_async(...) self.stdout.write('---') @@ -167,24 +220,67 @@ def _handle_usage_events(self, *, start: bool): _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{PreprintViewEs6.__name__} (es6):\t{_es6_pview_count}') - self.stdout.write(f'{PreprintDownloadEs6.__name__} (es6):\t{_es6_pdownload_count}') - self.stdout.write(f'{CountedUsageEs6.__name__} (es6):\t{_es6_pdownload_count}') - self.stdout.write(f'total (es6):\t{_es6_count}') - self.stdout.write(f'{es8_metrics.OsfCountedUsageRecord.__name__}:\t{_style(_es8_count)}') + self._write_tabbed('es6', PreprintViewEs6, _es6_pview_count) + self._write_tabbed('es6', PreprintDownloadEs6, _es6_pdownload_count) + self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed('es6', '(total to migrate)', _es6_count) + self._write_tabbed('es8', es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count)) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + self.stdout.write(f'starting usages => {es8_metrics.OsfCountedUsageRecord}') # TODO: migrate_usage_events.apply_async(...) + self.stdout.write('---') def _handle_usage_reports(self, *, start: bool): - # display total report counts - _es6_count = es6_reports.PublicItemUsageReport.search().count() - _es8_count = es8_metrics.PublicItemUsageReportEs8.search().count() - _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{es6_reports.PublicItemUsageReport.__name__} (es6):\t{_es6_count}') - self.stdout.write(f'{es8_metrics.PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') - # display distinct item counts - _item_count + # display counts of reports and distinct items + _es6_count, _es6_item_count = _es6_usage_report_counts() + _es8_count, _es8_item_count = _es8_usage_report_counts() + self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, _es8_count, style=self._eq_style(_es8_count, _es6_count)) + self._write_tabbed('es6', es6_reports.PublicItemUsageReport, '(items)', _es6_item_count) + self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, '(items)', _es8_item_count, + style=self._eq_style(_es8_item_count, _es6_item_count)) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts + if start: # schedule per-item tasks + self.stdout.write(f'starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}') + # TODO: migrate_usage_events.apply_async(...) + self.stdout.write('---') + + def _display_started_at(self, start): + _started_at = es8_metrics.Elastic6To8State.get_started_at() + if _started_at: + self.stdout.write( + f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' + ) + elif start: + _started_at = es8_metrics.Elastic6To8State.set_started_at_now() + self.stdout.write( + f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' + ) + else: + self.stdout.write( + 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' + ) + self.stdout.write('---') + + def _clear_state(self): + es8_metrics.Elastic6To8State.search().delete() + + def _eq_style(self, num: int, should_be: int): + return self.style.SUCCESS if (num == should_be) else self.style.NOTICE + + def _write_tabbed(self, *strables, style=None): + def _to_str(strable): + if isinstance(strable, type): + return strable.__name__ + return str(strable) + self.stdout.write('\t'.join(map(_to_str, strables)), style) + + def _quiet_chatty_loggers(self): + _chatty_loggers = [ + 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', + ] + for logger_name in _chatty_loggers: + logging.getLogger(logger_name).setLevel(logging.ERROR) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 436a1c62d46..8b5e9dd5bc8 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -346,3 +346,38 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): preprint_oopspam_hammed: int preprint_akismet_flagged: int preprint_akismet_hammed: int + + +### +# data migration state + +class Elastic6To8State(djelme.DjelmeRecordtype): + """index for storing values helpful for keeping track of the elastic 6->8 data migration""" + UNIQUE_TOGETHER_FIELDS = ('key',) + key: str + value: str | None + timestamp: datetime.datetime = esdsl.mapped_field( + default_factory=lambda: datetime.datetime.now(datetime.UTC), + ) + + class Index: + name = 'osf_elastic6to8state' + + @classmethod + def get_by_key(cls, key: str): + _response = cls.search().query({'term': {'key': key}})[0].execute() + return _response[0] if _response else None + + @classmethod + def get_timestamp(cls, key: str) -> datetime.datetime | None: + _record = cls.get_by_key(key) + return _record.timestamp if _record else None + + @classmethod + def get_started_at(cls): + return cls.get_timestamp('started_at') + + @classmethod + def set_started_at_now(cls): + _record = cls.record(key='started_at') + return _record.timestamp From 64aeeaba0d84cf33d9c6726b86c8844f127520e3 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 16:03:06 -0400 Subject: [PATCH 06/22] wip --- ...ics_es8_migration.py => migrate_metrics_6to8.py} | 13 ++++++------- poetry.lock | 6 +++--- pyproject.toml | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) rename osf/management/commands/{metrics_es8_migration.py => migrate_metrics_6to8.py} (99%) diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/migrate_metrics_6to8.py similarity index 99% rename from osf/management/commands/metrics_es8_migration.py rename to osf/management/commands/migrate_metrics_6to8.py index ff6cdfe8b0f..104caccfb6c 100644 --- a/osf/management/commands/metrics_es8_migration.py +++ b/osf/management/commands/migrate_metrics_6to8.py @@ -158,27 +158,27 @@ def migrate_usage_reports(from_date, until_date): class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( - "--start", + "--no-setup", action="store_true", ) parser.add_argument( - "--unchanged", + "--clear-state", action="store_true", ) parser.add_argument( - "--usage-events", + "--start", action="store_true", ) parser.add_argument( - "--usage-reports", + "--unchanged", action="store_true", ) parser.add_argument( - "--clear-state", + "--usage-events", action="store_true", ) parser.add_argument( - "--no-setup", + "--usage-reports", action="store_true", ) @@ -203,7 +203,6 @@ def _handle_unchanged(self, *, start: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - #_es8_count = _es8_cls.search().count() self._write_tabbed('es6', _es6_cls, _es6_count) self._write_tabbed('es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count)) if start: # schedule task diff --git a/poetry.lock b/poetry.lock index 90665bce81f..d86523f94de 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6" -resolved_reference = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6" +reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" +resolved_reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "ef1d6d327f5557e43482793b276ccb6c5fd07989f27367af3a3736a8547b4d1a" +content-hash = "320d3eb4cd7f0f4c5d8cc698db51ee1bf4c37f8b8d41d21a86ca5cdb9b2e6b42" diff --git a/pyproject.toml b/pyproject.toml index 013df3f448d..a5c39d297d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "e18f029c406d743d407f18fda8a133b261f9c4d2"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 97cd5b7f125c2413393d1276dec710f54b24a33a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Apr 2026 10:46:00 -0400 Subject: [PATCH 07/22] wip --- ...ics_6to8.py => migrate_osfmetrics_6to8.py} | 261 +++++++++++++----- osf/metrics/es8_metrics.py | 2 +- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 198 insertions(+), 73 deletions(-) rename osf/management/commands/{migrate_metrics_6to8.py => migrate_osfmetrics_6to8.py} (50%) diff --git a/osf/management/commands/migrate_metrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py similarity index 50% rename from osf/management/commands/migrate_metrics_6to8.py rename to osf/management/commands/migrate_osfmetrics_6to8.py index 104caccfb6c..2f4cbb28385 100644 --- a/osf/management/commands/migrate_metrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -1,5 +1,6 @@ import datetime import logging +from pprint import pprint from django.core.management import call_command from django.core.management.base import BaseCommand @@ -7,7 +8,6 @@ from elasticsearch8 import helpers as es8_helpers from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me -from elasticsearch_metrics.util.timeparts import format_timeparts from framework.celery_tasks import app as celery_app from osf.metrics.preprint_metrics import ( @@ -17,6 +17,7 @@ from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics +from osf.metrics.utils import YearMonth _logger = logging.getLogger(__name__) @@ -43,57 +44,76 @@ RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, } -def _debug_migrate(es8_client, each_new): + +def _delete_all(recordtype): + # TODO: REMOVE THIS + recordtype.search().query({"match_all": {}}).delete() + recordtype.refresh() + + +def _delete_all_es8(): + # TODO: REMOVE THIS + for _es8_recordtype in _UNCHANGED_RECORDTYPES.values(): + _delete_all(_es8_recordtype) + _delete_all(es8_metrics.PublicItemUsageReportEs8) + _delete_all(es8_metrics.OsfCountedUsageRecord) + + +def _debug_migrate(each_new): for _each in each_new: - print(_each) + pprint(_each.to_dict()) def _do_migrate(es8_client, each_new): es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) -def _es6_scan(es6_recordtype, from_when: str, until_when: str): + +def _es6_scan_all(es6_recordtype): + return es6_helpers.scan( + es6_recordtype._get_connection(), + index=es6_recordtype._template_pattern, + ) + + +def _es6_scan_range(es6_recordtype, from_when: str, until_when: str): return es6_helpers.scan( - es6_client, + es6_recordtype._get_connection(), index=es6_recordtype._template_pattern, query={"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, ) def _es6_usage_report_counts() -> tuple[int, int]: - _search = ( - es6_reports.PublicItemUsageReport.search() - ) + _search = es6_reports.PublicItemUsageReport.search() _search.aggs.metric( - 'agg_item_count', - 'cardinality', - field='item_osfid', + "agg_item_count", + "cardinality", + field="item_osfid", precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total _item_count = ( _response.aggregations.agg_item_count.value - if 'agg_item_count' in _response.aggregations + if "agg_item_count" in _response.aggregations else 0 ) return (_total_count, _item_count) def _es8_usage_report_counts() -> tuple[int, int]: - _search = ( - es8_metrics.PublicItemUsageReportEs8.search() - ) + _search = es8_metrics.PublicItemUsageReportEs8.search() _search.aggs.metric( - 'agg_item_count', - 'cardinality', - field='item_osfid', + "agg_item_count", + "cardinality", + field="item_osfid", precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total.value _item_count = ( _response.aggregations.agg_item_count.value - if 'agg_item_count' in _response.aggregations + if "agg_item_count" in _response.aggregations else 0 ) return (_total_count, _item_count) @@ -105,36 +125,95 @@ def _cycle_coverage_daily(report_date): ... def _cycle_coverage_monthly(report_yearmonth): ... -def _unchanged_report_kwargs(es6_recordtype, hit): +def _get_es6_field_names(es6_recordtype): + """ + adapted from DocumentBase._get_field_names in elasticsearch8.dsl + """ + for _field_name in es6_recordtype._doc_type.mapping: + _field = es6_recordtype._doc_type.mapping[_field_name] + if hasattr(_field, "_doc_class"): + for _sub_field in _get_es6_field_names(_field._doc_class): + yield f"{_field_name}.{_sub_field}" + else: + yield _field_name + + +def _assert_field_unchangedness(es6_recordtype, es8_recordtype): + _es6_fields = set(_get_es6_field_names(es6_recordtype)) + _es8_fields = set(es8_recordtype._get_field_names()) + + # remove fields intentionally removed/renamed in migration if issubclass(es6_recordtype, es6_reports.DailyReport): - _cycle_coverage = format_timeparts( - datetime.date.fromisoformat(hit.pop("report_date")), djel8me.DAILY - ) + assert issubclass(es8_recordtype, djel8me.CyclicRecord) + _es6_fields.remove("timestamp") + _es6_fields.remove("report_date") elif issubclass(es6_recordtype, es6_reports.MonthlyReport): - _cycle_coverage = format_timeparts(hit.pop("report_yearmonth"), djel8me.MONTHLY) - return { - **hit, - 'cycle_coverage': _cycle_coverage, - } + assert issubclass(es8_recordtype, djel8me.CyclicRecord) + _es6_fields.remove("timestamp") + _es6_fields.remove("report_yearmonth") + else: + assert issubclass(es8_recordtype, djel8me.EventRecord) + # remove fields intentionally added in migration + _es8_fields.remove("timeseries_timeparts") + if issubclass(es8_recordtype, djel8me.CyclicRecord): + _es8_fields.remove("created") + _es8_fields.remove("cycle_coverage") -@celery_app.task + # all remaining fields should match + assert _es6_fields == _es8_fields + + +# TODO: @celery_app.task def migrate_unchanged_recordtype( es6_recordtype_name: str, ): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) + _kwarg_converter = ( + _each_cyclicrecord_kwarg + if issubclass( + _es6_recordtype, (es6_reports.DailyReport, es6_reports.MonthlyReport) + ) + else _each_eventrecord_kwarg + ) def _each_new(): - for _hit in _es6_scan(_es6_recordtype, from_when, until_when): - breakpoint() + for _hit in _es6_scan_all(_es6_recordtype): + _es8_kwargs = dict(_kwarg_converter(_hit["_source"])) yield _es8_recordtype.record( - ..., - using=False, # saved in bulk + **_es8_kwargs, + using=False, # skip saving; save in bulk ) _debug_migrate(_each_new()) - # _do_migrate(_each_new()) + # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) + + +def _semverish_from_yearmonth(given_yearmonth: str): + _ym = YearMonth.from_str(given_yearmonth) + return f"{_ym.year}.{_ym.month}" + + +def _semverish_from_date(given_date: str): + _d = datetime.date.fromisoformat(given_date) + return f"{_d.year}.{_d.month}.{_d.day}" + + +def _each_cyclicrecord_kwarg(es6_source: dict): + for _key, _val in es6_source.items(): + if _key == "report_yearmonth": + yield ("cycle_coverage", _semverish_from_yearmonth(_val)) + elif _key == "report_date": + yield ("cycle_coverage", _semverish_from_date(_val)) + elif _key != "timestamp": + # skipping timestamp; on daily/monthly reports just copied from yearmonth/date + yield (_key, _val) + + +def _each_eventrecord_kwarg(es6_source) -> dict: + yield from es6_source.items() # no changes needed @celery_app.task @@ -155,6 +234,7 @@ def migrate_usage_reports(from_date, until_date): # add cumulative count ... + class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( @@ -182,13 +262,23 @@ def add_arguments(self, parser): action="store_true", ) - def handle(self, *, start, unchanged, usage_events, usage_reports, clear_state, no_setup, **kwargs): + def handle( + self, + *, + start, + unchanged, + usage_events, + usage_reports, + clear_state, + no_setup, + **kwargs, + ): self._quiet_chatty_loggers() if not no_setup: - call_command('djelme_backend_setup') + call_command("djelme_backend_setup") if clear_state: self._clear_state() - self._display_started_at(start=start) + self._migration_start(start=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: self._handle_unchanged(start=start) @@ -203,12 +293,18 @@ def _handle_unchanged(self, *, start: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - self._write_tabbed('es6', _es6_cls, _es6_count) - self._write_tabbed('es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count)) + self._write_tabbed("es6", _es6_cls, _es6_count) + self._write_tabbed( + "es8", + _es8_cls, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) if start: # schedule task - self._write_tabbed('starting', _es6_cls, '=>', _es8_cls) + self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) + migrate_unchanged_recordtype(_es6_cls.__name__) # TODO: migrate_unchanged_recordtype.apply_async(...) - self.stdout.write('---') + self.stdout.write("---") def _handle_usage_events(self, *, start: bool): # for counted-usage events: @@ -219,67 +315,96 @@ def _handle_usage_events(self, *, start: bool): _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed('es6', PreprintViewEs6, _es6_pview_count) - self._write_tabbed('es6', PreprintDownloadEs6, _es6_pdownload_count) - self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed('es6', '(total to migrate)', _es6_count) - self._write_tabbed('es8', es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count)) + self._write_tabbed("es6", PreprintViewEs6, _es6_pview_count) + self._write_tabbed("es6", PreprintDownloadEs6, _es6_pdownload_count) + self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed("es6", "(total to migrate)", _es6_count) + self._write_tabbed( + "es8", + es8_metrics.OsfCountedUsageRecord, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f'starting usages => {es8_metrics.OsfCountedUsageRecord}') - # TODO: migrate_usage_events.apply_async(...) - self.stdout.write('---') + self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") + for _from_date, _until_date in _each_date_in_range(...): + migrate_usage_events(_from_date.isoformat(), _until_date.isoformat()) + # TODO: migrate_usage_events.apply_async(...) + self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): # display counts of reports and distinct items _es6_count, _es6_item_count = _es6_usage_report_counts() _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) - self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, _es8_count, style=self._eq_style(_es8_count, _es6_count)) - self._write_tabbed('es6', es6_reports.PublicItemUsageReport, '(items)', _es6_item_count) - self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, '(items)', _es8_item_count, - style=self._eq_style(_es8_item_count, _es6_item_count)) + self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + self._write_tabbed( + "es6", es6_reports.PublicItemUsageReport, "(items)", _es6_item_count + ) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + "(items)", + _es8_item_count, + style=self._eq_style(_es8_item_count, _es6_item_count), + ) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts if start: # schedule per-item tasks - self.stdout.write(f'starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}') - # TODO: migrate_usage_events.apply_async(...) - self.stdout.write('---') + self.stdout.write( + f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" + ) + # TODO: migrate_usage_reports.apply_async(...) + self.stdout.write("---") - def _display_started_at(self, start): + def _migration_start(self, start): _started_at = es8_metrics.Elastic6To8State.get_started_at() if _started_at: self.stdout.write( - f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' + f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" ) elif start: _started_at = es8_metrics.Elastic6To8State.set_started_at_now() self.stdout.write( - f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' + f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" ) else: self.stdout.write( - 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' + "osf.metrics 6->8 migration not started nor starting (run with `--start` to start)" ) - self.stdout.write('---') + self.stdout.write("---") def _clear_state(self): - es8_metrics.Elastic6To8State.search().delete() + self.stdout.write( + "clearing all migration state (start time, etc)", self.style.NOTICE + ) + es8_metrics.Elastic6To8State.search().query({"match_all": {}}).delete() + es8_metrics.Elastic6To8State.refresh() + # TODO: REMOVE THIS + self.stdout.write("deleting all migration target data in es8", self.style.ERROR) + _delete_all_es8() def _eq_style(self, num: int, should_be: int): - return self.style.SUCCESS if (num == should_be) else self.style.NOTICE + return self.style.SUCCESS if (num == should_be) else self.style.WARNING def _write_tabbed(self, *strables, style=None): def _to_str(strable): if isinstance(strable, type): return strable.__name__ return str(strable) - self.stdout.write('\t'.join(map(_to_str, strables)), style) + + self.stdout.write("\t".join(map(_to_str, strables)), style) def _quiet_chatty_loggers(self): _chatty_loggers = [ - 'elasticsearch', - 'elastic_transport', - 'elasticsearch_metrics', + "elasticsearch", + "elastic_transport", + "elasticsearch_metrics", ] for logger_name in _chatty_loggers: logging.getLogger(logger_name).setLevel(logging.ERROR) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 8b5e9dd5bc8..4980358dc5f 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -351,7 +351,7 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): ### # data migration state -class Elastic6To8State(djelme.DjelmeRecordtype): +class Elastic6To8State(djelme.SimpleRecord): """index for storing values helpful for keeping track of the elastic 6->8 data migration""" UNIQUE_TOGETHER_FIELDS = ('key',) key: str diff --git a/poetry.lock b/poetry.lock index d86523f94de..df08934ef29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" -resolved_reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" +reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" +resolved_reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "320d3eb4cd7f0f4c5d8cc698db51ee1bf4c37f8b8d41d21a86ca5cdb9b2e6b42" +content-hash = "d149bb933fd3845714e26920360c34f3224ab0f84a789b3185cf716033a8d4bf" diff --git a/pyproject.toml b/pyproject.toml index a5c39d297d1..4b6f896f39e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "e18f029c406d743d407f18fda8a133b261f9c4d2"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "d7e0483972a58b940bec843679c2a8c9b8bcb75c"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 7eba5cce220851ac479b74b67e1baa71e94d2c95 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Apr 2026 16:25:28 -0400 Subject: [PATCH 08/22] wip --- .../commands/migrate_osfmetrics_6to8.py | 222 +++++++++++++----- osf/metrics/es8_metrics.py | 1 + 2 files changed, 167 insertions(+), 56 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 2f4cbb28385..812322657a9 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -1,4 +1,6 @@ +import collections import datetime +import functools import logging from pprint import pprint @@ -10,6 +12,7 @@ from elasticsearch_metrics.imps import elastic8 as djel8me from framework.celery_tasks import app as celery_app +from osf.metadata import rdfutils from osf.metrics.preprint_metrics import ( PreprintView as PreprintViewEs6, PreprintDownload as PreprintDownloadEs6, @@ -18,10 +21,14 @@ from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics from osf.metrics.utils import YearMonth +from website import settings as website_settings _logger = logging.getLogger(__name__) +### +# constants + _USAGE_MONTHS_BACK = 3 _MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control @@ -45,6 +52,68 @@ } +### +# celery tasks + + +# TODO: @celery_app.task +def migrate_unchanged_recordtype(es6_recordtype_name: str): + _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) + _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) + + if issubclass(_es8_recordtype, djel8me.CyclicRecord): + + def _new_es8_record(source_dict): + _kwargs = dict(_convert_cyclicrecord_kwargs(source_dict)) + return _es8_recordtype(**_kwargs) + + else: # no conversion needed for event record with unchanged fields + + def _new_es8_record(source_dict): + return _es8_recordtype(**source_dict) + + def _each_new(): + for _hit in _es6_scan_all(_es6_recordtype): + yield _new_es8_record(_hit["_source"]) + + _debug_migrate(_each_new()) + # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) + + +# TODO: @celery_app.task +def migrate_counted_usages(from_when: str, until_when: str): + # CountedAuthUsage => OsfCountedUsageRecord + def _each_new(): + for _hit in _es6_scan_all(CountedUsageEs6, from_when, until_when): + yield _convert_counted_usage(_hit["_source"]) + + _debug_migrate(_each_new()) + + +# TODO: @celery_app.task +def migrate_preprint_views(from_date: str, until_date: str): + # convert to counted-usage + ... + + +# TODO: @celery_app.task +def migrate_preprint_downloads(from_date: str, until_date: str): + # convert to counted-usage + ... + + +# TODO: @celery_app.task +def migrate_usage_reports(from_date, until_date): + # from PublicItemUsageReport to PublicItemUsageReportEs8 + # add cumulative count + ... + + +### +# various helper functions + + def _delete_all(recordtype): # TODO: REMOVE THIS recordtype.search().query({"match_all": {}}).delete() @@ -60,6 +129,7 @@ def _delete_all_es8(): def _debug_migrate(each_new): + # TODO: remove this for _each in each_new: pprint(_each.to_dict()) @@ -68,6 +138,18 @@ def _do_migrate(es8_client, each_new): es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) +def _date_range( + range_start: datetime.date, + range_end: datetime.date, + step: datetime.timedelta = datetime.timedelta(days=1), +) -> collections.abc.Iterator[tuple[datetime.date, datetime.date]]: + _from_date = range_start + _until_date = range_start + step + while _from_date < range_end: + yield (_from_date, _until_date) + (_from_date, _until_date) = (_until_date, _until_date + step) + + def _es6_scan_all(es6_recordtype): return es6_helpers.scan( es6_recordtype._get_connection(), @@ -119,12 +201,6 @@ def _es8_usage_report_counts() -> tuple[int, int]: return (_total_count, _item_count) -def _cycle_coverage_daily(report_date): ... - - -def _cycle_coverage_monthly(report_yearmonth): ... - - def _get_es6_field_names(es6_recordtype): """ adapted from DocumentBase._get_field_names in elasticsearch8.dsl @@ -164,33 +240,6 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): assert _es6_fields == _es8_fields -# TODO: @celery_app.task -def migrate_unchanged_recordtype( - es6_recordtype_name: str, -): - _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) - _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] - _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) - _kwarg_converter = ( - _each_cyclicrecord_kwarg - if issubclass( - _es6_recordtype, (es6_reports.DailyReport, es6_reports.MonthlyReport) - ) - else _each_eventrecord_kwarg - ) - - def _each_new(): - for _hit in _es6_scan_all(_es6_recordtype): - _es8_kwargs = dict(_kwarg_converter(_hit["_source"])) - yield _es8_recordtype.record( - **_es8_kwargs, - using=False, # skip saving; save in bulk - ) - - _debug_migrate(_each_new()) - # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) - - def _semverish_from_yearmonth(given_yearmonth: str): _ym = YearMonth.from_str(given_yearmonth) return f"{_ym.year}.{_ym.month}" @@ -201,38 +250,85 @@ def _semverish_from_date(given_date: str): return f"{_d.year}.{_d.month}.{_d.day}" -def _each_cyclicrecord_kwarg(es6_source: dict): +def _convert_cyclicrecord_kwargs(es6_source: dict): for _key, _val in es6_source.items(): if _key == "report_yearmonth": + # report_yearmonth converts to cycle_coverage Y.M yield ("cycle_coverage", _semverish_from_yearmonth(_val)) elif _key == "report_date": + # report_date converts to cycle_coverage Y.M.D yield ("cycle_coverage", _semverish_from_date(_val)) elif _key != "timestamp": # skipping timestamp; on daily/monthly reports just copied from yearmonth/date yield (_key, _val) -def _each_eventrecord_kwarg(es6_source) -> dict: - yield from es6_source.items() # no changes needed +def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: + _item_iri = _iri_from_osfid(source_dict["item_guid"]) + return es8_metrics.OsfCountedUsageRecord( + # fields from djelme.CountedUsageRecord + timestamp=source_dict["timestamp"], + sessionhour_id=source_dict["session_id"], + platform_iri=source_dict["platform_iri"], + # TODO: database_iri=provider iri + item_iri=_item_iri, + within_iris=[ + _item_iri, # correct mistake; make inclusive-within aggregations easier + *( + _iri_from_osfid(_within_osfid) + for _within_osfid in source_dict["surrounding_guids"] + ), + ], + # fields from OsfCountedUsageRecord + item_osfid=source_dict["item_guid"], + item_type=_convert_item_type(source_dict), + item_public=source_dict["item_public"], + provider_id=source_dict["provider_id"], + user_is_authenticated=source_dict["user_is_authenticated"], + action_labels=source_dict["action_labels"], + pageview_info=source_dict[ + "pageview_info" + ], # TODO: does this need the PageviewInfo object? + ) -@celery_app.task -def migrate_preprint_views(from_date, until_date): - # convert to counted-usage - ... +def _iri_from_osfid(osfid: str) -> str: + return f"{website_settings.DOMAIN}{osfid}" -@celery_app.task -def migrate_preprint_downloads(from_date, until_date): - # convert to counted-usage - ... +def _convert_item_type(es6_usage_dict): + """convert model-name item types to OSFMAP item types + previous item_types use `type(osf_model).__name__.lower()` + """ + _modelname = es6_usage_dict["item_type"] + assert isinstance(_modelname, str) + match _modelname: + case "osfuser": + return rdfutils.DCTERMS.Agent + case "preprint": + return rdfutils.OSF.Preprint + case "registration": + return ( + rdfutils.OSF.RegistrationComponent + if es6_usage_dict.get("surrounding_guids") + else rdfutils.OSF.Registration + ) + case "node": + return ( + rdfutils.OSF.ProjectComponent + if es6_usage_dict.get("surrounding_guids") + else rdfutils.OSF.Project + ) + case _ if "file" in _modelname: + return rdfutils.OSF.File + case _: + _logger.error(f"unknown item type: {_modelname}") + return _modelname # give up -@celery_app.task -def migrate_usage_reports(from_date, until_date): - # from PublicItemUsageReport to PublicItemUsageReportEs8 - # add cumulative count - ... + +### +# the command itself class Command(BaseCommand): @@ -278,7 +374,7 @@ def handle( call_command("djelme_backend_setup") if clear_state: self._clear_state() - self._migration_start(start=start) + self._check_started_at(start_now=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: self._handle_unchanged(start=start) @@ -327,9 +423,18 @@ def _handle_usage_events(self, *, start: bool): ) if start: # schedule (per-day?) tasks (if --start) self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") - for _from_date, _until_date in _each_date_in_range(...): - migrate_usage_events(_from_date.isoformat(), _until_date.isoformat()) - # TODO: migrate_usage_events.apply_async(...) + _started = self._migration_started_at + _range_start = ( + _started - datetime.timedelta(months=_USAGE_MONTHS_BACK) + ).date + _range_end = _started.date() + datetime.timedelta(days=1) + for _from_date, _until_date in _date_range(_range_start, _range_end): + _from_str = _from_date.isoformat() + _until_str = _until_date.isoformat() + # TODO: .apply_async(...) + migrate_counted_usages(_from_str, _until_str) + migrate_preprint_views(_from_str, _until_str) + migrate_preprint_downloads(_from_str, _until_str) self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): @@ -362,13 +467,18 @@ def _handle_usage_reports(self, *, start: bool): # TODO: migrate_usage_reports.apply_async(...) self.stdout.write("---") - def _migration_start(self, start): - _started_at = es8_metrics.Elastic6To8State.get_started_at() + @functools.cached_property + def _migration_started_at(self): + return es8_metrics.Elastic6To8State.get_started_at() + + def _check_started_at(self, start_now): + _started_at = self._migration_started_at if _started_at: self.stdout.write( f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" ) - elif start: + elif start_now: + del self._migration_started_at # clear cache _started_at = es8_metrics.Elastic6To8State.set_started_at_now() self.stdout.write( f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 4980358dc5f..3be81e9262e 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -89,6 +89,7 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): item_osfid: str item_type: str item_public: bool + provider_id: str user_is_authenticated: bool action_labels: list[str] pageview_info: PageviewInfo From 7d554b66ba27447f79179c8e997427215302a11b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Apr 2026 14:14:20 -0400 Subject: [PATCH 09/22] wip --- .../commands/migrate_osfmetrics_6to8.py | 189 +++++++++++------- osf/metrics/es8_metrics.py | 9 +- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 130 insertions(+), 76 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 812322657a9..f0e1147f025 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -7,15 +7,16 @@ from django.core.management import call_command from django.core.management.base import BaseCommand from elasticsearch6 import helpers as es6_helpers -from elasticsearch8 import helpers as es8_helpers +from elasticsearch6_dsl.connections import connections as es6_connections +from elasticsearch8.dsl.connections import connections as es8_connections from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me from framework.celery_tasks import app as celery_app from osf.metadata import rdfutils from osf.metrics.preprint_metrics import ( - PreprintView as PreprintViewEs6, - PreprintDownload as PreprintDownloadEs6, + PreprintView, + PreprintDownload, ) from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports @@ -29,7 +30,7 @@ ### # constants -_USAGE_MONTHS_BACK = 3 +_USAGE_DAYS_BACK = 99 _MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control @@ -61,58 +62,76 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) - - if issubclass(_es8_recordtype, djel8me.CyclicRecord): - - def _new_es8_record(source_dict): - _kwargs = dict(_convert_cyclicrecord_kwargs(source_dict)) - return _es8_recordtype(**_kwargs) - - else: # no conversion needed for event record with unchanged fields - - def _new_es8_record(source_dict): - return _es8_recordtype(**source_dict) - - def _each_new(): - for _hit in _es6_scan_all(_es6_recordtype): - yield _new_es8_record(_hit["_source"]) - - _debug_migrate(_each_new()) - # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) + _convert_kwargs = ( + _convert_unchanged_cyclicrecord_kwargs + if issubclass(_es8_recordtype, djel8me.CyclicRecord) + else (lambda _kw: _kw) # no conversion needed for event record + ) + _each_new = ( + _es8_recordtype(**_convert_kwargs(_hit["_source"])) + for _hit in _es6_scan_all(_es6_recordtype) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(_es8_recordtype, _each_new) # TODO: @celery_app.task def migrate_counted_usages(from_when: str, until_when: str): # CountedAuthUsage => OsfCountedUsageRecord - def _each_new(): - for _hit in _es6_scan_all(CountedUsageEs6, from_when, until_when): - yield _convert_counted_usage(_hit["_source"]) - - _debug_migrate(_each_new()) + _each_new = ( + _convert_counted_usage(_hit["_source"]) + for _hit in _es6_scan_range(CountedUsageEs6, from_when, until_when) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) # TODO: @celery_app.task -def migrate_preprint_views(from_date: str, until_date: str): - # convert to counted-usage - ... +def migrate_preprint_views(from_when: str, until_when: str): + # PreprintView => OsfCountedUsageRecord + _action_labels = ['view', 'web'] + _each_new = ( + _convert_preprint_metric(_hit["_source"], _action_labels) + for _hit in _es6_scan_range(PreprintView, from_when, until_when) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) # TODO: @celery_app.task -def migrate_preprint_downloads(from_date: str, until_date: str): - # convert to counted-usage - ... +def migrate_preprint_downloads(from_when: str, until_when: str): + # PreprintDownload => OsfCountedUsageRecord + _action_labels = ['download'] + _each_new = ( + _convert_preprint_metric(_hit["_source"], _action_labels) + for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) # TODO: @celery_app.task -def migrate_usage_reports(from_date, until_date): +def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 # add cumulative count - ... + def _each_new(): + for _hit in _es6_scan_all(CountedUsageEs6, query=...): + yield ...(_hit["_source"]) + + _debug_migrate(_each_new) + # TODO: return _es8_bulk_save(PublicItemUsageReportEs8, _each_new) ### # various helper functions +def _es6_connection(): + return es6_connections.get_connection('osfmetrics_es6') + + +def _es8_connection(): + return es8_connections.get_connection('osfmetrics_es8') + def _delete_all(recordtype): # TODO: REMOVE THIS @@ -131,11 +150,15 @@ def _delete_all_es8(): def _debug_migrate(each_new): # TODO: remove this for _each in each_new: - pprint(_each.to_dict()) + pprint(_each.to_dict(include_meta=True)) -def _do_migrate(es8_client, each_new): - es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) +def _es8_bulk_save(es8_recordtype, each_new_record): + _success_count, _fail_count = es8_recordtype.bulk( + each_new_record, + stats_only=True, + ) + return _success_count def _date_range( @@ -150,18 +173,19 @@ def _date_range( (_from_date, _until_date) = (_until_date, _until_date + step) -def _es6_scan_all(es6_recordtype): +def _es6_scan_all(es6_recordtype, query=None): return es6_helpers.scan( - es6_recordtype._get_connection(), + _es6_connection(), index=es6_recordtype._template_pattern, + query=query, ) def _es6_scan_range(es6_recordtype, from_when: str, until_when: str): return es6_helpers.scan( - es6_recordtype._get_connection(), + _es6_connection(), index=es6_recordtype._template_pattern, - query={"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + query={"query": {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}}, ) @@ -218,7 +242,7 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): _es6_fields = set(_get_es6_field_names(es6_recordtype)) _es8_fields = set(es8_recordtype._get_field_names()) - # remove fields intentionally removed/renamed in migration + # remove fields intentionally removed in migration if issubclass(es6_recordtype, es6_reports.DailyReport): assert issubclass(es8_recordtype, djel8me.CyclicRecord) _es6_fields.remove("timestamp") @@ -250,17 +274,19 @@ def _semverish_from_date(given_date: str): return f"{_d.year}.{_d.month}.{_d.day}" -def _convert_cyclicrecord_kwargs(es6_source: dict): - for _key, _val in es6_source.items(): - if _key == "report_yearmonth": - # report_yearmonth converts to cycle_coverage Y.M - yield ("cycle_coverage", _semverish_from_yearmonth(_val)) - elif _key == "report_date": - # report_date converts to cycle_coverage Y.M.D - yield ("cycle_coverage", _semverish_from_date(_val)) - elif _key != "timestamp": - # skipping timestamp; on daily/monthly reports just copied from yearmonth/date - yield (_key, _val) +def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict: + def _each_kwarg(): + for _key, _val in es6_source.items(): + if _key == "report_yearmonth": + # report_yearmonth converts to cycle_coverage Y.M + yield ("cycle_coverage", _semverish_from_yearmonth(_val)) + elif _key == "report_date": + # report_date converts to cycle_coverage Y.M.D + yield ("cycle_coverage", _semverish_from_date(_val)) + elif _key != "timestamp": + # skipping timestamp; on daily/monthly reports just copied from yearmonth/date + yield (_key, _val) + return dict(_each_kwarg()) def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: @@ -276,19 +302,40 @@ def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: _item_iri, # correct mistake; make inclusive-within aggregations easier *( _iri_from_osfid(_within_osfid) - for _within_osfid in source_dict["surrounding_guids"] + for _within_osfid in source_dict.get("surrounding_guids", ()) ), ], # fields from OsfCountedUsageRecord item_osfid=source_dict["item_guid"], item_type=_convert_item_type(source_dict), item_public=source_dict["item_public"], - provider_id=source_dict["provider_id"], + provider_id=source_dict.get("provider_id"), user_is_authenticated=source_dict["user_is_authenticated"], action_labels=source_dict["action_labels"], - pageview_info=source_dict[ - "pageview_info" - ], # TODO: does this need the PageviewInfo object? + # TODO: does this need the PageviewInfo object? + pageview_info=source_dict.get("pageview_info"), + ) + + +def _convert_preprint_metric(source_dict, action_labels: list[str]) -> es8_metrics.OsfCountedUsageRecord: + _preprint_iri = _iri_from_osfid(source_dict["preprint_id"]) + return es8_metrics.OsfCountedUsageRecord.record( + using=False, # don't save yet; will save in bulk + # fields used to compute a sessionhour_id: + timestamp=source_dict["timestamp"], + user_id=source_dict['user_id'], # TODO: handle None? + # fields from djelme.CountedUsageRecord: + platform_iri=website_settings.DOMAIN, + # TODO: database_iri=provider iri + item_iri=_preprint_iri, + within_iris=[_preprint_iri], + # fields from OsfCountedUsageRecord: + item_osfid=source_dict["preprint_id"], + item_type=rdfutils.OSF.Preprint, + item_public=True, + provider_id=source_dict["provider_id"], + user_is_authenticated=bool(source_dict["user_id"]), + action_labels=action_labels, ) @@ -301,7 +348,11 @@ def _convert_item_type(es6_usage_dict): previous item_types use `type(osf_model).__name__.lower()` """ - _modelname = es6_usage_dict["item_type"] + try: + _modelname = es6_usage_dict["item_type"] + except KeyError: + # this probably only happens in fake data + return None assert isinstance(_modelname, str) match _modelname: case "osfuser": @@ -320,11 +371,11 @@ def _convert_item_type(es6_usage_dict): if es6_usage_dict.get("surrounding_guids") else rdfutils.OSF.Project ) - case _ if "file" in _modelname: + case _ if "file" in _modelname: # hack for the many "filenode" models return rdfutils.OSF.File - case _: + case _: # give up gracefully _logger.error(f"unknown item type: {_modelname}") - return _modelname # give up + return _modelname ### @@ -406,13 +457,13 @@ def _handle_usage_events(self, *, start: bool): # for counted-usage events: # TODO: last X months only # display counts for each view/download event type - _es6_pview_count = PreprintViewEs6.search().count() - _es6_pdownload_count = PreprintDownloadEs6.search().count() + _es6_pview_count = PreprintView.search().count() + _es6_pdownload_count = PreprintDownload.search().count() _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed("es6", PreprintViewEs6, _es6_pview_count) - self._write_tabbed("es6", PreprintDownloadEs6, _es6_pdownload_count) + self._write_tabbed("es6", PreprintView, _es6_pview_count) + self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) self._write_tabbed("es6", "(total to migrate)", _es6_count) self._write_tabbed( @@ -425,8 +476,8 @@ def _handle_usage_events(self, *, start: bool): self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") _started = self._migration_started_at _range_start = ( - _started - datetime.timedelta(months=_USAGE_MONTHS_BACK) - ).date + _started - datetime.timedelta(days=_USAGE_DAYS_BACK) + ).date() _range_end = _started.date() + datetime.timedelta(days=1) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 3be81e9262e..1824fcf2b3f 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -92,9 +92,10 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): provider_id: str user_is_authenticated: bool action_labels: list[str] - pageview_info: PageviewInfo + pageview_info: PageviewInfo | None - def save(self, *args, **kwargs): + def clean(self): + super().clean() # autofill pageview_info fields if self.pageview_info: self.pageview_info.hour_of_day = self.timestamp.hour @@ -104,7 +105,9 @@ def save(self, *args, **kwargs): _ref_url = self.pageview_info.referer_url if _ref_url: self.pageview_info.referer_domain = urlsplit(_ref_url).netloc - super().save(*args, **kwargs) + # ensure inclusive "within" + if self.item_iri not in self.within_iris: + self.within_iris = [self.item_iri, *self.within_iris] class ActionLabel(enum.Enum): diff --git a/poetry.lock b/poetry.lock index df08934ef29..09ee8c9749b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" -resolved_reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" +reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" +resolved_reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "d149bb933fd3845714e26920360c34f3224ab0f84a789b3185cf716033a8d4bf" +content-hash = "9aea963ca1a8b23c8e07fa22b34dc23c0f53d1d017edf29aad65a733ab4832fe" diff --git a/pyproject.toml b/pyproject.toml index 4b6f896f39e..a0a08b48047 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "d7e0483972a58b940bec843679c2a8c9b8bcb75c"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "445fcea0aa6b5d07523cd67e959cb14088f15bb0"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 68b38bae8483eb349f785105dd887617e1b046d6 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Apr 2026 17:28:12 -0400 Subject: [PATCH 10/22] wip --- .../commands/migrate_osfmetrics_6to8.py | 338 ++++++++++++++---- osf/metrics/es8_metrics.py | 55 ++- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 320 insertions(+), 81 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index f0e1147f025..acbc43df5dd 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -6,14 +6,18 @@ from django.core.management import call_command from django.core.management.base import BaseCommand +from django.db import OperationalError as DjangoOperationalError +from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError from elasticsearch6 import helpers as es6_helpers from elasticsearch6_dsl.connections import connections as es6_connections +from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError from elasticsearch8.dsl.connections import connections as es8_connections from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me +from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app -from osf.metadata import rdfutils +from osf.metadata.rdfutils import OSF, DCTERMS from osf.metrics.preprint_metrics import ( PreprintView, PreprintDownload, @@ -21,7 +25,9 @@ from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics +from osf.metrics.reporters.public_item_usage import _iter_composite_bucket_keys from osf.metrics.utils import YearMonth +from osf import models as osfdb from website import settings as website_settings @@ -52,12 +58,22 @@ RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, } +_TASK_KWARGS = dict( + autoretry_for=( + DjangoOperationalError, + Elastic6ConnectionError, + Elastic8ConnectionError, + PostgresOperationalError, + ), + max_retries=50, + retry_backoff=True, +) ### # celery tasks -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_unchanged_recordtype(es6_recordtype_name: str): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] @@ -71,66 +87,75 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es8_recordtype(**_convert_kwargs(_hit["_source"])) for _hit in _es6_scan_all(_es6_recordtype) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(_es8_recordtype, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(_es8_recordtype, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_counted_usages(from_when: str, until_when: str): # CountedAuthUsage => OsfCountedUsageRecord _each_new = ( _convert_counted_usage(_hit["_source"]) - for _hit in _es6_scan_range(CountedUsageEs6, from_when, until_when) + for _hit in _es6_scan_range( + CountedUsageEs6, + from_when, + until_when, + addl_filter={"exists": {"field": "item_guid"}}, + ) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_preprint_views(from_when: str, until_when: str): # PreprintView => OsfCountedUsageRecord - _action_labels = ['view', 'web'] + _action_labels = ["view", "web"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintView, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_preprint_downloads(from_when: str, until_when: str): # PreprintDownload => OsfCountedUsageRecord - _action_labels = ['download'] + _action_labels = ["download"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 # add cumulative count def _each_new(): - for _hit in _es6_scan_all(CountedUsageEs6, query=...): - yield ...(_hit["_source"]) + for _hit in _es6_scan_all( + es6_reports.PublicItemUsageReport, + query_body={"query": {"term": {"item_osfid": osfid}}}, + ): + yield _convert_public_usage_report(_hit["_source"]) - _debug_migrate(_each_new) - # TODO: return _es8_bulk_save(PublicItemUsageReportEs8, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) ### # various helper functions + def _es6_connection(): - return es6_connections.get_connection('osfmetrics_es6') + return es6_connections.get_connection("osfmetrics_es6") def _es8_connection(): - return es8_connections.get_connection('osfmetrics_es8') + return es8_connections.get_connection("osfmetrics_es8") def _delete_all(recordtype): @@ -173,19 +198,24 @@ def _date_range( (_from_date, _until_date) = (_until_date, _until_date + step) -def _es6_scan_all(es6_recordtype, query=None): +def _es6_scan_all(es6_recordtype, query_body=None): return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, - query=query, + query=query_body, ) -def _es6_scan_range(es6_recordtype, from_when: str, until_when: str): +def _es6_scan_range(es6_recordtype, from_when: str, until_when: str, addl_filter=None): + _filters = [ + {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + ] + if addl_filter: + _filters.append(addl_filter) return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, - query={"query": {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}}, + query={"query": {"bool": {"filter": _filters}}}, ) @@ -286,59 +316,182 @@ def _each_kwarg(): elif _key != "timestamp": # skipping timestamp; on daily/monthly reports just copied from yearmonth/date yield (_key, _val) + return dict(_each_kwarg()) def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: _item_iri = _iri_from_osfid(source_dict["item_guid"]) + _item_type = _convert_item_type(source_dict) return es8_metrics.OsfCountedUsageRecord( - # fields from djelme.CountedUsageRecord + # fields from djelme.CountedUsageRecord: timestamp=source_dict["timestamp"], sessionhour_id=source_dict["session_id"], platform_iri=source_dict["platform_iri"], - # TODO: database_iri=provider iri + database_iri=_convert_database_iri(source_dict.get("provider_id"), _item_type), item_iri=_item_iri, within_iris=[ - _item_iri, # correct mistake; make inclusive-within aggregations easier - *( - _iri_from_osfid(_within_osfid) - for _within_osfid in source_dict.get("surrounding_guids", ()) - ), + _iri_from_osfid(_within_osfid) + for _within_osfid in source_dict.get("surrounding_guids", ()) ], - # fields from OsfCountedUsageRecord + # fields from OsfCountedUsageRecord: item_osfid=source_dict["item_guid"], - item_type=_convert_item_type(source_dict), + item_type=_item_type, item_public=source_dict["item_public"], provider_id=source_dict.get("provider_id"), user_is_authenticated=source_dict["user_is_authenticated"], action_labels=source_dict["action_labels"], - # TODO: does this need the PageviewInfo object? + # TODO: does this need the PageviewInfo object or is the dictionary fine? pageview_info=source_dict.get("pageview_info"), ) -def _convert_preprint_metric(source_dict, action_labels: list[str]) -> es8_metrics.OsfCountedUsageRecord: +def _convert_preprint_metric( + source_dict, action_labels: list[str] +) -> es8_metrics.OsfCountedUsageRecord: _preprint_iri = _iri_from_osfid(source_dict["preprint_id"]) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: timestamp=source_dict["timestamp"], - user_id=source_dict['user_id'], # TODO: handle None? + user_id=source_dict.get("user_id"), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - # TODO: database_iri=provider iri + database_iri=_convert_database_iri( + source_dict.get("provider_id"), OSF.Preprint + ), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: item_osfid=source_dict["preprint_id"], - item_type=rdfutils.OSF.Preprint, + item_type=OSF.Preprint, item_public=True, - provider_id=source_dict["provider_id"], - user_is_authenticated=bool(source_dict["user_id"]), + provider_id=source_dict.get("provider_id"), + user_is_authenticated=bool(source_dict.get("user_id")), action_labels=action_labels, ) +def _convert_public_usage_report(source_dict) -> es8_metrics.PublicItemUsageReportEs8: + _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( + osfid=source_dict["item_osfid"], + until_when=YearMonth.from_str(source_dict["report_yearmonth"]).month_end(), + item_type=source_dict.get("item_type"), + ) + return es8_metrics.PublicItemUsageReportEs8( + item_osfid=source_dict["item_osfid"], + item_type=source_dict.get("item_type"), + provider_id=source_dict.get("provider_id"), + platform_iri=source_dict.get("platform_iri"), + view_count=source_dict.get("view_count"), + view_session_count=source_dict.get("view_session_count"), + cumulative_view_count=_c_views, + cumulative_view_session_count=_c_view_sess, + download_count=source_dict.get("download_count"), + download_session_count=source_dict.get("download_session_count"), + cumulative_download_count=_c_downloads, + cumulative_download_session_count=_c_download_sess, + ) + + +def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): + if item_type == "preprint": + _views = _cumulative_preprint_count(PreprintView, osfid, until_when) + _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) + _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) + else: + _views, _view_sess = _cumulative_countedusage_views(osfid, until_when) + _downloads, _download_sess = _cumulative_countedusage_downloads( + osfid, until_when + ) + return (_views, _view_sess, _downloads, _download_sess) + + +def _cumulative_countedusage_views( + osfid: str, until_when: str +) -> tuple[int, int]: + """compute view_session_count separately to avoid double-counting + + (the same session may be represented in both the composite agg on `item_guid` + and that on `surrounding_guids`) + """ + # copied/adapted from osf.metrics.reporters.public_item_usage + _search = ( + CountedUsageEs6.search() + .filter("term", item_public=True) + .filter("range", timestamp={"lt": until_when}) + .filter("term", action_labels="view") + .filter( + "bool", + should=[ + {"term": {"item_guid": osfid}}, + {"term": {"surrounding_guids": osfid}}, + ], + minimum_should_match=1, + ) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.metric( + "agg_session_count", + "cardinality", + field="session_id", + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _view_count = _response.hits.total + _view_session_count = ( + _response.aggregations.agg_session_count.value + if "agg_session_count" in _response.aggregations + else 0 + ) + return (_view_count, _view_session_count) + + +def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]: + """aggregate downloads on each osfid (not including components/files)""" + # copied/adapted from osf.metrics.reporters.public_item_usage + _search = ( + CountedUsageEs6.search() + .filter("term", item_public=True) + .filter("range", timestamp={"lt": until_when}) + .filter("term", action_labels="download") + .filter("term", item_guid=osfid) + ) + _search.aggs.metric( + "agg_session_count", + "cardinality", + field="session_id", + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _download_count = _response.hits.total + _download_session_count = ( + _response.aggregations.agg_session_count.value + if "agg_session_count" in _response.aggregations + else 0 + ) + return (_download_count, _download_session_count) + + +def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int: + """aggregate views on each preprint""" + # copied/adapted from osf.metrics.preprint_metrics + _search = ( + preprint_metric_cls.search() + .filter("term", preprint_id=osfid) + .filter("range", timestamp={"lt": until_when}) + .extra(size=0) # no hits; only aggs + ) + _search.aggs.metric("agg_count", "sum", field="count") + _response = _search.execute() + _view_count = ( + int(_response.aggregations.agg_count.value) + if hasattr(_response.aggregations, "agg_count") + else 0 + ) + return _view_count + + def _iri_from_osfid(osfid: str) -> str: return f"{website_settings.DOMAIN}{osfid}" @@ -348,34 +501,83 @@ def _convert_item_type(es6_usage_dict): previous item_types use `type(osf_model).__name__.lower()` """ - try: - _modelname = es6_usage_dict["item_type"] - except KeyError: - # this probably only happens in fake data - return None - assert isinstance(_modelname, str) + _modelname = es6_usage_dict.get("item_type") match _modelname: + case "" | None: + return OSF.Object case "osfuser": - return rdfutils.DCTERMS.Agent + return DCTERMS.Agent case "preprint": - return rdfutils.OSF.Preprint + return OSF.Preprint case "registration": return ( - rdfutils.OSF.RegistrationComponent + OSF.RegistrationComponent if es6_usage_dict.get("surrounding_guids") - else rdfutils.OSF.Registration + else OSF.Registration ) case "node": return ( - rdfutils.OSF.ProjectComponent + OSF.ProjectComponent if es6_usage_dict.get("surrounding_guids") - else rdfutils.OSF.Project + else OSF.Project ) case _ if "file" in _modelname: # hack for the many "filenode" models - return rdfutils.OSF.File + return OSF.File case _: # give up gracefully - _logger.error(f"unknown item type: {_modelname}") - return _modelname + return OSF.Object + + +@functools.lru_cache +def _convert_database_iri(provider_id: str | None, item_type_iri: str) -> str: + if not provider_id: + return website_settings.DOMAIN # osf is a provider, sure why not + + def _fallback_iri(): + return f"urn:osf.io:{provider_id}" + + match item_type_iri: + case OSF.ProjectComponent | OSF.Project | DCTERMS.Agent: + # implicit "osf" provider + return website_settings.DOMAIN + case OSF.Preprint: + try: + _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) + except osfdb.PreprintProvider.DoesNotExist: + _logger.error(f"unknown preprint provider {provider_id!r}") + return _fallback_iri() + else: + return _provider.get_semantic_iri() + case OSF.RegistrationComponent | OSF.Registration: + try: + _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) + except osfdb.RegistrationProvider.DoesNotExist: + _logger.error(f"unknown registration provider {provider_id!r}") + return _fallback_iri() + else: + return _provider.get_semantic_iri() + case OSF.File: + # file providers are a different thing that don't really have an iri, just an id + return _fallback_iri() + case _: # give up gracefully + _logger.error( + f"unknown item type {item_type_iri!r} with provider {provider_id!r}" + ) + return _fallback_iri() + + +def _each_usage_report_osfid(started_at, after_osfid=None): + _search = ( + es6_reports.PublicItemUsageReport.search() + .filter("range", timestamp={"lt": started_at}) + .extra(size=0) + ) + _search.aggs.bucket( + "agg_osfid", + "composite", + sources=[{"osfid": {"terms": {"field": "item_osfid"}}}], + size=500, + ) + return _iter_composite_bucket_keys(_search, "agg_osfid", "osfid", after=after_osfid) ### @@ -449,8 +651,7 @@ def _handle_unchanged(self, *, start: bool): ) if start: # schedule task self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) - migrate_unchanged_recordtype(_es6_cls.__name__) - # TODO: migrate_unchanged_recordtype.apply_async(...) + migrate_unchanged_recordtype.delay(_es6_cls.__name__) self.stdout.write("---") def _handle_usage_events(self, *, start: bool): @@ -475,17 +676,14 @@ def _handle_usage_events(self, *, start: bool): if start: # schedule (per-day?) tasks (if --start) self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") _started = self._migration_started_at - _range_start = ( - _started - datetime.timedelta(days=_USAGE_DAYS_BACK) - ).date() + _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() _range_end = _started.date() + datetime.timedelta(days=1) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() - # TODO: .apply_async(...) - migrate_counted_usages(_from_str, _until_str) - migrate_preprint_views(_from_str, _until_str) - migrate_preprint_downloads(_from_str, _until_str) + migrate_counted_usages.delay(_from_str, _until_str) + migrate_preprint_views.delay(_from_str, _until_str) + migrate_preprint_downloads.delay(_from_str, _until_str) self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): @@ -515,7 +713,11 @@ def _handle_usage_reports(self, *, start: bool): self.stdout.write( f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" ) - # TODO: migrate_usage_reports.apply_async(...) + for _osfid in _each_usage_report_osfid( + started_at=self._migration_started_at + ): + migrate_usage_reports(_osfid) + # TODO: migrate_usage_reports.apply_async(...) self.stdout.write("---") @functools.cached_property diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 1824fcf2b3f..fd8475b1bc3 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -77,19 +77,24 @@ class PageviewInfo(esdsl.InnerDoc): class OsfCountedUsageRecord(djelme.CountedUsageRecord): ''' - - inherited fields: - platform_iri: str - database_iri: str - item_iri: str - sessionhour_id: str - within_iris: list[str] + Aim to support a COUNTER-style reporting api + https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html ''' - # osf-specific fields + + # inherited fields: + # timestamp: datetime.datetime + # platform_iri: str + # database_iri: str + # item_iri: str + # sessionhour_id: str + # within_iris: list[str] + + # osf-specific fields: item_osfid: str item_type: str item_public: bool - provider_id: str + provider_id: str | None user_is_authenticated: bool action_labels: list[str] pageview_info: PageviewInfo | None @@ -109,6 +114,38 @@ def clean(self): if self.item_iri not in self.within_iris: self.within_iris = [self.item_iri, *self.within_iris] + def _get_unique_together_values(self): + """get "unique together" values for "ON CONFLICT UPDATE" behavior + + override djelme.BaseDjelmeRecord._get_unique_together_values + for more complex logic than UNIQUE_TOGETHER_FIELDS + to slightly better approximate `counter:Double-Click Filtering` + """ + # note: copied from osf.metrics.counted_usage._fill_document_id + target_identifier = ( + self.pageview_info.page_url + if self.pageview_info is not None and self.pageview_info.page_url is not None + else self.item_osfid + ) + # slice the day into an array of 30-second windows, + # find this timestamp's windowslice index + day_start = datetime.datetime( + self.timestamp.year, + self.timestamp.month, + self.timestamp.day, + tzinfo=datetime.UTC, + ) + time_in_seconds = (self.timestamp - day_start).total_seconds() + time_window = int(time_in_seconds / 30) # 30-second windows + return ( # unique-together values: + self.platform_iri, + target_identifier, + self.sessionhour_id, + self.timestamp.date(), + time_window, + ','.join(sorted(self.action_labels)), + ) + class ActionLabel(enum.Enum): SEARCH = 'search' # counter:Search diff --git a/poetry.lock b/poetry.lock index 09ee8c9749b..14113d228b3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" -resolved_reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" +reference = "a1e00e468830a40758caa8afa4b838821471f5c1" +resolved_reference = "a1e00e468830a40758caa8afa4b838821471f5c1" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "9aea963ca1a8b23c8e07fa22b34dc23c0f53d1d017edf29aad65a733ab4832fe" +content-hash = "1ba293f397fef29212fc58bfb8e08753f64bf43471a6fd2eb9d71bfded4ae326" diff --git a/pyproject.toml b/pyproject.toml index a0a08b48047..f7e6eb5bb41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "445fcea0aa6b5d07523cd67e959cb14088f15bb0"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "a1e00e468830a40758caa8afa4b838821471f5c1"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 69daa8744cdc5d47c3ba3fec571c564624d39dbd Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 10:11:50 -0400 Subject: [PATCH 11/22] wip --- docker-compose.yml | 7 +- .../commands/fake_metrics_reports.py | 19 +++ .../commands/migrate_osfmetrics_6to8.py | 150 ++++++++++-------- osf/metrics/es8_metrics.py | 41 ++++- poetry.lock | 8 +- pyproject.toml | 2 +- website/settings/defaults.py | 1 + 7 files changed, 157 insertions(+), 71 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 83e8fd27483..d771c75797a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,8 @@ services: # Temporary: Remove when we've upgraded to ES6 elasticsearch6: image: docker.elastic.co/elasticsearch/elasticsearch:6.3.1 + environment: + - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage ports: - 9201:9200 volumes: @@ -91,10 +93,9 @@ services: - elasticsearch8_data_vol:/usr/share/elasticsearch/data healthcheck: start_period: 15s - test: ["CMD", "curl", "-sf", "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s"] + test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' interval: 10s - timeout: 30s - retries: 5 + retries: 30 stdin_open: true postgres: diff --git a/osf/management/commands/fake_metrics_reports.py b/osf/management/commands/fake_metrics_reports.py index 765d6e475c1..53e13472e74 100644 --- a/osf/management/commands/fake_metrics_reports.py +++ b/osf/management/commands/fake_metrics_reports.py @@ -8,6 +8,8 @@ UserSummaryReport, PreprintSummaryReport, ) +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.models import PreprintProvider @@ -53,10 +55,27 @@ def fake_preprint_counts(days_back): ).save() +def fake_usage_reports(osfid: str, count: int): + _ym = YearMonth.from_date(date.today()).prior() + for _months in range(count): + PublicItemUsageReport.record( + item_osfid=osfid, + report_yearmonth=_ym, + view_count=(_vc := randint(0, 500)), + view_session_count=randint(0, _vc), + download_count=(_dc := randint(0, 300)), + download_session_count=randint(0, _dc), + ) + _ym = _ym.prior() + + class Command(BaseCommand): def handle(self, *args, **kwargs): if not settings.DEBUG: raise NotImplementedError('fake_reports requires DEBUG mode') fake_user_counts(1000) fake_preprint_counts(1000) + fake_usage_reports('blarg', 100) + fake_usage_reports('blerg', 50) + fake_usage_reports('bleg', 50) # TODO: more reports diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index acbc43df5dd..c72765ab261 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -65,8 +65,8 @@ Elastic8ConnectionError, PostgresOperationalError, ), - max_retries=50, - retry_backoff=True, + retry_backoff=True, # exponential backoff, with jitter + max_retries=20, ) ### @@ -87,8 +87,8 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es8_recordtype(**_convert_kwargs(_hit["_source"])) for _hit in _es6_scan_all(_es6_recordtype) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(_es8_recordtype, _each_new) + _debug_migrate(_each_new) + # return _es8_bulk_save(_es8_recordtype, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -103,8 +103,8 @@ def migrate_counted_usages(from_when: str, until_when: str): addl_filter={"exists": {"field": "item_guid"}}, ) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + _debug_migrate(_each_new) + #return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -115,8 +115,8 @@ def migrate_preprint_views(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintView, from_when, until_when) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -127,8 +127,8 @@ def migrate_preprint_downloads(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -136,14 +136,23 @@ def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 # add cumulative count def _each_new(): - for _hit in _es6_scan_all( + _each_hit = _es6_scan_all( es6_reports.PublicItemUsageReport, query_body={"query": {"term": {"item_osfid": osfid}}}, - ): - yield _convert_public_usage_report(_hit["_source"]) + ) + # only a few dozen of these per item; fine to hold all at once + _sorted_sources = sorted( + (_hit["_source"] for _hit in _each_hit), + key=lambda _s: _s["report_yearmonth"], + ) + _prior_report = None + for _source in _sorted_sources: + yield ( + _prior_report := _convert_public_usage_report(_source, _prior_report) + ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) + _debug_migrate(_each_new()) + # return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) ### @@ -175,6 +184,7 @@ def _delete_all_es8(): def _debug_migrate(each_new): # TODO: remove this for _each in each_new: + _each.full_clean() pprint(_each.to_dict(include_meta=True)) @@ -320,75 +330,89 @@ def _each_kwarg(): return dict(_each_kwarg()) -def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: - _item_iri = _iri_from_osfid(source_dict["item_guid"]) - _item_type = _convert_item_type(source_dict) +def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: + _item_iri = _iri_from_osfid(source["item_guid"]) + _item_type = _convert_item_type(source) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: - timestamp=source_dict["timestamp"], - sessionhour_id=source_dict["session_id"], - platform_iri=source_dict["platform_iri"], - database_iri=_convert_database_iri(source_dict.get("provider_id"), _item_type), + timestamp=source["timestamp"], + sessionhour_id=source["session_id"], + platform_iri=source.get("platform_iri") or website_settings.DOMAIN, + database_iri=_convert_database_iri(source.get("provider_id"), _item_type), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) - for _within_osfid in source_dict.get("surrounding_guids", ()) + for _within_osfid in source.get("surrounding_guids", ()) ], # fields from OsfCountedUsageRecord: - item_osfid=source_dict["item_guid"], + item_osfid=source["item_guid"], item_type=_item_type, - item_public=source_dict["item_public"], - provider_id=source_dict.get("provider_id"), - user_is_authenticated=source_dict["user_is_authenticated"], - action_labels=source_dict["action_labels"], + item_public=source["item_public"], + provider_id=source.get("provider_id"), + user_is_authenticated=source["user_is_authenticated"], + action_labels=source["action_labels"], # TODO: does this need the PageviewInfo object or is the dictionary fine? - pageview_info=source_dict.get("pageview_info"), + pageview_info=source.get("pageview_info"), ) def _convert_preprint_metric( - source_dict, action_labels: list[str] + source: dict, action_labels: list[str] ) -> es8_metrics.OsfCountedUsageRecord: - _preprint_iri = _iri_from_osfid(source_dict["preprint_id"]) + _preprint_iri = _iri_from_osfid(source["preprint_id"]) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: - timestamp=source_dict["timestamp"], - user_id=source_dict.get("user_id"), + timestamp=source["timestamp"], + user_id=source.get("user_id"), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri( - source_dict.get("provider_id"), OSF.Preprint - ), + database_iri=_convert_database_iri(source.get("provider_id"), OSF.Preprint), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: - item_osfid=source_dict["preprint_id"], + item_osfid=source["preprint_id"], item_type=OSF.Preprint, item_public=True, - provider_id=source_dict.get("provider_id"), - user_is_authenticated=bool(source_dict.get("user_id")), + provider_id=source.get("provider_id"), + user_is_authenticated=bool(source.get("user_id")), action_labels=action_labels, ) -def _convert_public_usage_report(source_dict) -> es8_metrics.PublicItemUsageReportEs8: - _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( - osfid=source_dict["item_osfid"], - until_when=YearMonth.from_str(source_dict["report_yearmonth"]).month_end(), - item_type=source_dict.get("item_type"), - ) +def _convert_public_usage_report( + source: dict, + prior_report: es8_metrics.PublicItemUsageReportEs8 | None, +) -> es8_metrics.PublicItemUsageReportEs8: + if prior_report is None: + _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( + osfid=source["item_osfid"], + until_when=YearMonth.from_str(source["report_yearmonth"]).month_end(), + item_type=source.get("item_type"), + ) + else: + _c_views = prior_report.cumulative_view_count + source.get("view_count", 0) + _c_view_sess = prior_report.cumulative_view_session_count + source.get( + "view_session_count", 0 + ) + _c_downloads = prior_report.cumulative_download_count + source.get( + "download_count", 0 + ) + _c_download_sess = prior_report.cumulative_download_session_count + source.get( + "download_session_count", 0 + ) return es8_metrics.PublicItemUsageReportEs8( - item_osfid=source_dict["item_osfid"], - item_type=source_dict.get("item_type"), - provider_id=source_dict.get("provider_id"), - platform_iri=source_dict.get("platform_iri"), - view_count=source_dict.get("view_count"), - view_session_count=source_dict.get("view_session_count"), + cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + item_osfid=source["item_osfid"], + item_type=source.get("item_type"), + provider_id=source.get("provider_id"), + platform_iri=source.get("platform_iri") or website_settings.DOMAIN, + view_count=source.get("view_count"), + view_session_count=source.get("view_session_count"), cumulative_view_count=_c_views, cumulative_view_session_count=_c_view_sess, - download_count=source_dict.get("download_count"), - download_session_count=source_dict.get("download_session_count"), + download_count=source.get("download_count"), + download_session_count=source.get("download_session_count"), cumulative_download_count=_c_downloads, cumulative_download_session_count=_c_download_sess, ) @@ -407,9 +431,7 @@ def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): return (_views, _view_sess, _downloads, _download_sess) -def _cumulative_countedusage_views( - osfid: str, until_when: str -) -> tuple[int, int]: +def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]: """compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` @@ -651,7 +673,8 @@ def _handle_unchanged(self, *, start: bool): ) if start: # schedule task self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) - migrate_unchanged_recordtype.delay(_es6_cls.__name__) + #migrate_unchanged_recordtype.delay(_es6_cls.__name__) + migrate_unchanged_recordtype(_es6_cls.__name__) self.stdout.write("---") def _handle_usage_events(self, *, start: bool): @@ -681,9 +704,12 @@ def _handle_usage_events(self, *, start: bool): for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() - migrate_counted_usages.delay(_from_str, _until_str) - migrate_preprint_views.delay(_from_str, _until_str) - migrate_preprint_downloads.delay(_from_str, _until_str) + # migrate_counted_usages.delay(_from_str, _until_str) + # migrate_preprint_views.delay(_from_str, _until_str) + # migrate_preprint_downloads.delay(_from_str, _until_str) + migrate_counted_usages(_from_str, _until_str) + migrate_preprint_views(_from_str, _until_str) + migrate_preprint_downloads(_from_str, _until_str) self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): @@ -709,7 +735,7 @@ def _handle_usage_reports(self, *, start: bool): ) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts - if start: # schedule per-item tasks + if start: self.stdout.write( f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" ) @@ -717,7 +743,7 @@ def _handle_usage_reports(self, *, start: bool): started_at=self._migration_started_at ): migrate_usage_reports(_osfid) - # TODO: migrate_usage_reports.apply_async(...) + # TODO: migrate_usage_reports.delay(...) self.stdout.write("---") @functools.cached_property diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index fd8475b1bc3..67fee676112 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -3,7 +3,7 @@ from urllib.parse import urlsplit import elasticsearch8.dsl as esdsl -from elasticsearch_metrics import DAILY, MONTHLY +from elasticsearch_metrics import DAILY, MONTHLY, YEARLY import elasticsearch_metrics.imps.elastic8 as djelme from osf.metrics.utils import YearMonth @@ -233,12 +233,18 @@ class StorageAddonUsageEs8(djelme.CyclicRecord): usage_by_addon: list[UsageByStorageAddon] + class Meta: + timeseries_index_timedepth = YEARLY + class DownloadCountReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY daily_file_downloads: int + class Meta: + timeseries_index_timedepth = YEARLY + class InstitutionSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -252,6 +258,9 @@ class InstitutionSummaryReportEs8(djelme.CyclicRecord): registered_nodes: RegistrationRunningTotals registered_projects: RegistrationRunningTotals + class Meta: + timeseries_index_timedepth = MONTHLY + class NewUserDomainReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -260,6 +269,9 @@ class NewUserDomainReportEs8(djelme.CyclicRecord): domain_name: str new_user_count: int + class Meta: + timeseries_index_timedepth = MONTHLY + class NodeSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -269,12 +281,18 @@ class NodeSummaryReportEs8(djelme.CyclicRecord): registered_nodes: RegistrationRunningTotals registered_projects: RegistrationRunningTotals + class Meta: + timeseries_index_timedepth = YEARLY + class OsfstorageFileCountReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY files: FileRunningTotals + class Meta: + timeseries_index_timedepth = YEARLY + class PreprintSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -283,6 +301,9 @@ class PreprintSummaryReportEs8(djelme.CyclicRecord): provider_key: str preprint_count: int + class Meta: + timeseries_index_timedepth = MONTHLY + class UserSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -294,6 +315,9 @@ class UserSummaryReportEs8(djelme.CyclicRecord): new_users_with_institution_daily: int unconfirmed: int + class Meta: + timeseries_index_timedepth = YEARLY + class SpamSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -310,6 +334,9 @@ class SpamSummaryReportEs8(djelme.CyclicRecord): user_marked_as_spam: int user_marked_as_ham: int + class Meta: + timeseries_index_timedepth = YEARLY + class InstitutionalUserReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -333,6 +360,9 @@ class InstitutionalUserReportEs8(djelme.CyclicRecord): public_file_count: int = esdsl.mapped_field(esdsl.Long()) storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + class Meta: + timeseries_index_timedepth = MONTHLY + class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -350,6 +380,9 @@ class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long()) monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) + class Meta: + timeseries_index_timedepth = YEARLY + class PublicItemUsageReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -375,6 +408,9 @@ class PublicItemUsageReportEs8(djelme.CyclicRecord): cumulative_download_count: int = esdsl.mapped_field(esdsl.Long()) cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long()) + class Meta: + timeseries_index_timedepth = MONTHLY + class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -388,6 +424,9 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): preprint_akismet_flagged: int preprint_akismet_hammed: int + class Meta: + timeseries_index_timedepth = YEARLY + ### # data migration state diff --git a/poetry.lock b/poetry.lock index 14113d228b3..7aee4eca49f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1085,7 +1085,7 @@ Django = ">=2.0" [[package]] name = "django-elasticsearch-metrics" -version = "2026.0.3" +version = "2026.0.4" description = "Django app for storing time-series metrics in Elasticsearch." optional = false python-versions = ">=3.10,<4" @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "a1e00e468830a40758caa8afa4b838821471f5c1" -resolved_reference = "a1e00e468830a40758caa8afa4b838821471f5c1" +reference = "fed3c14f213642284a197ac2933106cdafede25b" +resolved_reference = "fed3c14f213642284a197ac2933106cdafede25b" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "1ba293f397fef29212fc58bfb8e08753f64bf43471a6fd2eb9d71bfded4ae326" +content-hash = "0f9c547a6309aa915b25f9a7a98e5d0c15c867d577a883547d894ca173cb2344" diff --git a/pyproject.toml b/pyproject.toml index f7e6eb5bb41..b04e0540d90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "a1e00e468830a40758caa8afa4b838821471f5c1"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "fed3c14f213642284a197ac2933106cdafede25b"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 3053f9d1075..dc69126ca37 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -557,6 +557,7 @@ class CeleryConfig: task_routes = ('framework.celery_tasks.routers.CeleryRouter', ) task_ignore_result = True task_store_errors_even_if_ignored = True + result_extended = True broker_url = os.environ.get('BROKER_URL', f'amqp://{RABBITMQ_USERNAME}:{RABBITMQ_PASSWORD}@{RABBITMQ_HOST}:{RABBITMQ_PORT}/{RABBITMQ_VHOST}') broker_use_ssl = False From da7910a86760bfd9a5d581f2c9b35692d5d30670 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 12:11:55 -0400 Subject: [PATCH 12/22] wip --- .../commands/migrate_osfmetrics_6to8.py | 248 ++++++++---------- osf/metrics/es8_metrics.py | 24 +- website/settings/defaults.py | 1 + 3 files changed, 125 insertions(+), 148 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index c72765ab261..5ee937e80c0 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -17,7 +17,6 @@ from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app -from osf.metadata.rdfutils import OSF, DCTERMS from osf.metrics.preprint_metrics import ( PreprintView, PreprintDownload, @@ -87,8 +86,7 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es8_recordtype(**_convert_kwargs(_hit["_source"])) for _hit in _es6_scan_all(_es6_recordtype) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(_es8_recordtype, _each_new) + return _es8_bulk_save(_es8_recordtype, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -103,8 +101,7 @@ def migrate_counted_usages(from_when: str, until_when: str): addl_filter={"exists": {"field": "item_guid"}}, ) ) - _debug_migrate(_each_new) - #return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -115,8 +112,7 @@ def migrate_preprint_views(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintView, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -127,32 +123,31 @@ def migrate_preprint_downloads(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 - # add cumulative count def _each_new(): + # go in sorted order to build cumulative counts + # (only a few dozen of these per item; should be fine to sort and load all at once) _each_hit = _es6_scan_all( es6_reports.PublicItemUsageReport, - query_body={"query": {"term": {"item_osfid": osfid}}}, - ) - # only a few dozen of these per item; fine to hold all at once - _sorted_sources = sorted( - (_hit["_source"] for _hit in _each_hit), - key=lambda _s: _s["report_yearmonth"], + query_body={ + "query": {"term": {"item_osfid": osfid}}, + "sort": "report_yearmonth", + }, ) _prior_report = None - for _source in _sorted_sources: + for _hit in list(_each_hit): yield ( - _prior_report := _convert_public_usage_report(_source, _prior_report) + _prior_report := _convert_public_usage_report( + _hit["_source"], _prior_report + ) ) - _debug_migrate(_each_new()) - # return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) + return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new()) ### @@ -332,13 +327,12 @@ def _each_kwarg(): def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: _item_iri = _iri_from_osfid(source["item_guid"]) - _item_type = _convert_item_type(source) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: timestamp=source["timestamp"], sessionhour_id=source["session_id"], platform_iri=source.get("platform_iri") or website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), _item_type), + database_iri=_convert_database_iri(source.get("provider_id"), source.get("item_type")), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) @@ -346,11 +340,11 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: ], # fields from OsfCountedUsageRecord: item_osfid=source["item_guid"], - item_type=_item_type, - item_public=source["item_public"], + item_type=source.get("item_type", "osf:Object"), + item_public=source.get("item_public"), provider_id=source.get("provider_id"), - user_is_authenticated=source["user_is_authenticated"], - action_labels=source["action_labels"], + user_is_authenticated=source.get("user_is_authenticated"), + action_labels=source.get("action_labels"), # TODO: does this need the PageviewInfo object or is the dictionary fine? pageview_info=source.get("pageview_info"), ) @@ -367,12 +361,12 @@ def _convert_preprint_metric( user_id=source.get("user_id"), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), OSF.Preprint), + database_iri=_convert_database_iri(source.get("provider_id"), "preprint"), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: item_osfid=source["preprint_id"], - item_type=OSF.Preprint, + item_type="preprint", item_public=True, provider_id=source.get("provider_id"), user_is_authenticated=bool(source.get("user_id")), @@ -402,7 +396,7 @@ def _convert_public_usage_report( "download_session_count", 0 ) return es8_metrics.PublicItemUsageReportEs8( - cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + cycle_coverage=_semverish_from_yearmonth(source["report_yearmonth"]), item_osfid=source["item_osfid"], item_type=source.get("item_type"), provider_id=source.get("provider_id"), @@ -518,50 +512,19 @@ def _iri_from_osfid(osfid: str) -> str: return f"{website_settings.DOMAIN}{osfid}" -def _convert_item_type(es6_usage_dict): - """convert model-name item types to OSFMAP item types - - previous item_types use `type(osf_model).__name__.lower()` - """ - _modelname = es6_usage_dict.get("item_type") - match _modelname: - case "" | None: - return OSF.Object - case "osfuser": - return DCTERMS.Agent - case "preprint": - return OSF.Preprint - case "registration": - return ( - OSF.RegistrationComponent - if es6_usage_dict.get("surrounding_guids") - else OSF.Registration - ) - case "node": - return ( - OSF.ProjectComponent - if es6_usage_dict.get("surrounding_guids") - else OSF.Project - ) - case _ if "file" in _modelname: # hack for the many "filenode" models - return OSF.File - case _: # give up gracefully - return OSF.Object - - @functools.lru_cache -def _convert_database_iri(provider_id: str | None, item_type_iri: str) -> str: +def _convert_database_iri(provider_id: str | None, item_type: str) -> str: if not provider_id: return website_settings.DOMAIN # osf is a provider, sure why not def _fallback_iri(): return f"urn:osf.io:{provider_id}" - match item_type_iri: - case OSF.ProjectComponent | OSF.Project | DCTERMS.Agent: + match item_type: # lower-cased osf.models class names + case "node" | "osfuser": # implicit "osf" provider return website_settings.DOMAIN - case OSF.Preprint: + case "preprint": try: _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) except osfdb.PreprintProvider.DoesNotExist: @@ -569,7 +532,7 @@ def _fallback_iri(): return _fallback_iri() else: return _provider.get_semantic_iri() - case OSF.RegistrationComponent | OSF.Registration: + case "registration": try: _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) except osfdb.RegistrationProvider.DoesNotExist: @@ -577,12 +540,12 @@ def _fallback_iri(): return _fallback_iri() else: return _provider.get_semantic_iri() - case OSF.File: + case _ if "file" in item_type: # file providers are a different thing that don't really have an iri, just an id - return _fallback_iri() + return f"urn:files.osf.io:{provider_id}" case _: # give up gracefully _logger.error( - f"unknown item type {item_type_iri!r} with provider {provider_id!r}" + f"unknown item type {item_type!r} with provider {provider_id!r}" ) return _fallback_iri() @@ -612,6 +575,10 @@ def add_arguments(self, parser): "--no-setup", action="store_true", ) + parser.add_argument( + "--no-counts", + action="store_true", + ) parser.add_argument( "--clear-state", action="store_true", @@ -636,12 +603,13 @@ def add_arguments(self, parser): def handle( self, *, + no_setup, + no_counts, + clear_state, start, unchanged, usage_events, usage_reports, - clear_state, - no_setup, **kwargs, ): self._quiet_chatty_loggers() @@ -652,99 +620,94 @@ def handle( self._check_started_at(start_now=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: - self._handle_unchanged(start=start) + self._handle_unchanged(start=start, no_counts=no_counts) if usage_events or _default_all: - self._handle_usage_events(start=start) + self._handle_usage_events(start=start, no_counts=no_counts) if usage_reports or _default_all: - self._handle_usage_reports(start=start) + self._handle_usage_reports(start=start, no_counts=no_counts) - def _handle_unchanged(self, *, start: bool): + def _handle_unchanged(self, *, start: bool, no_counts: bool): # for each (unchanged) report/event: for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): - # display counts - _es6_count = _es6_cls.search().count() - _es8_count = _es8_cls.search().count() - self._write_tabbed("es6", _es6_cls, _es6_count) + if not no_counts: + # display counts + _es6_count = _es6_cls.search().count() + _es8_count = _es8_cls.search().count() + self._write_tabbed("es6", _es6_cls, _es6_count) + self._write_tabbed( + "es8", + _es8_cls, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + if start: # schedule task + self.stdout.write(f"starting {_es6_cls.__name__} => {_es8_cls.__name__}") + migrate_unchanged_recordtype.delay(_es6_cls.__name__) + + def _handle_usage_events(self, *, start: bool, no_counts: bool): + # for counted-usage events: + _started = self._migration_started_at + _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() + _range_end = _started.date() + datetime.timedelta(days=1) + if not no_counts: + # display counts for each view/download event type + _range_q = {"range": {"timestamp": {"gte": _range_start.isoformat(), "lt": _range_end.isoformat()}}} + _es6_pview_count = PreprintView.search().filter(_range_q).count() + _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count() + _es6_usage_event_count = CountedUsageEs6.search().filter(_range_q).count() + _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() + self._write_tabbed("es6", PreprintView, _es6_pview_count) + self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) + self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed("es6", f"(total between {_range_start} and {_range_end})", _es6_count) self._write_tabbed( "es8", - _es8_cls, + es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) - if start: # schedule task - self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) - #migrate_unchanged_recordtype.delay(_es6_cls.__name__) - migrate_unchanged_recordtype(_es6_cls.__name__) - self.stdout.write("---") - - def _handle_usage_events(self, *, start: bool): - # for counted-usage events: - # TODO: last X months only - # display counts for each view/download event type - _es6_pview_count = PreprintView.search().count() - _es6_pdownload_count = PreprintDownload.search().count() - _es6_usage_event_count = CountedUsageEs6.search().count() - _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count - _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed("es6", PreprintView, _es6_pview_count) - self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) - self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed("es6", "(total to migrate)", _es6_count) - self._write_tabbed( - "es8", - es8_metrics.OsfCountedUsageRecord, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") - _started = self._migration_started_at - _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() - _range_end = _started.date() + datetime.timedelta(days=1) + self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}") for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() - # migrate_counted_usages.delay(_from_str, _until_str) - # migrate_preprint_views.delay(_from_str, _until_str) - # migrate_preprint_downloads.delay(_from_str, _until_str) - migrate_counted_usages(_from_str, _until_str) - migrate_preprint_views(_from_str, _until_str) - migrate_preprint_downloads(_from_str, _until_str) - self.stdout.write("---") - - def _handle_usage_reports(self, *, start: bool): - # display counts of reports and distinct items - _es6_count, _es6_item_count = _es6_usage_report_counts() - _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) - self._write_tabbed( - "es8", - es8_metrics.PublicItemUsageReportEs8, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) - self._write_tabbed( - "es6", es6_reports.PublicItemUsageReport, "(items)", _es6_item_count - ) - self._write_tabbed( - "es8", - es8_metrics.PublicItemUsageReportEs8, - "(items)", - _es8_item_count, - style=self._eq_style(_es8_item_count, _es6_item_count), - ) + migrate_counted_usages.delay(_from_str, _until_str) + migrate_preprint_views.delay(_from_str, _until_str) + migrate_preprint_downloads.delay(_from_str, _until_str) + + def _handle_usage_reports(self, *, start: bool, no_counts: bool): + if not no_counts: + # display counts of reports and distinct items + _es6_count, _es6_item_count = _es6_usage_report_counts() + _es8_count, _es8_item_count = _es8_usage_report_counts() + self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + self._write_tabbed( + "es6", es6_reports.PublicItemUsageReport, "osfid count:", _es6_item_count + ) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + "(items)", + _es8_item_count, + style=self._eq_style(_es8_item_count, _es6_item_count), + ) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts if start: self.stdout.write( - f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" + f"starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}" ) for _osfid in _each_usage_report_osfid( started_at=self._migration_started_at ): - migrate_usage_reports(_osfid) - # TODO: migrate_usage_reports.delay(...) - self.stdout.write("---") + migrate_usage_reports.delay(_osfid) @functools.cached_property def _migration_started_at(self): @@ -757,8 +720,8 @@ def _check_started_at(self, start_now): f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" ) elif start_now: - del self._migration_started_at # clear cache _started_at = es8_metrics.Elastic6To8State.set_started_at_now() + del self._migration_started_at # clear cache self.stdout.write( f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" ) @@ -766,7 +729,6 @@ def _check_started_at(self, start_now): self.stdout.write( "osf.metrics 6->8 migration not started nor starting (run with `--start` to start)" ) - self.stdout.write("---") def _clear_state(self): self.stdout.write( diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 67fee676112..2f4023105d8 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -162,7 +162,7 @@ class RegistriesModerationMetricsEs8(djelme.EventRecord): from_state: str to_state: str user_id: str - comment: str + comment: str | None class Index: settings = { @@ -171,6 +171,9 @@ class Index: 'refresh_interval': '1s', } + class Meta: + timeseries_recordtype_name = 'RegistriesModerationMetrics' + ### # Reusable inner objects for reports @@ -235,6 +238,7 @@ class StorageAddonUsageEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'StorageAddonUsage' class DownloadCountReportEs8(djelme.CyclicRecord): @@ -244,6 +248,7 @@ class DownloadCountReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DownloadCountReport' class InstitutionSummaryReportEs8(djelme.CyclicRecord): @@ -260,6 +265,7 @@ class InstitutionSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'InstitutionSummaryReport' class NewUserDomainReportEs8(djelme.CyclicRecord): @@ -271,6 +277,7 @@ class NewUserDomainReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'NewUserDomainReport' class NodeSummaryReportEs8(djelme.CyclicRecord): @@ -283,6 +290,7 @@ class NodeSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'NodeSummaryReport' class OsfstorageFileCountReportEs8(djelme.CyclicRecord): @@ -292,6 +300,7 @@ class OsfstorageFileCountReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'OsfstorageFileCountReport' class PreprintSummaryReportEs8(djelme.CyclicRecord): @@ -303,6 +312,7 @@ class PreprintSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'PreprintSummaryReport' class UserSummaryReportEs8(djelme.CyclicRecord): @@ -317,6 +327,7 @@ class UserSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'UserSummaryReport' class SpamSummaryReportEs8(djelme.CyclicRecord): @@ -336,6 +347,7 @@ class SpamSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'SpamSummaryReport' class InstitutionalUserReportEs8(djelme.CyclicRecord): @@ -350,7 +362,7 @@ class InstitutionalUserReportEs8(djelme.CyclicRecord): month_last_login = YearmonthField() month_last_active = YearmonthField() account_creation_date = YearmonthField() - orcid_id: str + orcid_id: str | None # counts: public_project_count: int private_project_count: int @@ -362,6 +374,7 @@ class InstitutionalUserReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'InstitutionalUserReport' class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): @@ -382,6 +395,7 @@ class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'InstitutionMonthlySummaryReport' class PublicItemUsageReportEs8(djelme.CyclicRecord): @@ -410,6 +424,7 @@ class PublicItemUsageReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'PublicItemUsageReport' class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): @@ -426,6 +441,7 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'PrivateSpamMetricsReport' ### @@ -440,9 +456,6 @@ class Elastic6To8State(djelme.SimpleRecord): default_factory=lambda: datetime.datetime.now(datetime.UTC), ) - class Index: - name = 'osf_elastic6to8state' - @classmethod def get_by_key(cls, key: str): _response = cls.search().query({'term': {'key': key}})[0].execute() @@ -460,4 +473,5 @@ def get_started_at(cls): @classmethod def set_started_at_now(cls): _record = cls.record(key='started_at') + cls.refresh() return _record.timestamp diff --git a/website/settings/defaults.py b/website/settings/defaults.py index dc69126ca37..2d174472576 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -608,6 +608,7 @@ class CeleryConfig: 'scripts.remove_after_use.merge_notification_subscription_provider_ct', 'scripts.disable_removed_beat_tasks', 'osf.management.commands.delete_withdrawn_or_failed_registration_files', + 'osf.management.commands.migrate_osfmetrics_6to8', ) # Modules that need metrics and release requirements From 95b42e600b11fb2b4f8e51dd6e80b53756575bab Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 12:48:39 -0400 Subject: [PATCH 13/22] wip --- docker-compose.yml | 5 + .../commands/migrate_osfmetrics_6to8.py | 147 +++++++++--------- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 84 insertions(+), 76 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index d771c75797a..04d64c51fda 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,6 +78,11 @@ services: - 9201:9200 volumes: - elasticsearch6_data_vol:/usr/share/elasticsearch/data + healthcheck: + start_period: 15s + test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' + interval: 10s + retries: 30 stdin_open: true elasticsearch8: diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 5ee937e80c0..5ce383b99bc 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -2,7 +2,6 @@ import datetime import functools import logging -from pprint import pprint from django.core.management import call_command from django.core.management.base import BaseCommand @@ -11,7 +10,6 @@ from elasticsearch6 import helpers as es6_helpers from elasticsearch6_dsl.connections import connections as es6_connections from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError -from elasticsearch8.dsl.connections import connections as es8_connections from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me from psycopg2 import OperationalError as PostgresOperationalError @@ -73,10 +71,9 @@ @celery_app.task(**_TASK_KWARGS) -def migrate_unchanged_recordtype(es6_recordtype_name: str): +def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] - _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) _convert_kwargs = ( _convert_unchanged_cyclicrecord_kwargs if issubclass(_es8_recordtype, djel8me.CyclicRecord) @@ -84,7 +81,7 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): ) _each_new = ( _es8_recordtype(**_convert_kwargs(_hit["_source"])) - for _hit in _es6_scan_all(_es6_recordtype) + for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when) ) return _es8_bulk_save(_es8_recordtype, _each_new) @@ -96,8 +93,8 @@ def migrate_counted_usages(from_when: str, until_when: str): _convert_counted_usage(_hit["_source"]) for _hit in _es6_scan_range( CountedUsageEs6, - from_when, - until_when, + from_when=from_when, + until_when=until_when, addl_filter={"exists": {"field": "item_guid"}}, ) ) @@ -110,7 +107,9 @@ def migrate_preprint_views(from_when: str, until_when: str): _action_labels = ["view", "web"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) - for _hit in _es6_scan_range(PreprintView, from_when, until_when) + for _hit in _es6_scan_range( + PreprintView, from_when=from_when, until_when=until_when + ) ) return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @@ -121,23 +120,24 @@ def migrate_preprint_downloads(from_when: str, until_when: str): _action_labels = ["download"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) - for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) + for _hit in _es6_scan_range( + PreprintDownload, from_when=from_when, until_when=until_when + ) ) return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) -def migrate_usage_reports(osfid: str): +def migrate_usage_reports(osfid: str, until_when: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 def _each_new(): # go in sorted order to build cumulative counts # (only a few dozen of these per item; should be fine to sort and load all at once) - _each_hit = _es6_scan_all( + _each_hit = _es6_scan_range( es6_reports.PublicItemUsageReport, - query_body={ - "query": {"term": {"item_osfid": osfid}}, - "sort": "report_yearmonth", - }, + until_when=until_when, + addl_filter={"term": {"item_osfid": osfid}}, + sort="report_yearmonth", ) _prior_report = None for _hit in list(_each_hit): @@ -158,31 +158,6 @@ def _es6_connection(): return es6_connections.get_connection("osfmetrics_es6") -def _es8_connection(): - return es8_connections.get_connection("osfmetrics_es8") - - -def _delete_all(recordtype): - # TODO: REMOVE THIS - recordtype.search().query({"match_all": {}}).delete() - recordtype.refresh() - - -def _delete_all_es8(): - # TODO: REMOVE THIS - for _es8_recordtype in _UNCHANGED_RECORDTYPES.values(): - _delete_all(_es8_recordtype) - _delete_all(es8_metrics.PublicItemUsageReportEs8) - _delete_all(es8_metrics.OsfCountedUsageRecord) - - -def _debug_migrate(each_new): - # TODO: remove this - for _each in each_new: - _each.full_clean() - pprint(_each.to_dict(include_meta=True)) - - def _es8_bulk_save(es8_recordtype, each_new_record): _success_count, _fail_count = es8_recordtype.bulk( each_new_record, @@ -203,24 +178,29 @@ def _date_range( (_from_date, _until_date) = (_until_date, _until_date + step) -def _es6_scan_all(es6_recordtype, query_body=None): - return es6_helpers.scan( - _es6_connection(), - index=es6_recordtype._template_pattern, - query=query_body, - ) - - -def _es6_scan_range(es6_recordtype, from_when: str, until_when: str, addl_filter=None): +def _es6_scan_range( + es6_recordtype, + *, + from_when: str = "", + until_when: str, + addl_filter=None, + sort=None, +): + _timestamp_range = {"lt": until_when} + if from_when: + _timestamp_range["gte"] = from_when _filters = [ - {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + {"range": {"timestamp": _timestamp_range}}, ] if addl_filter: _filters.append(addl_filter) + _query_body = {"query": {"bool": {"filter": _filters}}} + if sort: + _query_body["sort"] = sort return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, - query={"query": {"bool": {"filter": _filters}}}, + query=_query_body, ) @@ -332,7 +312,9 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: timestamp=source["timestamp"], sessionhour_id=source["session_id"], platform_iri=source.get("platform_iri") or website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), source.get("item_type")), + database_iri=_convert_database_iri( + source.get("provider_id"), source.get("item_type") + ), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) @@ -345,7 +327,6 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: provider_id=source.get("provider_id"), user_is_authenticated=source.get("user_is_authenticated"), action_labels=source.get("action_labels"), - # TODO: does this need the PageviewInfo object or is the dictionary fine? pageview_info=source.get("pageview_info"), ) @@ -550,10 +531,10 @@ def _fallback_iri(): return _fallback_iri() -def _each_usage_report_osfid(started_at, after_osfid=None): +def _each_usage_report_osfid(until_when, after_osfid=None): _search = ( es6_reports.PublicItemUsageReport.search() - .filter("range", timestamp={"lt": started_at}) + .filter("range", timestamp={"lt": until_when}) .extra(size=0) ) _search.aggs.bucket( @@ -600,6 +581,10 @@ def add_arguments(self, parser): action="store_true", ) + @functools.cached_property + def _migration_started_at(self): + return es8_metrics.Elastic6To8State.get_started_at() + def handle( self, *, @@ -625,10 +610,13 @@ def handle( self._handle_usage_events(start=start, no_counts=no_counts) if usage_reports or _default_all: self._handle_usage_reports(start=start, no_counts=no_counts) + if not no_counts: + self.stdout.write("(counts may be approximate)") def _handle_unchanged(self, *, start: bool, no_counts: bool): # for each (unchanged) report/event: for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + _assert_field_unchangedness(_es6_cls, _es8_cls) if not no_counts: # display counts _es6_count = _es6_cls.search().count() @@ -641,26 +629,41 @@ def _handle_unchanged(self, *, start: bool, no_counts: bool): style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule task - self.stdout.write(f"starting {_es6_cls.__name__} => {_es8_cls.__name__}") - migrate_unchanged_recordtype.delay(_es6_cls.__name__) + self.stdout.write( + f"starting {_es6_cls.__name__} => {_es8_cls.__name__}" + ) + migrate_unchanged_recordtype.delay( + _es6_cls.__name__, self._migration_started_at.isoformat() + ) def _handle_usage_events(self, *, start: bool, no_counts: bool): # for counted-usage events: - _started = self._migration_started_at + _started = self._migration_started_at or datetime.datetime.now() _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() _range_end = _started.date() + datetime.timedelta(days=1) if not no_counts: # display counts for each view/download event type - _range_q = {"range": {"timestamp": {"gte": _range_start.isoformat(), "lt": _range_end.isoformat()}}} + _range_q = { + "range": { + "timestamp": { + "gte": _range_start.isoformat(), + "lt": _range_end.isoformat(), + } + } + } _es6_pview_count = PreprintView.search().filter(_range_q).count() _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count() _es6_usage_event_count = CountedUsageEs6.search().filter(_range_q).count() - _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + _es6_count = ( + _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + ) _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() self._write_tabbed("es6", PreprintView, _es6_pview_count) self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed("es6", f"(total between {_range_start} and {_range_end})", _es6_count) + self._write_tabbed( + "es6", f"(total between {_range_start} and {_range_end})", _es6_count + ) self._write_tabbed( "es8", es8_metrics.OsfCountedUsageRecord, @@ -668,7 +671,9 @@ def _handle_usage_events(self, *, start: bool, no_counts: bool): style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}") + self.stdout.write( + f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}" + ) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() @@ -689,7 +694,10 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): style=self._eq_style(_es8_count, _es6_count), ) self._write_tabbed( - "es6", es6_reports.PublicItemUsageReport, "osfid count:", _es6_item_count + "es6", + es6_reports.PublicItemUsageReport, + "osfid count:", + _es6_item_count, ) self._write_tabbed( "es8", @@ -705,13 +713,11 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): f"starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}" ) for _osfid in _each_usage_report_osfid( - started_at=self._migration_started_at + until_when=self._migration_started_at ): - migrate_usage_reports.delay(_osfid) - - @functools.cached_property - def _migration_started_at(self): - return es8_metrics.Elastic6To8State.get_started_at() + migrate_usage_reports.delay( + _osfid, self._migration_started_at.isoformat() + ) def _check_started_at(self, start_now): _started_at = self._migration_started_at @@ -736,9 +742,6 @@ def _clear_state(self): ) es8_metrics.Elastic6To8State.search().query({"match_all": {}}).delete() es8_metrics.Elastic6To8State.refresh() - # TODO: REMOVE THIS - self.stdout.write("deleting all migration target data in es8", self.style.ERROR) - _delete_all_es8() def _eq_style(self, num: int, should_be: int): return self.style.SUCCESS if (num == should_be) else self.style.WARNING diff --git a/poetry.lock b/poetry.lock index 7aee4eca49f..1aec6afa426 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "fed3c14f213642284a197ac2933106cdafede25b" -resolved_reference = "fed3c14f213642284a197ac2933106cdafede25b" +reference = "34c7b180e6d595b3374534cd50efb00f5a809582" +resolved_reference = "34c7b180e6d595b3374534cd50efb00f5a809582" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "0f9c547a6309aa915b25f9a7a98e5d0c15c867d577a883547d894ca173cb2344" +content-hash = "9edb43576b960885c14e32e9ae74218c28d883df48679868848dbaa5780c4b12" diff --git a/pyproject.toml b/pyproject.toml index b04e0540d90..815efdd61a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "fed3c14f213642284a197ac2933106cdafede25b"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "34c7b180e6d595b3374534cd50efb00f5a809582"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From bac21a0ba0d2973d5da7fdc1718cd13b5724f15c Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 13:23:15 -0400 Subject: [PATCH 14/22] chore: "fix' quotes --- .../commands/migrate_osfmetrics_6to8.py | 338 +++++++++--------- 1 file changed, 169 insertions(+), 169 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 5ce383b99bc..ccc15834644 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -72,7 +72,7 @@ @celery_app.task(**_TASK_KWARGS) def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): - _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) + _es6_recordtype = djelme_registry.get_recordtype('osf', es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] _convert_kwargs = ( _convert_unchanged_cyclicrecord_kwargs @@ -80,7 +80,7 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): else (lambda _kw: _kw) # no conversion needed for event record ) _each_new = ( - _es8_recordtype(**_convert_kwargs(_hit["_source"])) + _es8_recordtype(**_convert_kwargs(_hit['_source'])) for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when) ) return _es8_bulk_save(_es8_recordtype, _each_new) @@ -90,12 +90,12 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): def migrate_counted_usages(from_when: str, until_when: str): # CountedAuthUsage => OsfCountedUsageRecord _each_new = ( - _convert_counted_usage(_hit["_source"]) + _convert_counted_usage(_hit['_source']) for _hit in _es6_scan_range( CountedUsageEs6, from_when=from_when, until_when=until_when, - addl_filter={"exists": {"field": "item_guid"}}, + addl_filter={'exists': {'field': 'item_guid'}}, ) ) return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @@ -104,9 +104,9 @@ def migrate_counted_usages(from_when: str, until_when: str): @celery_app.task(**_TASK_KWARGS) def migrate_preprint_views(from_when: str, until_when: str): # PreprintView => OsfCountedUsageRecord - _action_labels = ["view", "web"] + _action_labels = ['view', 'web'] _each_new = ( - _convert_preprint_metric(_hit["_source"], _action_labels) + _convert_preprint_metric(_hit['_source'], _action_labels) for _hit in _es6_scan_range( PreprintView, from_when=from_when, until_when=until_when ) @@ -117,9 +117,9 @@ def migrate_preprint_views(from_when: str, until_when: str): @celery_app.task(**_TASK_KWARGS) def migrate_preprint_downloads(from_when: str, until_when: str): # PreprintDownload => OsfCountedUsageRecord - _action_labels = ["download"] + _action_labels = ['download'] _each_new = ( - _convert_preprint_metric(_hit["_source"], _action_labels) + _convert_preprint_metric(_hit['_source'], _action_labels) for _hit in _es6_scan_range( PreprintDownload, from_when=from_when, until_when=until_when ) @@ -136,14 +136,14 @@ def _each_new(): _each_hit = _es6_scan_range( es6_reports.PublicItemUsageReport, until_when=until_when, - addl_filter={"term": {"item_osfid": osfid}}, - sort="report_yearmonth", + addl_filter={'term': {'item_osfid': osfid}}, + sort='report_yearmonth', ) _prior_report = None for _hit in list(_each_hit): yield ( _prior_report := _convert_public_usage_report( - _hit["_source"], _prior_report + _hit['_source'], _prior_report ) ) @@ -155,7 +155,7 @@ def _each_new(): def _es6_connection(): - return es6_connections.get_connection("osfmetrics_es6") + return es6_connections.get_connection('osfmetrics_es6') def _es8_bulk_save(es8_recordtype, each_new_record): @@ -181,22 +181,22 @@ def _date_range( def _es6_scan_range( es6_recordtype, *, - from_when: str = "", + from_when: str = '', until_when: str, addl_filter=None, sort=None, ): - _timestamp_range = {"lt": until_when} + _timestamp_range = {'lt': until_when} if from_when: - _timestamp_range["gte"] = from_when + _timestamp_range['gte'] = from_when _filters = [ - {"range": {"timestamp": _timestamp_range}}, + {'range': {'timestamp': _timestamp_range}}, ] if addl_filter: _filters.append(addl_filter) - _query_body = {"query": {"bool": {"filter": _filters}}} + _query_body = {'query': {'bool': {'filter': _filters}}} if sort: - _query_body["sort"] = sort + _query_body['sort'] = sort return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, @@ -207,16 +207,16 @@ def _es6_scan_range( def _es6_usage_report_counts() -> tuple[int, int]: _search = es6_reports.PublicItemUsageReport.search() _search.aggs.metric( - "agg_item_count", - "cardinality", - field="item_osfid", + 'agg_item_count', + 'cardinality', + field='item_osfid', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total _item_count = ( _response.aggregations.agg_item_count.value - if "agg_item_count" in _response.aggregations + if 'agg_item_count' in _response.aggregations else 0 ) return (_total_count, _item_count) @@ -225,30 +225,30 @@ def _es6_usage_report_counts() -> tuple[int, int]: def _es8_usage_report_counts() -> tuple[int, int]: _search = es8_metrics.PublicItemUsageReportEs8.search() _search.aggs.metric( - "agg_item_count", - "cardinality", - field="item_osfid", + 'agg_item_count', + 'cardinality', + field='item_osfid', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total.value _item_count = ( _response.aggregations.agg_item_count.value - if "agg_item_count" in _response.aggregations + if 'agg_item_count' in _response.aggregations else 0 ) return (_total_count, _item_count) def _get_es6_field_names(es6_recordtype): - """ + ''' adapted from DocumentBase._get_field_names in elasticsearch8.dsl - """ + ''' for _field_name in es6_recordtype._doc_type.mapping: _field = es6_recordtype._doc_type.mapping[_field_name] - if hasattr(_field, "_doc_class"): + if hasattr(_field, '_doc_class'): for _sub_field in _get_es6_field_names(_field._doc_class): - yield f"{_field_name}.{_sub_field}" + yield f'{_field_name}.{_sub_field}' else: yield _field_name @@ -260,20 +260,20 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): # remove fields intentionally removed in migration if issubclass(es6_recordtype, es6_reports.DailyReport): assert issubclass(es8_recordtype, djel8me.CyclicRecord) - _es6_fields.remove("timestamp") - _es6_fields.remove("report_date") + _es6_fields.remove('timestamp') + _es6_fields.remove('report_date') elif issubclass(es6_recordtype, es6_reports.MonthlyReport): assert issubclass(es8_recordtype, djel8me.CyclicRecord) - _es6_fields.remove("timestamp") - _es6_fields.remove("report_yearmonth") + _es6_fields.remove('timestamp') + _es6_fields.remove('report_yearmonth') else: assert issubclass(es8_recordtype, djel8me.EventRecord) # remove fields intentionally added in migration - _es8_fields.remove("timeseries_timeparts") + _es8_fields.remove('timeseries_timeparts') if issubclass(es8_recordtype, djel8me.CyclicRecord): - _es8_fields.remove("created") - _es8_fields.remove("cycle_coverage") + _es8_fields.remove('created') + _es8_fields.remove('cycle_coverage') # all remaining fields should match assert _es6_fields == _es8_fields @@ -281,24 +281,24 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): def _semverish_from_yearmonth(given_yearmonth: str): _ym = YearMonth.from_str(given_yearmonth) - return f"{_ym.year}.{_ym.month}" + return f'{_ym.year}.{_ym.month}' def _semverish_from_date(given_date: str): _d = datetime.date.fromisoformat(given_date) - return f"{_d.year}.{_d.month}.{_d.day}" + return f'{_d.year}.{_d.month}.{_d.day}' def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict: def _each_kwarg(): for _key, _val in es6_source.items(): - if _key == "report_yearmonth": + if _key == 'report_yearmonth': # report_yearmonth converts to cycle_coverage Y.M - yield ("cycle_coverage", _semverish_from_yearmonth(_val)) - elif _key == "report_date": + yield ('cycle_coverage', _semverish_from_yearmonth(_val)) + elif _key == 'report_date': # report_date converts to cycle_coverage Y.M.D - yield ("cycle_coverage", _semverish_from_date(_val)) - elif _key != "timestamp": + yield ('cycle_coverage', _semverish_from_date(_val)) + elif _key != 'timestamp': # skipping timestamp; on daily/monthly reports just copied from yearmonth/date yield (_key, _val) @@ -306,51 +306,51 @@ def _each_kwarg(): def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: - _item_iri = _iri_from_osfid(source["item_guid"]) + _item_iri = _iri_from_osfid(source['item_guid']) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: - timestamp=source["timestamp"], - sessionhour_id=source["session_id"], - platform_iri=source.get("platform_iri") or website_settings.DOMAIN, + timestamp=source['timestamp'], + sessionhour_id=source['session_id'], + platform_iri=source.get('platform_iri') or website_settings.DOMAIN, database_iri=_convert_database_iri( - source.get("provider_id"), source.get("item_type") + source.get('provider_id'), source.get('item_type') ), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) - for _within_osfid in source.get("surrounding_guids", ()) + for _within_osfid in source.get('surrounding_guids', ()) ], # fields from OsfCountedUsageRecord: - item_osfid=source["item_guid"], - item_type=source.get("item_type", "osf:Object"), - item_public=source.get("item_public"), - provider_id=source.get("provider_id"), - user_is_authenticated=source.get("user_is_authenticated"), - action_labels=source.get("action_labels"), - pageview_info=source.get("pageview_info"), + item_osfid=source['item_guid'], + item_type=source.get('item_type', 'osf:Object'), + item_public=source.get('item_public'), + provider_id=source.get('provider_id'), + user_is_authenticated=source.get('user_is_authenticated'), + action_labels=source.get('action_labels'), + pageview_info=source.get('pageview_info'), ) def _convert_preprint_metric( source: dict, action_labels: list[str] ) -> es8_metrics.OsfCountedUsageRecord: - _preprint_iri = _iri_from_osfid(source["preprint_id"]) + _preprint_iri = _iri_from_osfid(source['preprint_id']) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: - timestamp=source["timestamp"], - user_id=source.get("user_id"), + timestamp=source['timestamp'], + user_id=source.get('user_id'), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), "preprint"), + database_iri=_convert_database_iri(source.get('provider_id'), 'preprint'), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: - item_osfid=source["preprint_id"], - item_type="preprint", + item_osfid=source['preprint_id'], + item_type='preprint', item_public=True, - provider_id=source.get("provider_id"), - user_is_authenticated=bool(source.get("user_id")), + provider_id=source.get('provider_id'), + user_is_authenticated=bool(source.get('user_id')), action_labels=action_labels, ) @@ -361,40 +361,40 @@ def _convert_public_usage_report( ) -> es8_metrics.PublicItemUsageReportEs8: if prior_report is None: _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( - osfid=source["item_osfid"], - until_when=YearMonth.from_str(source["report_yearmonth"]).month_end(), - item_type=source.get("item_type"), + osfid=source['item_osfid'], + until_when=YearMonth.from_str(source['report_yearmonth']).month_end(), + item_type=source.get('item_type'), ) else: - _c_views = prior_report.cumulative_view_count + source.get("view_count", 0) + _c_views = prior_report.cumulative_view_count + source.get('view_count', 0) _c_view_sess = prior_report.cumulative_view_session_count + source.get( - "view_session_count", 0 + 'view_session_count', 0 ) _c_downloads = prior_report.cumulative_download_count + source.get( - "download_count", 0 + 'download_count', 0 ) _c_download_sess = prior_report.cumulative_download_session_count + source.get( - "download_session_count", 0 + 'download_session_count', 0 ) return es8_metrics.PublicItemUsageReportEs8( - cycle_coverage=_semverish_from_yearmonth(source["report_yearmonth"]), - item_osfid=source["item_osfid"], - item_type=source.get("item_type"), - provider_id=source.get("provider_id"), - platform_iri=source.get("platform_iri") or website_settings.DOMAIN, - view_count=source.get("view_count"), - view_session_count=source.get("view_session_count"), + cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + item_osfid=source['item_osfid'], + item_type=source.get('item_type'), + provider_id=source.get('provider_id'), + platform_iri=source.get('platform_iri') or website_settings.DOMAIN, + view_count=source.get('view_count'), + view_session_count=source.get('view_session_count'), cumulative_view_count=_c_views, cumulative_view_session_count=_c_view_sess, - download_count=source.get("download_count"), - download_session_count=source.get("download_session_count"), + download_count=source.get('download_count'), + download_session_count=source.get('download_session_count'), cumulative_download_count=_c_downloads, cumulative_download_session_count=_c_download_sess, ) def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): - if item_type == "preprint": + if item_type == 'preprint': _views = _cumulative_preprint_count(PreprintView, osfid, until_when) _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) @@ -407,90 +407,90 @@ def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]: - """compute view_session_count separately to avoid double-counting + '''compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` and that on `surrounding_guids`) - """ + ''' # copied/adapted from osf.metrics.reporters.public_item_usage _search = ( CountedUsageEs6.search() - .filter("term", item_public=True) - .filter("range", timestamp={"lt": until_when}) - .filter("term", action_labels="view") + .filter('term', item_public=True) + .filter('range', timestamp={'lt': until_when}) + .filter('term', action_labels='view') .filter( - "bool", + 'bool', should=[ - {"term": {"item_guid": osfid}}, - {"term": {"surrounding_guids": osfid}}, + {'term': {'item_guid': osfid}}, + {'term': {'surrounding_guids': osfid}}, ], minimum_should_match=1, ) .extra(size=0) # only aggregations, no hits ) _search.aggs.metric( - "agg_session_count", - "cardinality", - field="session_id", + 'agg_session_count', + 'cardinality', + field='session_id', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _view_count = _response.hits.total _view_session_count = ( _response.aggregations.agg_session_count.value - if "agg_session_count" in _response.aggregations + if 'agg_session_count' in _response.aggregations else 0 ) return (_view_count, _view_session_count) def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]: - """aggregate downloads on each osfid (not including components/files)""" + '''aggregate downloads on each osfid (not including components/files)''' # copied/adapted from osf.metrics.reporters.public_item_usage _search = ( CountedUsageEs6.search() - .filter("term", item_public=True) - .filter("range", timestamp={"lt": until_when}) - .filter("term", action_labels="download") - .filter("term", item_guid=osfid) + .filter('term', item_public=True) + .filter('range', timestamp={'lt': until_when}) + .filter('term', action_labels='download') + .filter('term', item_guid=osfid) ) _search.aggs.metric( - "agg_session_count", - "cardinality", - field="session_id", + 'agg_session_count', + 'cardinality', + field='session_id', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _download_count = _response.hits.total _download_session_count = ( _response.aggregations.agg_session_count.value - if "agg_session_count" in _response.aggregations + if 'agg_session_count' in _response.aggregations else 0 ) return (_download_count, _download_session_count) def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int: - """aggregate views on each preprint""" + '''aggregate views on each preprint''' # copied/adapted from osf.metrics.preprint_metrics _search = ( preprint_metric_cls.search() - .filter("term", preprint_id=osfid) - .filter("range", timestamp={"lt": until_when}) + .filter('term', preprint_id=osfid) + .filter('range', timestamp={'lt': until_when}) .extra(size=0) # no hits; only aggs ) - _search.aggs.metric("agg_count", "sum", field="count") + _search.aggs.metric('agg_count', 'sum', field='count') _response = _search.execute() _view_count = ( int(_response.aggregations.agg_count.value) - if hasattr(_response.aggregations, "agg_count") + if hasattr(_response.aggregations, 'agg_count') else 0 ) return _view_count def _iri_from_osfid(osfid: str) -> str: - return f"{website_settings.DOMAIN}{osfid}" + return f'{website_settings.DOMAIN}{osfid}' @functools.lru_cache @@ -499,34 +499,34 @@ def _convert_database_iri(provider_id: str | None, item_type: str) -> str: return website_settings.DOMAIN # osf is a provider, sure why not def _fallback_iri(): - return f"urn:osf.io:{provider_id}" + return f'urn:osf.io:{provider_id}' match item_type: # lower-cased osf.models class names - case "node" | "osfuser": - # implicit "osf" provider + case 'node' | 'osfuser': + # implicit 'osf' provider return website_settings.DOMAIN - case "preprint": + case 'preprint': try: _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) except osfdb.PreprintProvider.DoesNotExist: - _logger.error(f"unknown preprint provider {provider_id!r}") + _logger.error(f'unknown preprint provider {provider_id!r}') return _fallback_iri() else: return _provider.get_semantic_iri() - case "registration": + case 'registration': try: _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) except osfdb.RegistrationProvider.DoesNotExist: - _logger.error(f"unknown registration provider {provider_id!r}") + _logger.error(f'unknown registration provider {provider_id!r}') return _fallback_iri() else: return _provider.get_semantic_iri() - case _ if "file" in item_type: + case _ if 'file' in item_type: # file providers are a different thing that don't really have an iri, just an id - return f"urn:files.osf.io:{provider_id}" + return f'urn:files.osf.io:{provider_id}' case _: # give up gracefully _logger.error( - f"unknown item type {item_type!r} with provider {provider_id!r}" + f'unknown item type {item_type!r} with provider {provider_id!r}' ) return _fallback_iri() @@ -534,16 +534,16 @@ def _fallback_iri(): def _each_usage_report_osfid(until_when, after_osfid=None): _search = ( es6_reports.PublicItemUsageReport.search() - .filter("range", timestamp={"lt": until_when}) + .filter('range', timestamp={'lt': until_when}) .extra(size=0) ) _search.aggs.bucket( - "agg_osfid", - "composite", - sources=[{"osfid": {"terms": {"field": "item_osfid"}}}], + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'item_osfid'}}}], size=500, ) - return _iter_composite_bucket_keys(_search, "agg_osfid", "osfid", after=after_osfid) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) ### @@ -553,32 +553,32 @@ def _each_usage_report_osfid(until_when, after_osfid=None): class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( - "--no-setup", - action="store_true", + '--no-setup', + action='store_true', ) parser.add_argument( - "--no-counts", - action="store_true", + '--no-counts', + action='store_true', ) parser.add_argument( - "--clear-state", - action="store_true", + '--clear-state', + action='store_true', ) parser.add_argument( - "--start", - action="store_true", + '--start', + action='store_true', ) parser.add_argument( - "--unchanged", - action="store_true", + '--unchanged', + action='store_true', ) parser.add_argument( - "--usage-events", - action="store_true", + '--usage-events', + action='store_true', ) parser.add_argument( - "--usage-reports", - action="store_true", + '--usage-reports', + action='store_true', ) @functools.cached_property @@ -599,7 +599,7 @@ def handle( ): self._quiet_chatty_loggers() if not no_setup: - call_command("djelme_backend_setup") + call_command('djelme_backend_setup') if clear_state: self._clear_state() self._check_started_at(start_now=start) @@ -611,7 +611,7 @@ def handle( if usage_reports or _default_all: self._handle_usage_reports(start=start, no_counts=no_counts) if not no_counts: - self.stdout.write("(counts may be approximate)") + self.stdout.write('(counts may be approximate)') def _handle_unchanged(self, *, start: bool, no_counts: bool): # for each (unchanged) report/event: @@ -621,16 +621,16 @@ def _handle_unchanged(self, *, start: bool, no_counts: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - self._write_tabbed("es6", _es6_cls, _es6_count) + self._write_tabbed('es6', _es6_cls, _es6_count) self._write_tabbed( - "es8", + 'es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule task self.stdout.write( - f"starting {_es6_cls.__name__} => {_es8_cls.__name__}" + f'starting {_es6_cls.__name__} => {_es8_cls.__name__}' ) migrate_unchanged_recordtype.delay( _es6_cls.__name__, self._migration_started_at.isoformat() @@ -644,10 +644,10 @@ def _handle_usage_events(self, *, start: bool, no_counts: bool): if not no_counts: # display counts for each view/download event type _range_q = { - "range": { - "timestamp": { - "gte": _range_start.isoformat(), - "lt": _range_end.isoformat(), + 'range': { + 'timestamp': { + 'gte': _range_start.isoformat(), + 'lt': _range_end.isoformat(), } } } @@ -658,21 +658,21 @@ def _handle_usage_events(self, *, start: bool, no_counts: bool): _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count ) _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed("es6", PreprintView, _es6_pview_count) - self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) - self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed('es6', PreprintView, _es6_pview_count) + self._write_tabbed('es6', PreprintDownload, _es6_pdownload_count) + self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) self._write_tabbed( - "es6", f"(total between {_range_start} and {_range_end})", _es6_count + 'es6', f'(total between {_range_start} and {_range_end})', _es6_count ) self._write_tabbed( - "es8", + 'es8', es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule (per-day?) tasks (if --start) self.stdout.write( - f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}" + f'starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}' ) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() @@ -686,23 +686,23 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): # display counts of reports and distinct items _es6_count, _es6_item_count = _es6_usage_report_counts() _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) self._write_tabbed( - "es8", + 'es8', es8_metrics.PublicItemUsageReportEs8, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) self._write_tabbed( - "es6", + 'es6', es6_reports.PublicItemUsageReport, - "osfid count:", + 'osfid count:', _es6_item_count, ) self._write_tabbed( - "es8", + 'es8', es8_metrics.PublicItemUsageReportEs8, - "(items)", + '(items)', _es8_item_count, style=self._eq_style(_es8_item_count, _es6_item_count), ) @@ -710,7 +710,7 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): # each item-task iter thru reports oldest to newest, adding cumulative counts if start: self.stdout.write( - f"starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}" + f'starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}' ) for _osfid in _each_usage_report_osfid( until_when=self._migration_started_at @@ -723,24 +723,24 @@ def _check_started_at(self, start_now): _started_at = self._migration_started_at if _started_at: self.stdout.write( - f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" + f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' ) elif start_now: _started_at = es8_metrics.Elastic6To8State.set_started_at_now() del self._migration_started_at # clear cache self.stdout.write( - f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" + f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' ) else: self.stdout.write( - "osf.metrics 6->8 migration not started nor starting (run with `--start` to start)" + 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' ) def _clear_state(self): self.stdout.write( - "clearing all migration state (start time, etc)", self.style.NOTICE + 'clearing all migration state (start time, etc)', self.style.NOTICE ) - es8_metrics.Elastic6To8State.search().query({"match_all": {}}).delete() + es8_metrics.Elastic6To8State.search().query({'match_all': {}}).delete() es8_metrics.Elastic6To8State.refresh() def _eq_style(self, num: int, should_be: int): @@ -752,13 +752,13 @@ def _to_str(strable): return strable.__name__ return str(strable) - self.stdout.write("\t".join(map(_to_str, strables)), style) + self.stdout.write('\t'.join(map(_to_str, strables)), style) def _quiet_chatty_loggers(self): _chatty_loggers = [ - "elasticsearch", - "elastic_transport", - "elasticsearch_metrics", + 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', ] for logger_name in _chatty_loggers: logging.getLogger(logger_name).setLevel(logging.ERROR) From 999dc869cd5bf0cd558f8cb2f0795e2a504e3427 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 13:43:29 -0400 Subject: [PATCH 15/22] fix: background migration task module --- website/settings/defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 2d174472576..69f82d2d2a7 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -489,7 +489,7 @@ class CeleryConfig: } background_migration_modules = { - 'osf.management.commands.metrics_es8_migration', + 'osf.management.commands.migrate_osfmetrics_6to8', } try: From d9f5380aa7a1556a535b136e564f7c8e61d1fdc3 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 15:08:18 -0400 Subject: [PATCH 16/22] fix: timestamp tz handling --- osf/metrics/es8_metrics.py | 2 +- osf_tests/metrics/test_es8_metrics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 2f4023105d8..3b83103b197 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -133,7 +133,7 @@ def _get_unique_together_values(self): self.timestamp.year, self.timestamp.month, self.timestamp.day, - tzinfo=datetime.UTC, + tzinfo=self.timestamp.tzinfo, ) time_in_seconds = (self.timestamp - day_start).total_seconds() time_window = int(time_in_seconds / 30) # 30-second windows diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index e93579628dc..e9dd140b60a 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -1,4 +1,4 @@ -from datetime import datetime +import datetime from elasticsearch_metrics.tests.util import djelme_test_backends import pytest @@ -20,7 +20,7 @@ def _real_elastic(self): def test_nested_pageview_autofill(self): usage = OsfCountedUsageRecord.record( - timestamp=datetime(2024, 1, 1, 15, 0), + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), sessionhour_id='blah', database_iri='https://osf.example/provider', item_iri='https://osf.example/itemm', From beb85485f6f06df8abdd98a703c3b31e139e0d98 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 15:27:03 -0400 Subject: [PATCH 17/22] fix: tests with djelme --- osf_tests/metrics/test_es8_metrics.py | 41 ++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index e9dd140b60a..ce562a026b4 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -39,13 +39,52 @@ def test_nested_pageview_autofill(self): assert usage.pageview_info.page_path == '/path/test' assert usage.pageview_info.referer_domain == 'google.com' assert usage.pageview_info.hour_of_day == 15 + assert usage.item_iri in usage.within_iris + + def test_nested_pageview_autofill_dict(self): + usage = OsfCountedUsageRecord.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + pageview_info={ + 'page_url': 'https://example.com/path/test', + 'referer_url': 'https://google.com', + 'route_name': 'foo.bar', + 'page_title': 'title title', + }, + ) + assert usage.pageview_info.page_path == '/path/test' + assert usage.pageview_info.referer_domain == 'google.com' + assert usage.pageview_info.hour_of_day == 15 + assert usage.item_iri in usage.within_iris + + def test_none_pageview_nested_autofill(self): + usage = OsfCountedUsageRecord.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + ) + assert usage.pageview_info is None + assert usage.item_iri in usage.within_iris def test_save_report(self): _saved = DownloadCountReportEs8.record( cycle_coverage='2026.1.1', daily_file_downloads=17, ) - DownloadCountReportEs8.refresh_timeseries_indexes() + DownloadCountReportEs8.refresh() _response = DownloadCountReportEs8.search().execute() (_fetched,) = _response assert _fetched.meta.id == _saved.meta.id From 778f4b435627a08a7d5f475a578c6be0d37e5cb2 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 15:48:40 -0400 Subject: [PATCH 18/22] fix: pageview_info optional --- osf/metrics/es8_metrics.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 3b83103b197..4c46710748c 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -57,19 +57,19 @@ class PageviewInfo(esdsl.InnerDoc): """ # fields that should be provided - referer_url: str - page_url: str - page_title: str - route_name: str = esdsl.mapped_field(esdsl.Keyword( + referer_url: str | None + page_url: str | None + page_title: str | None + route_name: str | None = esdsl.mapped_field(esdsl.Keyword( fields={ 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer), }, )) # fields auto-filled - page_path: str - referer_domain: str - hour_of_day: int + page_path: str | None + referer_domain: str | None + hour_of_day: int | None ### @@ -111,7 +111,9 @@ def clean(self): if _ref_url: self.pageview_info.referer_domain = urlsplit(_ref_url).netloc # ensure inclusive "within" - if self.item_iri not in self.within_iris: + if not self.within_iris: + self.within_iris = [self.item_iri] + elif self.item_iri not in self.within_iris: self.within_iris = [self.item_iri, *self.within_iris] def _get_unique_together_values(self): From ee913841430543a12f89c16a1aba3e40bb1e280b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 16:59:53 -0400 Subject: [PATCH 19/22] fix: tests --- osf/management/commands/migrate_osfmetrics_6to8.py | 2 ++ osf_tests/metrics/test_es8_metrics.py | 2 +- poetry.lock | 6 +++--- pyproject.toml | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index ccc15834644..04afa94b6b9 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -2,6 +2,7 @@ import datetime import functools import logging +import uuid from django.core.management import call_command from django.core.management.base import BaseCommand @@ -340,6 +341,7 @@ def _convert_preprint_metric( # fields used to compute a sessionhour_id: timestamp=source['timestamp'], user_id=source.get('user_id'), + client_session_id=str(uuid.uuid4()), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, database_iri=_convert_database_iri(source.get('provider_id'), 'preprint'), diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index ce562a026b4..a871054e96b 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -76,7 +76,7 @@ def test_none_pageview_nested_autofill(self): platform_iri='https://osf.example', user_is_authenticated=False, ) - assert usage.pageview_info is None + assert not usage.pageview_info assert usage.item_iri in usage.within_iris def test_save_report(self): diff --git a/poetry.lock b/poetry.lock index 1aec6afa426..4fcf24cabd1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "34c7b180e6d595b3374534cd50efb00f5a809582" -resolved_reference = "34c7b180e6d595b3374534cd50efb00f5a809582" +reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" +resolved_reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "9edb43576b960885c14e32e9ae74218c28d883df48679868848dbaa5780c4b12" +content-hash = "e510408fd1590e2ec46f022a6004e55df2c813f6e8688d0c6d75308f1dccf43b" diff --git a/pyproject.toml b/pyproject.toml index 815efdd61a6..ade2030afdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "34c7b180e6d595b3374534cd50efb00f5a809582"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "222f03e92ec45a86f76db7a0461ae4fc483b2810"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From a65d6a580159eabd5cf6fdae8ab89b5d9ade5cfb Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 17:28:49 -0400 Subject: [PATCH 20/22] fix: preprint metric conversion --- osf/management/commands/migrate_osfmetrics_6to8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 04afa94b6b9..92b01e913c3 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -339,7 +339,7 @@ def _convert_preprint_metric( return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: - timestamp=source['timestamp'], + timestamp=datetime.datetime.fromisoformat(source['timestamp']), user_id=source.get('user_id'), client_session_id=str(uuid.uuid4()), # fields from djelme.CountedUsageRecord: From 2059a5657e8d60312da3b9a1a99d2fe129dfc5be Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 17:28:58 -0400 Subject: [PATCH 21/22] fix: osf_shell --- osf/management/commands/osf_shell.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/osf/management/commands/osf_shell.py b/osf/management/commands/osf_shell.py index 851895623ac..69443d004be 100644 --- a/osf/management/commands/osf_shell.py +++ b/osf/management/commands/osf_shell.py @@ -32,7 +32,7 @@ def get_user_imports(): from django.db.models import Model from django_extensions.management.commands import shell_plus from django_extensions.management.utils import signalcommand -from elasticsearch_metrics.registry import registry as metrics_registry +from elasticsearch_metrics.registry import djelme_registry def header(text): @@ -160,7 +160,7 @@ def get_osf_imports(self): def get_metrics(self): return { each.__name__: each - for each in metrics_registry.get_metrics() + for each in djelme_registry.each_recordtype() } def get_grouped_imports(self, options): From c186373defd8b8bb732b1410fe320bb6c9553236 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 22 Apr 2026 15:58:00 -0400 Subject: [PATCH 22/22] per-deployment djelme index name prefix --- api/base/settings/defaults.py | 3 +++ poetry.lock | 6 +++--- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 72e169c25a1..8f3683b6115 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -325,6 +325,7 @@ }, 'osfmetrics_es8': { 'elasticsearch_metrics.imps.elastic8': { + # passthru kwargs to elasticsearch8 connection constructor 'hosts': osf_settings.ELASTIC8_URI, 'ca_certs': osf_settings.ELASTIC8_CERT_PATH, 'basic_auth': ( @@ -332,6 +333,8 @@ if osf_settings.ELASTIC8_SECRET is not None else None ), + # djelme-specific kwargs + 'djelme_default_index_name_prefix': osf_settings.SHARE_PROVIDER_PREPEND, }, }, } diff --git a/poetry.lock b/poetry.lock index 4fcf24cabd1..c16b7d021e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" -resolved_reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" +reference = "4e833670178beb682bb0d64e4f33db012cf8f014" +resolved_reference = "4e833670178beb682bb0d64e4f33db012cf8f014" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "e510408fd1590e2ec46f022a6004e55df2c813f6e8688d0c6d75308f1dccf43b" +content-hash = "d08b71fd886f9c6bd3d8d6cb1eda9f08431b7e84398b107e25f0371a4111266b" diff --git a/pyproject.toml b/pyproject.toml index ade2030afdd..fcc0decc86d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "222f03e92ec45a86f76db7a0461ae4fc483b2810"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "4e833670178beb682bb0d64e4f33db012cf8f014"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1"