|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Management command to verify and profile DataCite metadata resync after ROR migration. |
| 4 | +
|
| 5 | +Finds GuidMetadataRecords with ROR funder identifiers, builds DataCite metadata for each, |
| 6 | +validates the output, and reports timing/performance metrics. |
| 7 | +
|
| 8 | +Usage: |
| 9 | + # Verify metadata builds correctly (no DataCite API calls) |
| 10 | + python manage.py verify_datacite_ror_resync |
| 11 | +
|
| 12 | + # Verify with a sample of N records |
| 13 | + python manage.py verify_datacite_ror_resync --sample 50 |
| 14 | +
|
| 15 | + # Actually resync with DataCite (triggers API calls) |
| 16 | + python manage.py verify_datacite_ror_resync --resync |
| 17 | +
|
| 18 | + # Profile metadata build times only |
| 19 | + python manage.py verify_datacite_ror_resync --profile-only |
| 20 | +""" |
| 21 | +import logging |
| 22 | +import time |
| 23 | + |
| 24 | +import lxml.etree |
| 25 | +from django.core.management.base import BaseCommand |
| 26 | +from osf.models import GuidMetadataRecord |
| 27 | + |
| 28 | +logger = logging.getLogger(__name__) |
| 29 | + |
| 30 | +DATACITE_NS = 'http://datacite.org/schema/kernel-4' |
| 31 | + |
| 32 | + |
class Command(BaseCommand):
    """Verify and profile DataCite metadata resync after the ROR migration.

    Scans GuidMetadataRecords whose ``funding_info`` contains at least one
    funder with ``funder_identifier_type == 'ROR'``, builds DataCite XML for
    each record's referent, validates that the ROR funders survive into the
    XML, and reports build/resync timing statistics.  DataCite API calls are
    made only when ``--resync`` is passed.
    """

    help = 'Verify and profile DataCite metadata resync for records with ROR funder identifiers.'

    def add_arguments(self, parser):
        parser.add_argument(
            '--sample',
            type=int,
            default=0,
            help='Limit to N records (0 = all, default: 0).',
        )
        parser.add_argument(
            '--resync',
            action='store_true',
            help='Actually trigger DataCite resync (API calls). Without this flag, only builds and validates metadata.',
        )
        parser.add_argument(
            '--profile-only',
            action='store_true',
            dest='profile_only',
            help='Only report timing metrics, skip detailed validation output.',
        )
        parser.add_argument(
            '--skip-reindex',
            action='store_true',
            dest='skip_reindex',
            help='When --resync is used, skip SHARE/ElasticSearch reindexing (only resync DataCite).',
        )

    def handle(self, *args, **options):
        """Find ROR-funded records, build/validate metadata, optionally resync, then report."""
        sample = options['sample']
        resync = options['resync']
        profile_only = options['profile_only']
        skip_reindex = options['skip_reindex']

        if resync:
            self.stdout.write(self.style.WARNING(
                'RESYNC MODE: Will trigger DataCite API calls for records with DOIs.'
            ))

        # Find records with non-empty funding_info that contain ROR funders
        # (filter in Python since JSONField `contains` with dicts has adapter issues)
        queryset = (
            GuidMetadataRecord.objects
            .exclude(funding_info=[])
            .exclude(funding_info__isnull=True)
            .select_related('guid')
        )

        # Pre-filter to only records with at least one ROR funder
        ror_record_ids = [
            record.pk
            for record in queryset.iterator(chunk_size=500)
            if any(
                funder.get('funder_identifier_type') == 'ROR'
                for funder in record.funding_info
            )
        ]

        total = len(ror_record_ids)
        self.stdout.write(f'Found {total} records with ROR funder identifiers.')

        if sample:
            ror_record_ids = ror_record_ids[:sample]
            # Report how many we actually kept — may be fewer than --sample.
            self.stdout.write(f'Sampling {len(ror_record_ids)} records.')

        queryset = GuidMetadataRecord.objects.filter(
            pk__in=ror_record_ids,
        ).select_related('guid')

        stats = {
            'total': 0,
            'build_success': 0,
            'build_errors': 0,
            'validation_errors': 0,
            'has_doi': 0,
            'resynced': 0,
            'resync_errors': 0,
            'reindexed': 0,
            'build_times': [],
            'resync_times': [],
            'error_guids': [],
        }

        for record in queryset.iterator(chunk_size=100):
            stats['total'] += 1

            # Periodic progress line so long runs are visibly alive.
            if stats['total'] % 100 == 0:
                self.stdout.write(f'  Processing {stats["total"]}/{total}...')

            guid_id = record.guid._id
            referent = record.guid.referent

            # Build metadata; timing is recorded for successes AND failures
            # so the percentiles reflect real wall-clock cost per record.
            build_start = time.monotonic()
            try:
                metadata_xml = self._build_metadata(referent)
                build_elapsed = time.monotonic() - build_start
                stats['build_times'].append(build_elapsed)
                stats['build_success'] += 1
            except Exception as e:
                build_elapsed = time.monotonic() - build_start
                stats['build_times'].append(build_elapsed)
                stats['build_errors'] += 1
                stats['error_guids'].append((guid_id, f'build: {e}'))
                if not profile_only:
                    self.stdout.write(self.style.ERROR(
                        f'  [{guid_id}] Build error: {e}'
                    ))
                continue

            # Validate ROR funders in output (skipped in --profile-only mode)
            if metadata_xml and not profile_only:
                validation_issues = self._validate_ror_funders(metadata_xml, record, guid_id)
                if validation_issues:
                    stats['validation_errors'] += 1
                    for issue in validation_issues:
                        self.stdout.write(self.style.WARNING(f'  [{guid_id}] {issue}'))

            # Only referents with a minted DOI can be resynced with DataCite.
            has_doi = bool(
                hasattr(referent, 'get_identifier_value')
                and referent.get_identifier_value('doi')
            )
            if has_doi:
                stats['has_doi'] += 1

            # Resync if requested (triggers external DataCite API calls)
            if resync and has_doi:
                resync_start = time.monotonic()
                try:
                    referent.request_identifier_update('doi')
                    resync_elapsed = time.monotonic() - resync_start
                    stats['resync_times'].append(resync_elapsed)
                    stats['resynced'] += 1

                    if not skip_reindex and hasattr(referent, 'update_search'):
                        referent.update_search()
                        stats['reindexed'] += 1
                except Exception as e:
                    resync_elapsed = time.monotonic() - resync_start
                    stats['resync_times'].append(resync_elapsed)
                    stats['resync_errors'] += 1
                    stats['error_guids'].append((guid_id, f'resync: {e}'))
                    if not profile_only:
                        self.stdout.write(self.style.ERROR(
                            f'  [{guid_id}] Resync error: {e}'
                        ))

        self._print_summary(stats, resync)

    def _build_metadata(self, referent):
        """Build DataCite XML metadata for a referent, returns bytes or None.

        Returns None when the referent has no DOI client configured (the
        record is then still counted as a build success, but validation is
        skipped for it).
        """
        client = getattr(referent, 'get_doi_client', lambda: None)()
        if not client:
            return None
        return client.build_metadata(referent)

    def _validate_ror_funders(self, metadata_xml, record, guid_id):
        """Validate that ROR funders from funding_info appear correctly in DataCite XML.

        Returns a list of human-readable issue strings (empty if all ROR
        funders round-tripped with the right type and schemeURI).
        """
        issues = []

        try:
            # recover=True: tolerate minor XML damage rather than aborting the run.
            parser = lxml.etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
            root = lxml.etree.fromstring(metadata_xml, parser=parser)
        except Exception as e:
            return [f'XML parse error: {e}']

        # Collect every funderIdentifier from <fundingReferences> in the XML.
        ns = DATACITE_NS
        funding_refs_el = root.find(f'{{{ns}}}fundingReferences')
        xml_funders = []
        if funding_refs_el is not None:
            for ref_el in funding_refs_el.findall(f'{{{ns}}}fundingReference'):
                funder_id_el = ref_el.find(f'{{{ns}}}funderIdentifier')
                if funder_id_el is not None:
                    xml_funders.append({
                        'identifier': funder_id_el.text or '',
                        'type': funder_id_el.attrib.get('funderIdentifierType', ''),
                        'schemeURI': funder_id_el.attrib.get('schemeURI', ''),
                    })

        # Check each ROR funder in funding_info appears in XML
        for funder in record.funding_info:
            if funder.get('funder_identifier_type') != 'ROR':
                continue
            ror_id = funder.get('funder_identifier', '')
            matching = [f for f in xml_funders if f['identifier'] == ror_id]
            if not matching:
                issues.append(f'ROR funder {ror_id} not found in DataCite XML')
            else:
                m = matching[0]
                if m['type'] != 'ROR':
                    issues.append(
                        f'ROR funder {ror_id} has wrong type: {m["type"]}'
                    )
                if m['schemeURI'] != 'https://ror.org/':
                    issues.append(
                        f'ROR funder {ror_id} missing/wrong schemeURI: {m["schemeURI"]}'
                    )

        return issues

    @staticmethod
    def _percentile(sorted_values, quantile):
        """Return the value at *quantile* (0-1) of an ascending-sorted non-empty list."""
        # Clamp so quantile=1.0 (or float rounding) can never index past the end.
        index = min(int(len(sorted_values) * quantile), len(sorted_values) - 1)
        return sorted_values[index]

    def _print_timing(self, title, times):
        """Print total/mean/min/max and P50/P95/P99 for a list of durations (no-op if empty)."""
        if not times:
            return
        self.stdout.write(f'\n  {title}:')
        self.stdout.write(f'    Total: {sum(times):.2f}s')
        self.stdout.write(f'    Mean: {sum(times) / len(times):.3f}s')
        self.stdout.write(f'    Min: {min(times):.3f}s')
        self.stdout.write(f'    Max: {max(times):.3f}s')
        ordered = sorted(times)
        self.stdout.write(f'    P50: {self._percentile(ordered, 0.5):.3f}s')
        self.stdout.write(f'    P95: {self._percentile(ordered, 0.95):.3f}s')
        self.stdout.write(f'    P99: {self._percentile(ordered, 0.99):.3f}s')

    def _print_summary(self, stats, resync):
        """Write the final report: counts, timing percentiles, and first 20 errors."""
        self.stdout.write('\n' + '=' * 60)
        self.stdout.write(self.style.SUCCESS('DataCite ROR Resync Verification Summary:'))
        self.stdout.write(f"  Records processed: {stats['total']}")
        self.stdout.write(f"  Metadata build success: {stats['build_success']}")
        self.stdout.write(f"  Metadata build errors: {stats['build_errors']}")
        self.stdout.write(f"  Validation issues: {stats['validation_errors']}")
        self.stdout.write(f"  Records with DOI: {stats['has_doi']}")

        if resync:
            self.stdout.write(f"  Records resynced: {stats['resynced']}")
            self.stdout.write(f"  Records reindexed: {stats['reindexed']}")
            self.stdout.write(f"  Resync errors: {stats['resync_errors']}")

        # Both timing sections share one formatter so they report identical
        # metrics (the resync section previously omitted P99).
        self._print_timing('Build Performance', stats['build_times'])
        self._print_timing('Resync Performance (DataCite API)', stats['resync_times'])

        if stats['error_guids']:
            self.stdout.write(f"\n  Errors ({len(stats['error_guids'])}):")
            for guid_id, error in stats['error_guids'][:20]:
                self.stdout.write(f'    {guid_id}: {error}')
            if len(stats['error_guids']) > 20:
                self.stdout.write(f'    ... and {len(stats["error_guids"]) - 20} more')
0 commit comments