Skip to content

Commit 9e400c2

Browse files
committed
feat(osf): Test / profile resyncing of Datacite on staging
1 parent 50016ab commit 9e400c2

File tree

2 files changed

+427
-0
lines changed

2 files changed

+427
-0
lines changed
Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Management command to verify and profile DataCite metadata resync after ROR migration.
4+
5+
Finds GuidMetadataRecords with ROR funder identifiers, builds DataCite metadata for each,
6+
validates the output, and reports timing/performance metrics.
7+
8+
Usage:
9+
# Verify metadata builds correctly (no DataCite API calls)
10+
python manage.py verify_datacite_ror_resync
11+
12+
# Verify with a sample of N records
13+
python manage.py verify_datacite_ror_resync --sample 50
14+
15+
# Actually resync with DataCite (triggers API calls)
16+
python manage.py verify_datacite_ror_resync --resync
17+
18+
# Profile metadata build times only
19+
python manage.py verify_datacite_ror_resync --profile-only
20+
"""
21+
import logging
22+
import time
23+
24+
import lxml.etree
25+
from django.core.management.base import BaseCommand
26+
from osf.models import GuidMetadataRecord
27+
28+
# Module-level logger (currently unused inside the command; kept for parity
# with other management commands — TODO confirm before removing).
logger = logging.getLogger(__name__)

# DataCite Metadata Schema 4.x XML namespace; every namespaced element lookup
# in _validate_ror_funders is qualified with this URI.
DATACITE_NS = 'http://datacite.org/schema/kernel-4'
31+
32+
33+
class Command(BaseCommand):
    """Verify and profile DataCite metadata resync for ROR-funded records.

    Finds GuidMetadataRecords whose ``funding_info`` contains at least one
    funder with a ROR identifier, builds DataCite XML metadata for each,
    validates that the ROR funders appear correctly in the output, and
    reports timing metrics. With ``--resync`` it also triggers real DataCite
    API updates (and, unless ``--skip-reindex``, a search reindex).
    """

    help = 'Verify and profile DataCite metadata resync for records with ROR funder identifiers.'

    def add_arguments(self, parser):
        """Register command-line options."""
        parser.add_argument(
            '--sample',
            type=int,
            default=0,
            help='Limit to N records (0 = all, default: 0).',
        )
        parser.add_argument(
            '--resync',
            action='store_true',
            help='Actually trigger DataCite resync (API calls). Without this flag, only builds and validates metadata.',
        )
        parser.add_argument(
            '--profile-only',
            action='store_true',
            dest='profile_only',
            help='Only report timing metrics, skip detailed validation output.',
        )
        parser.add_argument(
            '--skip-reindex',
            action='store_true',
            dest='skip_reindex',
            help='When --resync is used, skip SHARE/ElasticSearch reindexing (only resync DataCite).',
        )

    def handle(self, *args, **options):
        """Entry point: gather ROR records, build/validate metadata, optionally resync."""
        sample = options['sample']
        resync = options['resync']
        profile_only = options['profile_only']
        skip_reindex = options['skip_reindex']

        if resync:
            self.stdout.write(self.style.WARNING(
                'RESYNC MODE: Will trigger DataCite API calls for records with DOIs.'
            ))

        ror_record_ids = self._find_ror_record_pks()
        total = len(ror_record_ids)
        self.stdout.write(f'Found {total} records with ROR funder identifiers.')

        if sample:
            ror_record_ids = ror_record_ids[:sample]
            self.stdout.write(f'Sampling {sample} records.')

        # BUGFIX: the progress denominator must reflect the (possibly sampled)
        # working set; previously it always showed the full population count,
        # so with --sample the "N/M" progress line was misleading.
        to_process = len(ror_record_ids)

        queryset = GuidMetadataRecord.objects.filter(
            pk__in=ror_record_ids,
        ).select_related('guid')

        stats = {
            'total': 0,
            'build_success': 0,
            'build_errors': 0,
            'validation_errors': 0,
            'has_doi': 0,
            'resynced': 0,
            'resync_errors': 0,
            'reindexed': 0,
            'build_times': [],
            'resync_times': [],
            'error_guids': [],
        }

        for record in queryset.iterator(chunk_size=100):
            stats['total'] += 1

            if stats['total'] % 100 == 0:
                self.stdout.write(f' Processing {stats["total"]}/{to_process}...')

            guid_id = record.guid._id
            referent = record.guid.referent

            # Build metadata, timing both the success and the failure path.
            build_start = time.monotonic()
            try:
                metadata_xml = self._build_metadata(referent)
                build_elapsed = time.monotonic() - build_start
                stats['build_times'].append(build_elapsed)
                # NOTE: a referent without a DOI client yields None and is
                # still counted as a (trivially) successful build.
                stats['build_success'] += 1
            except Exception as e:
                build_elapsed = time.monotonic() - build_start
                stats['build_times'].append(build_elapsed)
                stats['build_errors'] += 1
                stats['error_guids'].append((guid_id, f'build: {e}'))
                if not profile_only:
                    self.stdout.write(self.style.ERROR(
                        f' [{guid_id}] Build error: {e}'
                    ))
                continue

            # Validate ROR funders in the output. Skipped in --profile-only
            # mode, so the summary's validation count is only meaningful
            # when that flag is not set.
            if metadata_xml and not profile_only:
                validation_issues = self._validate_ror_funders(metadata_xml, record, guid_id)
                if validation_issues:
                    stats['validation_errors'] += 1
                    for issue in validation_issues:
                        self.stdout.write(self.style.WARNING(f' [{guid_id}] {issue}'))

            # Only referents that already have a DOI can be resynced.
            has_doi = bool(
                hasattr(referent, 'get_identifier_value')
                and referent.get_identifier_value('doi')
            )
            if has_doi:
                stats['has_doi'] += 1

            # Resync if requested (real DataCite API call).
            if resync and has_doi:
                resync_start = time.monotonic()
                try:
                    referent.request_identifier_update('doi')
                    resync_elapsed = time.monotonic() - resync_start
                    stats['resync_times'].append(resync_elapsed)
                    stats['resynced'] += 1

                    if not skip_reindex and hasattr(referent, 'update_search'):
                        referent.update_search()
                        stats['reindexed'] += 1
                except Exception as e:
                    resync_elapsed = time.monotonic() - resync_start
                    stats['resync_times'].append(resync_elapsed)
                    stats['resync_errors'] += 1
                    stats['error_guids'].append((guid_id, f'resync: {e}'))
                    if not profile_only:
                        self.stdout.write(self.style.ERROR(
                            f' [{guid_id}] Resync error: {e}'
                        ))

        self._print_summary(stats, resync)

    def _find_ror_record_pks(self):
        """Return pks of GuidMetadataRecords with at least one ROR funder.

        The ROR check is done in Python rather than SQL because JSONField
        ``contains`` lookups with dicts have adapter issues.
        """
        queryset = (
            GuidMetadataRecord.objects
            .exclude(funding_info=[])
            .exclude(funding_info__isnull=True)
            .select_related('guid')
        )
        return [
            record.pk
            for record in queryset.iterator(chunk_size=500)
            if any(
                funder.get('funder_identifier_type') == 'ROR'
                for funder in record.funding_info
            )
        ]

    def _build_metadata(self, referent):
        """Build DataCite XML metadata for a referent; returns bytes or None.

        Returns None when the referent has no DOI client (or lacks the
        ``get_doi_client`` accessor entirely).
        """
        client = getattr(referent, 'get_doi_client', lambda: None)()
        if not client:
            return None
        return client.build_metadata(referent)

    def _validate_ror_funders(self, metadata_xml, record, guid_id):
        """Validate that ROR funders from funding_info appear in the DataCite XML.

        Returns a list of human-readable issue strings (empty if all good).
        A parse failure is itself reported as a single issue.
        """
        issues = []

        try:
            parser = lxml.etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
            root = lxml.etree.fromstring(metadata_xml, parser=parser)
        except Exception as e:
            return [f'XML parse error: {e}']

        # Collect every funderIdentifier present in the XML.
        ns = DATACITE_NS
        funding_refs_el = root.find(f'{{{ns}}}fundingReferences')
        xml_funders = []
        if funding_refs_el is not None:
            for ref_el in funding_refs_el.findall(f'{{{ns}}}fundingReference'):
                funder_id_el = ref_el.find(f'{{{ns}}}funderIdentifier')
                if funder_id_el is not None:
                    xml_funders.append({
                        'identifier': funder_id_el.text or '',
                        'type': funder_id_el.attrib.get('funderIdentifierType', ''),
                        'schemeURI': funder_id_el.attrib.get('schemeURI', ''),
                    })

        # Check that each ROR funder in funding_info appears in the XML with
        # the expected identifier type and scheme URI.
        for funder in record.funding_info:
            if funder.get('funder_identifier_type') != 'ROR':
                continue
            ror_id = funder.get('funder_identifier', '')
            matching = [f for f in xml_funders if f['identifier'] == ror_id]
            if not matching:
                issues.append(f'ROR funder {ror_id} not found in DataCite XML')
            else:
                m = matching[0]
                if m['type'] != 'ROR':
                    issues.append(
                        f'ROR funder {ror_id} has wrong type: {m["type"]}'
                    )
                if m['schemeURI'] != 'https://ror.org/':
                    issues.append(
                        f'ROR funder {ror_id} missing/wrong schemeURI: {m["schemeURI"]}'
                    )

        return issues

    def _write_timing_stats(self, heading, times, percentiles):
        """Print total/mean/min/max plus requested percentiles for *times*.

        ``percentiles`` is a list of ``(label, fraction)`` pairs, e.g.
        ``('P95', 0.95)``. Shared by the build and resync timing sections.
        """
        self.stdout.write(f'\n {heading}:')
        self.stdout.write(f" Total: {sum(times):.2f}s")
        self.stdout.write(f" Mean: {sum(times) / len(times):.3f}s")
        self.stdout.write(f" Min: {min(times):.3f}s")
        self.stdout.write(f" Max: {max(times):.3f}s")
        ordered = sorted(times)
        for label, fraction in percentiles:
            # int() floors, so the index is always < len(ordered) for fraction < 1;
            # for 0.5 this equals len // 2, matching the original P50.
            value = ordered[int(len(ordered) * fraction)]
            self.stdout.write(f" {label}: {value:.3f}s")

    def _print_summary(self, stats, resync):
        """Print final counters, timing stats, and up to 20 recorded errors."""
        self.stdout.write('\n' + '=' * 60)
        self.stdout.write(self.style.SUCCESS('DataCite ROR Resync Verification Summary:'))
        self.stdout.write(f" Records processed: {stats['total']}")
        self.stdout.write(f" Metadata build success: {stats['build_success']}")
        self.stdout.write(f" Metadata build errors: {stats['build_errors']}")
        self.stdout.write(f" Validation issues: {stats['validation_errors']}")
        self.stdout.write(f" Records with DOI: {stats['has_doi']}")

        if resync:
            self.stdout.write(f" Records resynced: {stats['resynced']}")
            self.stdout.write(f" Records reindexed: {stats['reindexed']}")
            self.stdout.write(f" Resync errors: {stats['resync_errors']}")

        # Timing stats (percentile logic deduplicated into _write_timing_stats).
        if stats['build_times']:
            self._write_timing_stats(
                'Build Performance',
                stats['build_times'],
                [('P50', 0.5), ('P95', 0.95), ('P99', 0.99)],
            )

        if stats['resync_times']:
            self._write_timing_stats(
                'Resync Performance (DataCite API)',
                stats['resync_times'],
                [('P50', 0.5), ('P95', 0.95)],
            )

        if stats['error_guids']:
            self.stdout.write(f"\n Errors ({len(stats['error_guids'])}):")
            for guid_id, error in stats['error_guids'][:20]:
                self.stdout.write(f' {guid_id}: {error}')
            if len(stats['error_guids']) > 20:
                self.stdout.write(f' ... and {len(stats["error_guids"]) - 20} more')

0 commit comments

Comments
 (0)