Skip to content

Commit 626b62e

Browse files
felliottsh-andriy
andauthored
[ENG-10054] feature/ror-migration (#11610)
* feat(osf): script to migrate Crossref Funder IDs to ROR IDs * feat(osf): Fix for the script to migrate Crossref Funder IDs to ROR IDs * feat(osf): Update OSF metadata model code and tests for ROR funder identifier support * feat(osf): Add DataCite client tests for ROR funder identifier support * feat(osf): update migration script to remove unmapped crossref funders * add another stat to the migration script --------- Co-authored-by: Andriy Sheredko <sheredko.andriy@gmail.com>
1 parent c9801c1 commit 626b62e

File tree

13 files changed

+962
-34
lines changed

13 files changed

+962
-34
lines changed
Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Management command to migrate Crossref Funder IDs to ROR IDs.
4+
5+
This script reads a CSV mapping file and updates all GuidMetadataRecord entries
6+
that have funding_info with Crossref Funder IDs, converting them to ROR IDs.
7+
8+
Usage:
9+
# Dry run (recommended first)
10+
python manage.py migrate_funder_ids_to_ror --csv-file /path/to/mapping.csv --dry-run
11+
12+
# Actual migration
13+
python manage.py migrate_funder_ids_to_ror --csv-file /path/to/mapping.csv
14+
15+
CSV Format Expected (tab or comma separated):
16+
Funder Name, ror ID, ROR name, Crossref DOI, Funder ID
17+
Example:
18+
National Science Foundation, https://ror.org/021nxhr62, National Science Foundation, http://dx.doi.org/10.13039/100000001, 100000001
19+
"""
20+
import csv
21+
import logging
22+
import re
23+
24+
from django.core.management.base import BaseCommand
25+
from django.db import transaction
26+
27+
from osf.models import GuidMetadataRecord
28+
29+
30+
logger = logging.getLogger(__name__)
31+
32+
33+
class Command(BaseCommand):
34+
help = 'Migrate Crossref Funder IDs to ROR IDs in GuidMetadataRecord.funding_info'
35+
36+
def add_arguments(self, parser):
    """Register the command-line options understood by this command.

    Options, in registration order: ``--csv-file`` (required path),
    ``--dry-run``, ``--batch-size`` (int, default 1000),
    ``--update-funder-name`` and ``--skip-reindex``.
    """
    # Declared as data so each flag sits next to its settings and help text;
    # the loop below registers them in exactly this order.
    option_specs = (
        ('--csv-file', {
            'type': str,
            'required': True,
            'help': 'Path to the CSV file containing the Crossref to ROR mapping.',
        }),
        ('--dry-run', {
            'action': 'store_true',
            'dest': 'dry_run',
            'help': 'Run without making any changes to the database.',
        }),
        ('--batch-size', {
            'type': int,
            'default': 1000,
            'help': 'Number of records to process in each batch (default: 1000).',
        }),
        ('--update-funder-name', {
            'action': 'store_true',
            'dest': 'update_funder_name',
            'help': 'Also update funder_name to the ROR name from the mapping.',
        }),
        ('--skip-reindex', {
            'action': 'store_true',
            'dest': 'skip_reindex',
            'help': 'Skip triggering SHARE/DataCite re-indexing after migration. '
                    'Use this if you plan to run recatalog_metadata separately.',
        }),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
68+
69+
def handle(self, *args, **options):
    """Run the migration and print a summary.

    Reads the parsed options, loads the Crossref-to-ROR mapping via
    ``self.load_mapping``, hands it to ``self.migrate_records`` and writes
    the resulting counters to ``self.stdout``. Stops early, after an error
    message, when the mapping is empty or could not be loaded.
    """
    write = self.stdout.write
    style = self.style

    csv_file = options['csv_file']
    dry_run = options['dry_run']
    batch_size = options['batch_size']
    update_funder_name = options['update_funder_name']
    reindex = not options['skip_reindex']

    if dry_run:
        write(style.WARNING('[DRY RUN] No changes will be made to the database.'))
    if not reindex:
        write(style.WARNING('Re-indexing is disabled. Run recatalog_metadata after migration.'))

    mapping = self.load_mapping(csv_file)
    if not mapping:
        # Covers both "file unreadable" (None) and "no usable rows" ({}).
        write(style.ERROR('No valid mappings found in CSV file.'))
        return

    write(f'Loaded {len(mapping)} Crossref to ROR mappings.')

    stats = self.migrate_records(mapping, dry_run, batch_size, update_funder_name, reindex)

    write('\n' + '=' * 60)
    write(style.SUCCESS('Migration Summary:'))
    summary_lines = (
        f" Records scanned: {stats['scanned']}",
        f" Records updated: {stats['updated']}",
        f" Records re-indexed: {stats['reindexed']}",
        f" Funders migrated: {stats['funders_migrated']}",
        f" Unmapped funders removed: {stats['not_in_mapping']}",
        f" Unique funders not in mapping: {len(stats['unmapped_ids'])}",
    )
    for line in summary_lines:
        write(line)
    if stats['errors']:
        write(style.ERROR(f" Errors: {stats['errors']}"))

    unmapped = stats['unmapped_ids']
    if unmapped:
        write('\nUnmapped Crossref Funder IDs (not in CSV):')
        # Only the first 50 (sorted) IDs are listed; the rest are counted.
        for funder_id in sorted(unmapped)[:50]:
            write(f' - {funder_id}')
        overflow = len(unmapped) - 50
        if overflow > 0:
            write(f' ... and {overflow} more')
111+
112+
def load_mapping(self, csv_file):
    """Load the Crossref-to-ROR mapping from a comma- or tab-separated file.

    Columns are located by case-insensitive header matching: a header
    containing both "ror" and "id" supplies the ROR ID, "ror" plus "name"
    the ROR name, "crossref" plus "doi" the Crossref DOI, and
    "funder id" / "funder_id" the bare numeric funder ID. Rows without a
    ROR ID are skipped.

    Each usable row is registered under every spelling of its Crossref
    identifier so later lookups can match however the ID was stored:

        {
            '100000001': {'ror_id': 'https://ror.org/021nxhr62',
                          'ror_name': 'National Science Foundation'},
            'http://dx.doi.org/10.13039/100000001': {...},
            'https://doi.org/10.13039/100000001': {...},
            '10.13039/100000001': {...},
            ...
        }

    When a row has no funder ID cell, the numeric ID is derived from the
    Crossref DOI so the bare-number and alternate-URL spellings are still
    registered.

    Args:
        csv_file: Path to the mapping file.

    Returns:
        The mapping dict (possibly empty), or ``None`` if the file is
        missing or cannot be read; in that case an error line is written
        to ``self.stdout``.
    """
    mapping = {}

    try:
        # newline='' lets the csv module handle line endings itself, as its
        # documentation recommends; utf-8-sig drops a leading BOM if present.
        with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
            # Crude delimiter sniffing: any tab in the first 2 KiB means TSV.
            sample = f.read(2048)
            f.seek(0)
            delimiter = '\t' if '\t' in sample else ','

            for row in csv.DictReader(f, delimiter=delimiter):
                ror_id = None
                ror_name = None
                crossref_doi = None
                funder_id = None

                for key, value in row.items():
                    # DictReader files surplus cells under a None key.
                    if not key:
                        continue
                    key_lower = key.lower().strip()
                    cleaned = value.strip() if value else None

                    if 'ror' in key_lower and 'id' in key_lower and 'ror_name' not in key_lower:
                        ror_id = cleaned
                    elif 'ror' in key_lower and 'name' in key_lower:
                        ror_name = cleaned
                    elif 'crossref' in key_lower and 'doi' in key_lower:
                        crossref_doi = cleaned
                    elif key_lower == 'funder id' or key_lower == 'funder_id':
                        funder_id = cleaned

                if not ror_id:
                    continue

                # One shared dict per row; every alias key points at it.
                ror_info = {
                    'ror_id': ror_id,
                    'ror_name': ror_name,
                }

                # Without a funder ID cell, recover the number from the DOI;
                # otherwise only the literal DOI string would be matchable.
                if not funder_id and crossref_doi:
                    doi_match = re.search(r'10\.13039/(\d+)', crossref_doi)
                    if doi_match:
                        funder_id = doi_match.group(1)

                if funder_id:
                    mapping[funder_id] = ror_info
                    mapping[f'http://dx.doi.org/10.13039/{funder_id}'] = ror_info
                    mapping[f'https://doi.org/10.13039/{funder_id}'] = ror_info
                    mapping[f'10.13039/{funder_id}'] = ror_info

                if crossref_doi:
                    mapping[crossref_doi] = ror_info
                    # Register the same URL under the other scheme as well.
                    if crossref_doi.startswith('http://'):
                        mapping[crossref_doi.replace('http://', 'https://')] = ror_info
                    elif crossref_doi.startswith('https://'):
                        mapping[crossref_doi.replace('https://', 'http://')] = ror_info

    except FileNotFoundError:
        self.stdout.write(self.style.ERROR(f'CSV file not found: {csv_file}'))
        return None
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported to the
        # operator and signalled with None rather than a traceback.
        self.stdout.write(self.style.ERROR(f'Error reading CSV file: {e}'))
        return None

    return mapping
191+
192+
def extract_funder_id(self, identifier):
    """Return the bare numeric Crossref funder ID found in ``identifier``.

    Accepts a plain number ("100000001") or any string containing a
    Crossref funder DOI ("http://dx.doi.org/10.13039/100000001").
    Surrounding whitespace is ignored, and non-string values (for example
    a number) are converted with ``str()`` before matching.

    Args:
        identifier: The stored funder identifier, in any of the forms above.

    Returns:
        The digits of the funder ID as a string; ``None`` when
        ``identifier`` is empty or blank; otherwise the stripped identifier
        unchanged when no funder ID can be recognised in it.
    """
    if not identifier:
        return None

    identifier = str(identifier).strip()
    if not identifier:
        return None

    # fullmatch, not match + '$': '$' would also accept a trailing newline.
    if re.fullmatch(r'\d+', identifier):
        return identifier

    # Pull the number out of a funder DOI, with or without a URL prefix.
    match = re.search(r'10\.13039/(\d+)', identifier)
    if match:
        return match.group(1)

    return identifier
207+
208+
def migrate_records(self, mapping, dry_run, batch_size, update_funder_name, reindex):
    """Scan every GuidMetadataRecord with funding_info and migrate its funders.

    Each record is passed to ``self.migrate_record``; a failure on one
    record is logged with its traceback, counted, and does not stop the run.
    Records that changed are re-indexed via ``self.reindex_record`` when
    ``reindex`` is true and this is not a dry run; a re-index failure is
    logged but not added to the ``errors`` counter.

    Args:
        mapping: Crossref identifier -> ROR info dict, as built by
            ``load_mapping``.
        dry_run: When true, nothing is saved and nothing is re-indexed.
        batch_size: ``chunk_size`` passed to the queryset iterator.
        update_funder_name: Forwarded to ``migrate_record``.
        reindex: Whether to re-index records that were updated.

    Returns:
        A dict with the counters ``scanned``, ``updated``, ``reindexed``,
        ``funders_migrated``, ``not_in_mapping`` and ``errors``, plus
        ``unmapped_ids``, the set of identifiers with no mapping entry.
    """
    stats = {
        'scanned': 0,
        'updated': 0,
        'reindexed': 0,
        'funders_migrated': 0,
        'not_in_mapping': 0,
        'errors': 0,
        'unmapped_ids': set(),
    }

    # Every record with non-empty funding_info is scanned; the per-funder
    # identifier type is checked later, in migrate_record.
    queryset = GuidMetadataRecord.objects.exclude(funding_info=[]).exclude(funding_info__isnull=True)

    total_count = queryset.count()
    self.stdout.write(f'Found {total_count} records with funding_info to scan.')

    for record in queryset.iterator(chunk_size=batch_size):
        stats['scanned'] += 1

        if stats['scanned'] % 500 == 0:
            self.stdout.write(f' Processed {stats["scanned"]}/{total_count} records...')

        try:
            updated, funder_stats = self.migrate_record(record, mapping, dry_run, update_funder_name)
        except Exception as e:
            stats['errors'] += 1
            # logger.exception keeps the traceback, which a plain
            # logger.error with the message alone would discard.
            logger.exception('Error migrating record %s: %s', record.guid._id, e)
            continue

        if updated:
            stats['updated'] += 1
            if reindex and not dry_run:
                try:
                    self.reindex_record(record)
                except Exception as e:
                    logger.exception('Error re-indexing record %s: %s', record.guid._id, e)
                else:
                    stats['reindexed'] += 1

        stats['funders_migrated'] += funder_stats['migrated']
        stats['not_in_mapping'] += funder_stats['not_found']
        stats['unmapped_ids'].update(funder_stats['unmapped_ids'])

    return stats
253+
254+
def migrate_record(self, record, mapping, dry_run, update_funder_name):
    """Rewrite one record's funding_info from Crossref funder IDs to ROR IDs.

    Funders whose type is neither 'Crossref Funder ID' nor
    'Crossref Funder URI' are kept untouched. A Crossref funder found in
    ``mapping`` (by exact identifier, then by extracted numeric ID) is
    replaced with a copy carrying the ROR ID and type 'ROR', and optionally
    the ROR name. A Crossref funder with no mapping entry is dropped.
    Duplicate ROR IDs in the result are only logged as a warning.

    Args:
        record: Object exposing ``funding_info``, ``guid._id`` and ``save``.
        mapping: Crossref identifier -> ROR info dict.
        dry_run: When true, the record is not saved.
        update_funder_name: When true, overwrite ``funder_name`` with the
            mapped ROR name if one is available.

    Returns:
        ``(was_modified, funder_stats)`` where ``funder_stats`` has the keys
        ``migrated``, ``not_found`` and ``unmapped_ids``.
    """
    funder_stats = {'migrated': 0, 'not_found': 0, 'unmapped_ids': set()}

    if not record.funding_info:
        return False, funder_stats

    log_prefix = '[DRY RUN] ' if dry_run else ''
    # Includes the legacy 'Crossref Funder URI' type.
    crossref_types = ('Crossref Funder ID', 'Crossref Funder URI')
    kept_funders = []
    changed = False

    for entry in record.funding_info:
        id_type = entry.get('funder_identifier_type', '')
        raw_id = entry.get('funder_identifier', '')

        if id_type not in crossref_types:
            kept_funders.append(entry)
            continue

        # Exact spelling first, then fall back to the bare numeric ID.
        if raw_id in mapping:
            ror_entry = mapping[raw_id]
        else:
            bare_id = self.extract_funder_id(raw_id)
            ror_entry = mapping[bare_id] if bare_id and bare_id in mapping else None

        if not ror_entry:
            # No mapping: the funder is dropped from the record.
            changed = True
            funder_stats['not_found'] += 1
            funder_stats['unmapped_ids'].add(raw_id)
            logger.warning(
                f'{log_prefix}Removing unmapped Crossref Funder ID: {raw_id} '
                f'from record {record.guid._id}'
            )
            continue

        migrated = {
            **entry,
            'funder_identifier': ror_entry['ror_id'],
            'funder_identifier_type': 'ROR',
        }
        if update_funder_name and ror_entry.get('ror_name'):
            migrated['funder_name'] = ror_entry['ror_name']

        kept_funders.append(migrated)
        changed = True
        funder_stats['migrated'] += 1
        logger.info(
            f'{log_prefix}Migrating funder in {record.guid._id}: '
            f'{raw_id} -> {ror_entry["ror_id"]}'
        )

    if changed:
        # Several Crossref IDs may map to one ROR ID; report, don't merge.
        seen_rors = set()
        duplicates = set()
        for item in kept_funders:
            if item.get('funder_identifier_type') != 'ROR':
                continue
            ror = item['funder_identifier']
            if ror in seen_rors:
                duplicates.add(ror)
            else:
                seen_rors.add(ror)
        if duplicates:
            logger.warning(
                f'Record {record.guid._id} has duplicate ROR IDs after migration: {duplicates}'
            )

        if not dry_run:
            with transaction.atomic():
                record.funding_info = kept_funders
                record.save(update_fields=['funding_info'])

    return changed, funder_stats
342+
343+
def reindex_record(self, record):
    """Ask the record's referent to refresh its search and DOI metadata.

    Calls ``update_search()`` and then ``request_identifier_update('doi')``
    on ``record.guid.referent``, each only if the referent has that method.
    (Per the original author's note these drive SHARE/ElasticSearch and
    DataCite re-indexing; that is not visible from this method itself.)
    """
    target = record.guid.referent
    optional_hooks = (
        ('update_search', ()),
        ('request_identifier_update', ('doi',)),
    )
    for hook_name, hook_args in optional_hooks:
        if hasattr(target, hook_name):
            getattr(target, hook_name)(*hook_args)

osf/metadata/schemas/datacite.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,8 +470,12 @@
470470
"ISNI",
471471
"GRID",
472472
"Crossref Funder ID",
473+
"ROR",
473474
"Other"
474475
]
476+
},
477+
"schemeURI": {
478+
"$ref": "#/definitions/uri"
475479
}
476480
},
477481
"additionalProperties": false,

osf/metadata/serializers/datacite/datacite_tree_walker.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -195,12 +195,12 @@ def _identifier_type_and_value(self, identifier: str):
195195
return ('URL', identifier)
196196
logger.warning('skipping non-IRI-shaped identifier "%s"', identifier)
197197

198-
def _funder_identifier_type(self, identifier: str):
198+
def _funder_identifier_type_and_scheme(self, identifier: str):
199199
if identifier.startswith(DxDOI) or identifier.startswith(DOI):
200-
return 'Crossref Funder ID'
200+
return ('Crossref Funder ID', 'https://www.crossref.org/services/funder-registry/')
201201
if identifier.startswith(ROR):
202-
return 'ROR'
203-
return 'Other'
202+
return ('ROR', str(ROR))
203+
return ('Other', '')
204204

205205
def _get_name_type(self, agent_iri):
206206
if (agent_iri, RDF.type, FOAF.Person) in self.basket:
@@ -312,13 +312,15 @@ def _funding_reference(self, fundrefs_el, funder, funding_award=None):
312312
_fundref_el = self.visit(fundrefs_el, 'fundingReference')
313313
self.visit(_fundref_el, 'funderName', text=next(self.basket[funder:FOAF.name], ''))
314314
_funder_identifier = next(self.basket[funder:DCTERMS.identifier], '')
315+
_funder_id_type, _funder_scheme_uri = self._funder_identifier_type_and_scheme(_funder_identifier)
316+
_funder_id_attrib = {'funderIdentifierType': _funder_id_type}
317+
if _funder_scheme_uri:
318+
_funder_id_attrib['schemeURI'] = _funder_scheme_uri
315319
self.visit(
316320
_fundref_el,
317321
'funderIdentifier',
318322
text=_funder_identifier,
319-
attrib={
320-
'funderIdentifierType': self._funder_identifier_type(_funder_identifier),
321-
},
323+
attrib=_funder_id_attrib,
322324
)
323325
if funding_award is not None:
324326
self.visit(

0 commit comments

Comments
 (0)