Skip to content

Commit 0a4e198

Browse files
authored
Merge pull request #11615 from felliott/hotfix/ror-migration-names
[ENG-10131] add command to migrate ror funder names
2 parents 194fa0c + 32fe971 commit 0a4e198

1 file changed

Lines changed: 307 additions & 0 deletions

File tree

Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Management command to migrate ROR funders to use ROR funder names.
4+
5+
This script reads a CSV mapping file and updates all GuidMetadataRecord entries
6+
that have funding_info with ROR funder IDs, converting them to ROR IDs.
7+
8+
This has similar functionality to migrate_funder_ids_to_ror.py but is useful if
9+
someone that definitely doesn't have the github id felliott forgot to include
10+
name migrations when running the prior script. It's also useful for generally
11+
updating a bunch of ROR funder names.
12+
13+
Usage:
14+
# Dry run (recommended first)
15+
python manage.py migrate_funder_names_to_ror --csv-file /path/to/mapping.csv --dry-run
16+
17+
# Actual migration
18+
python manage.py migrate_funder_names_to_ror --csv-file /path/to/mapping.csv
19+
20+
CSV Format Expected (tab or comma separated):
21+
Funder Name, ror ID, ROR name, Crossref DOI, Funder ID
22+
Example:
23+
National Science Foundation, https://ror.org/021nxhr62, National Science Foundation, http://dx.doi.org/10.13039/100000001, 100000001
24+
25+
Only the "ror id" and "ror name" columns are used. The others may be omitted.
26+
27+
"""
28+
import csv
29+
import logging
30+
31+
from django.core.management.base import BaseCommand
32+
from django.db import transaction
33+
34+
from osf.models import GuidMetadataRecord
35+
36+
37+
logger = logging.getLogger(__name__)
38+
39+
40+
class Command(BaseCommand):
    """Update ROR funder *names* in GuidMetadataRecord.funding_info.

    Reads a CSV mapping of ROR ids to canonical ROR names and rewrites the
    ``funder_name`` of every ROR-identified funder entry whose stored name
    differs from the mapping. Optionally triggers SHARE/DataCite re-indexing
    for each updated record.
    """

    # NOTE: previous help text was copied from migrate_funder_ids_to_ror and
    # wrongly described id migration; this command only updates funder names.
    help = 'Update ROR funder names in GuidMetadataRecord.funding_info from a ROR id-to-name CSV mapping'

    def add_arguments(self, parser):
        """Register CLI options: --csv-file, --dry-run, --batch-size, --skip-reindex."""
        parser.add_argument(
            '--csv-file',
            type=str,
            required=True,
            help='Path to the CSV file containing the Crossref to ROR mapping.',
        )
        parser.add_argument(
            '--dry-run',
            action='store_true',
            dest='dry_run',
            help='Run without making any changes to the database.',
        )
        parser.add_argument(
            '--batch-size',
            type=int,
            default=1000,
            help='Number of records to process in each batch (default: 1000).',
        )
        parser.add_argument(
            '--skip-reindex',
            action='store_true',
            dest='skip_reindex',
            help='Skip triggering SHARE/DataCite re-indexing after migration. '
                 'Use this if you plan to run recatalog_metadata separately.',
        )

    def handle(self, *args, **options):
        """Entry point: load the CSV mapping, migrate records, print a summary."""
        csv_file = options['csv_file']
        dry_run = options['dry_run']
        batch_size = options['batch_size']
        reindex = not options['skip_reindex']

        if dry_run:
            self.stdout.write(self.style.WARNING('[DRY RUN] No changes will be made to the database.'))

        if not reindex:
            self.stdout.write(self.style.WARNING('Re-indexing is disabled. Run recatalog_metadata after migration.'))

        # Load the mapping; bail out early when the CSV is missing or empty.
        mapping = self.load_mapping(csv_file)
        if not mapping:
            self.stdout.write(self.style.ERROR('No valid mappings found in CSV file.'))
            return

        self.stdout.write(f'Loaded {len(mapping)} ROR id to name mappings.')

        # Find and update records
        stats = self.migrate_records(mapping, dry_run, batch_size, reindex)

        # Print summary
        self.stdout.write('\n' + '=' * 60)
        self.stdout.write(self.style.SUCCESS('Migration Summary:'))
        self.stdout.write(f" Records scanned: {stats['scanned']}")
        self.stdout.write(f" Records updated: {stats['updated']}")
        self.stdout.write(f" Records re-indexed: {stats['reindexed']}")
        self.stdout.write(f" Funder names updated: {stats['funders_migrated']}")
        # BUGFIX: label said "Unmapped funders removed", but unmapped funders
        # are kept unchanged, never removed — reworded to match behavior.
        self.stdout.write(f" Funders not in mapping: {stats['not_in_mapping']}")
        self.stdout.write(f" Unique funders not in mapping: {len(stats['unmapped_ids'])}")
        if stats['errors']:
            self.stdout.write(self.style.ERROR(f" Errors: {stats['errors']}"))

        if stats['unmapped_ids']:
            self.stdout.write('\nUnmapped ROR Funder IDs (not in CSV):')
            for funder_id in sorted(stats['unmapped_ids'])[:50]:  # Show first 50
                self.stdout.write(f' - {funder_id}')
            if len(stats['unmapped_ids']) > 50:
                self.stdout.write(f' ... and {len(stats["unmapped_ids"]) - 50} more')

    def load_mapping(self, csv_file):
        """Load the ROR ID to ROR info mapping from CSV file.

        Only the "ror id" and "ror name" columns are used; other columns in
        the CSV are ignored. Returns None when the file cannot be read.

        Returns a dict mapping ROR IDs to ROR info:
        {
            'https://ror.org/021nxhr62': {
                'ror_id': 'https://ror.org/021nxhr62',
                'ror_name': 'National Science Foundation'
            },
            ...
        }
        """
        mapping = {}

        try:
            # utf-8-sig strips a BOM if the CSV came from Excel.
            with open(csv_file, 'r', encoding='utf-8-sig') as f:
                # Try to detect delimiter: assume tab-separated when a tab
                # appears in the first 2KB, otherwise comma-separated.
                sample = f.read(2048)
                f.seek(0)
                if '\t' in sample:
                    delimiter = '\t'
                else:
                    delimiter = ','

                reader = csv.DictReader(f, delimiter=delimiter)

                # Normalize column names (handle various formats)
                for row in reader:
                    # Try to find the relevant columns
                    ror_id = None
                    ror_name = None

                    for key, value in row.items():
                        if not key:
                            continue
                        key_lower = key.lower().strip()

                        # Match headers loosely, e.g. "ror ID", "ROR name".
                        if 'ror' in key_lower and 'id' in key_lower and 'ror_name' not in key_lower:
                            ror_id = value.strip() if value else None
                        elif 'ror' in key_lower and 'name' in key_lower:
                            ror_name = value.strip() if value else None

                    # Rows without a ROR id are unusable — skip them.
                    if not ror_id:
                        continue

                    ror_info = {
                        'ror_id': ror_id,
                        'ror_name': ror_name,
                    }

                    # Add mappings for various ID formats
                    mapping[ror_id] = ror_info

        except FileNotFoundError:
            self.stdout.write(self.style.ERROR(f'CSV file not found: {csv_file}'))
            return None
        except Exception as e:
            # Broad catch is deliberate: a malformed CSV should abort the
            # command with a readable message, not a traceback.
            self.stdout.write(self.style.ERROR(f'Error reading CSV file: {e}'))
            return None

        return mapping

    def migrate_records(self, mapping, dry_run, batch_size, reindex):
        """Scan all GuidMetadataRecords with funding_info and update ROR funder names.

        Returns a stats dict: scanned/updated/reindexed/funders_migrated/
        not_in_mapping/errors counters plus the set of unmapped ROR ids.
        """
        stats = {
            'scanned': 0,
            'updated': 0,
            'reindexed': 0,
            'funders_migrated': 0,
            'not_in_mapping': 0,
            'errors': 0,
            'unmapped_ids': set(),
        }

        # Query records that have non-empty funding_info; per-funder type
        # filtering ('ROR') happens in migrate_record.
        queryset = GuidMetadataRecord.objects.exclude(funding_info=[]).exclude(funding_info__isnull=True)

        total_count = queryset.count()
        self.stdout.write(f'Found {total_count} records with funding_info to scan.')

        processed = 0
        # iterator(chunk_size=...) keeps memory bounded on large tables.
        for record in queryset.iterator(chunk_size=batch_size):
            stats['scanned'] += 1
            processed += 1

            if processed % 500 == 0:
                self.stdout.write(f' Processed {processed}/{total_count} records...')

            try:
                updated, funder_stats = self.migrate_record(record, mapping, dry_run)
                if updated:
                    stats['updated'] += 1
                    if reindex and not dry_run:
                        try:
                            self.reindex_record(record)
                            stats['reindexed'] += 1
                        except Exception as e:
                            # Re-index failures shouldn't abort the migration;
                            # recatalog_metadata can be run afterwards.
                            logger.error(f'Error re-indexing record {record.guid._id}: {e}')
                stats['funders_migrated'] += funder_stats['migrated']
                stats['not_in_mapping'] += funder_stats['not_found']
                stats['unmapped_ids'].update(funder_stats['unmapped_ids'])
            except Exception as e:
                stats['errors'] += 1
                logger.error(f'Error migrating record {record.guid._id}: {e}')

        return stats

    def migrate_record(self, record, mapping, dry_run):
        """Migrate a single GuidMetadataRecord's funding_info.

        Returns (was_updated, funder_stats) where funder_stats counts
        migrated/not-found funders and collects unmapped ROR ids.
        """
        funder_stats = {
            'migrated': 0,
            'not_found': 0,
            'unmapped_ids': set(),
        }

        if not record.funding_info:
            return False, funder_stats

        updated_funding_info = []
        record_modified = False

        for funder in record.funding_info:
            funder_type = funder.get('funder_identifier_type', '')
            funder_identifier = funder.get('funder_identifier', '')

            # Only update ROR funder records
            if funder_type != 'ROR':
                updated_funding_info.append(funder)
                continue

            # Try to find in mapping
            ror_info = mapping.get(funder_identifier, None)
            if ror_info is None:
                # BUGFIX: these two counters were never updated, so the
                # "not in mapping" summary and unmapped-id report were
                # always empty even when unmapped funders existed.
                funder_stats['not_found'] += 1
                funder_stats['unmapped_ids'].add(funder_identifier)
                logger.info(
                    f'{"[DRY RUN] " if dry_run else ""}'
                    f'Unrecognized ror id for {record.guid._id}: '
                    f'{funder_identifier}'
                )
                updated_funding_info.append(funder)
                continue

            # Has name changed?
            if funder.get('funder_name') == ror_info['ror_name']:
                logger.info(
                    f'{"[DRY RUN] " if dry_run else ""}'
                    f'ROR name unchanged for {record.guid._id}: '
                    f'{funder_identifier} -> {funder.get("funder_name")}'
                )
                updated_funding_info.append(funder)
                continue

            # Create updated funder entry
            logger.info(
                f'{"[DRY RUN] " if dry_run else ""}'
                f'Updating name for {record.guid._id}: '
                # .get() here: a missing funder_name must not crash the log line.
                f'id {funder_identifier} from {funder.get("funder_name")} to {ror_info["ror_name"]}'
            )
            updated_funder = funder.copy()
            updated_funder['funder_name'] = ror_info['ror_name']
            updated_funding_info.append(updated_funder)
            record_modified = True
            funder_stats['migrated'] += 1

        # Warn about duplicate ROR IDs that would result from migration
        # THIS SHOULDN'T HAPPEN
        if record_modified:
            ror_identifiers = [
                f['funder_identifier']
                for f in updated_funding_info
                if f.get('funder_identifier_type') == 'ROR'
            ]
            seen = set()
            duplicates = set()
            for rid in ror_identifiers:
                if rid in seen:
                    duplicates.add(rid)
                seen.add(rid)
            if duplicates:
                logger.warning(
                    f'Record {record.guid._id} has duplicate ROR IDs after migration: {duplicates}'
                )

        if record_modified and not dry_run:
            with transaction.atomic():
                record.funding_info = updated_funding_info
                record.save(update_fields=['funding_info'])

        return record_modified, funder_stats

    def reindex_record(self, record):
        """Trigger SHARE/ElasticSearch and DataCite re-indexing for the record's referent."""
        referent = record.guid.referent
        # hasattr guards: not every referent type supports both hooks.
        if hasattr(referent, 'update_search'):
            referent.update_search()
        if hasattr(referent, 'request_identifier_update'):
            referent.request_identifier_update('doi')

0 commit comments

Comments
 (0)