Skip to content

Commit e1d4931

Browse files
committed
Add daily Celery beat task and unversioned DOI tracking for preprint DOI resync
1 parent 3ac96f3 commit e1d4931

File tree

5 files changed

+180
-5
lines changed

5 files changed

+180
-5
lines changed

api/crossref/views.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,20 @@ def post(self, request):
4949
if record.get('status').lower() == 'success' and doi:
5050
msg = record.find('msg').text
5151
created = bool(msg == 'Successfully added')
52+
# Unversioned DOIs (no _vN suffix, e.g. 10.31233/osf.io/tnaqp) are routing
53+
# aliases that always resolve to the latest version via OSF's GUID routing.
54+
# Store them as 'doi_unversioned' on the v1 preprint so we can track which
55+
# preprint series have had their unversioned DOI registered.
5256
_, version = Guid.split_guid(guid) if guid else (None, None)
5357
if not version:
54-
logger.info(f'Unversioned DOI confirmed by CrossRef (no identifier update needed): {doi}')
58+
logger.info(f'Unversioned DOI confirmed by CrossRef: {doi}')
59+
if created and guid:
60+
v1_preprint = Preprint.objects.filter(
61+
versioned_guids__guid___id=guid,
62+
versioned_guids__version=1,
63+
).first()
64+
if v1_preprint:
65+
v1_preprint.set_identifier_value(category='doi_unversioned', value=doi)
5566
dois_processed += 1
5667
continue
5768

@@ -74,9 +85,14 @@ def post(self, request):
7485
if 'Relation target DOI does not exist' in record.find('msg').text:
7586
logger.warning('Related publication DOI does not exist, sending metadata again without it...')
7687
mint_doi_on_crossref_fail.apply_async(kwargs={'preprint_id': preprint._id})
77-
# This error occurs when a single preprint is being updated several times in a row with the same metadata [#PLAT-944]
78-
elif 'less or equal to previously submitted version' in record.find('msg').text and record_count == 2:
79-
break
88+
# This error occurs when a single preprint is being updated several times in a row
89+
# with the same metadata [#PLAT-944]. Previously this broke out of the loop when
90+
# record_count == 2 (single DOI submitted twice). Now batches legitimately contain
91+
# 2 records (versioned + unversioned DOI), so we continue instead of break to allow
92+
# the remaining record to be processed.
93+
elif 'less or equal to previously submitted version' in record.find('msg').text:
94+
dois_processed += 1
95+
continue
8096
else:
8197
unexpected_errors = True
8298
logger.info(f'Creation success email received from CrossRef for preprints: {guids}')

api_tests/crossref/views/test_crossref_email_response.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,3 +263,48 @@ def test_unversioned_doi_confirmation_skips_identifier_update(self, app, url, pr
263263

264264
preprint.reload()
265265
assert preprint.get_identifier_value('doi') == versioned_doi
266+
assert preprint.get_identifier_value('doi_unversioned') == unversioned_doi
267+
268+
def test_unversioned_doi_confirmation_update_does_not_store_doi_unversioned(self, app, url, preprint):
    """A 'Successfully updated' confirmation must not create a 'doi_unversioned' identifier.

    The CrossRef response reports both the versioned and the unversioned DOI
    as updated (not added), so after the callback the preprint keeps its
    versioned 'doi' and still has no 'doi_unversioned' value.
    """
    doi_prefix = preprint.provider.doi_prefix

    # Start from a preprint that already carries its versioned DOI.
    versioned_doi = settings.DOI_FORMAT.format(prefix=doi_prefix, guid=preprint._id)
    preprint.set_identifier_value(category='doi', value=versioned_doi)

    # The unversioned DOI is built from the base guid instead of preprint._id.
    unversioned_doi = settings.DOI_FORMAT.format(
        prefix=doi_prefix,
        guid=preprint.get_guid()._id,
    )

    update_confirmation_xml = f"""
    <?xml version="1.0" encoding="UTF-8"?>
    <doi_batch_diagnostic status="completed" sp="cs3.crossref.org">
        <submission_id>1390676000</submission_id>
        <batch_id>{preprint._id}</batch_id>
        <record_diagnostic status="Success">
            <doi>{versioned_doi}</doi>
            <msg>Successfully updated</msg>
        </record_diagnostic>
        <record_diagnostic status="Success">
            <doi>{unversioned_doi}</doi>
            <msg>Successfully updated</msg>
        </record_diagnostic>
        <batch_data>
            <record_count>2</record_count>
            <success_count>2</success_count>
            <warning_count>0</warning_count>
            <failure_count>0</failure_count>
        </batch_data>
    </doi_batch_diagnostic>
    """

    payload = self.make_mailgun_payload(crossref_response=update_confirmation_xml)
    # NOTE(review): expect_none=True presumably asserts that no notification is sent -- confirm.
    with capture_notifications(expect_none=True):
        app.post(url, payload)

    preprint.reload()
    assert preprint.get_identifier_value('doi') == versioned_doi
    assert preprint.get_identifier_value('doi_unversioned') is None

osf/management/commands/resync_preprint_dois_v1.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from django.core.management.base import BaseCommand
66
from django.db.models import Q
77

8+
from framework.celery_tasks import app
89
from osf.models import Preprint, Identifier
910
from osf.models.base import VersionedGuidMixin
1011
from osf.management.commands.sync_doi_metadata import async_request_identifier_update
@@ -101,6 +102,114 @@ def resync_preprint_dois_v1(dry_run=True, batch_size=500, rate_limit=100, provid
101102
)
102103

103104

105+
def get_preprints_needing_unversioned_doi(provider_id=None):
    """Return version-1 preprints that have no 'doi_unversioned' identifier yet.

    A preprint is selected when all of the following hold:

    * it is version 1 (``versioned_guids__version=1``);
    * it owns a non-deleted 'doi' identifier whose value contains
      ``VersionedGuidMixin.GUID_VERSION_DELIMITER``;
    * it owns no non-deleted 'doi_unversioned' identifier;
    * it is either published, public and not deleted, or it has a withdrawal
      date and was public at some point (``ever_public``).

    Preprints carrying a system tag named 'qatest' are left out.

    :param provider_id: optional provider ``_id``; when truthy, only preprints
        of that provider are returned.
    :return: a distinct ``Preprint`` queryset with ``provider`` select-related.
    """
    content_type = ContentType.objects.get_for_model(Preprint)

    already_has_unversioned = Identifier.objects.filter(
        content_type=content_type,
        category='doi_unversioned',
        deleted__isnull=True,
    ).values_list('object_id', flat=True)

    has_versioned_doi = Identifier.objects.filter(
        content_type=content_type,
        category='doi',
        deleted__isnull=True,
        value__contains=VersionedGuidMixin.GUID_VERSION_DELIMITER,
    ).values_list('object_id', flat=True)

    # Within one filter() call both tag conditions must hold for the SAME tag.
    # A single exclude(tags__name=..., tags__system=...) gives no such
    # guarantee across a multi-valued relation: it would also drop preprints
    # that merely have a user tag named 'qatest' plus any unrelated system tag.
    # Excluding through this subquery keeps the match on one tag.
    qa_test_preprints = Preprint.objects.filter(
        tags__name='qatest',
        tags__system=True,
    ).values('id')

    public_query = Q(is_published=True, is_public=True, deleted__isnull=True)
    withdrawn_query = Q(date_withdrawn__isnull=False, ever_public=True)

    qs = Preprint.objects.filter(
        versioned_guids__version=1,
        id__in=has_versioned_doi,
    ).filter(
        public_query | withdrawn_query
    ).exclude(
        id__in=already_has_unversioned
    ).exclude(
        id__in=qa_test_preprints
    ).select_related('provider').distinct()

    if provider_id:
        qs = qs.filter(provider___id=provider_id)

    return qs
140+
141+
142+
def register_missing_unversioned_dois(dry_run=True, batch_size=500, rate_limit=100, provider_id=None):
    """Queue an identifier update for each preprint lacking an unversioned DOI.

    :param dry_run: when truthy, only log what would be queued.
    :param batch_size: maximum number of preprints handled in this call; a
        falsy value iterates over every candidate.
    :param rate_limit: sleep ``RATE_LIMIT_SLEEP`` seconds whenever the running
        record number is a multiple of this value; falsy disables the pause.
    :param provider_id: optional provider ``_id`` forwarded to the candidate query.
    """
    dry_run_prefix = '[DRY RUN] ' if dry_run else ''
    provider_suffix = f' (provider={provider_id})' if provider_id else ''

    candidates = get_preprints_needing_unversioned_doi(provider_id=provider_id)
    total = candidates.count()
    logger.info(f'{dry_run_prefix}{total} preprints need unversioned DOI registration{provider_suffix}')

    # A slice bounds the work per run; without a batch size, stream every row.
    preprints_iterable = candidates[:batch_size] if batch_size else candidates.iterator()

    queued_count = skipped_count = errored_count = 0
    for position, preprint in enumerate(preprints_iterable, 1):
        # Preprints whose provider has no DOI prefix are counted as skipped.
        if not preprint.provider.doi_prefix:
            logger.warning(f'Skipping preprint {preprint._id}: provider {preprint.provider._id} has no DOI prefix')
            skipped_count += 1
            continue

        if dry_run:
            logger.info(f'[DRY RUN] Would register unversioned DOI for preprint {preprint._id}')
            queued_count += 1
            continue

        # Pause before queueing every rate_limit-th record (skipped records count too).
        if rate_limit and position % rate_limit == 0:
            logger.info(f'Rate limit reached at {position} preprints, sleeping {RATE_LIMIT_SLEEP}s')
            time.sleep(RATE_LIMIT_SLEEP)

        # Best effort: a failure to queue one preprint must not abort the batch.
        try:
            async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id})
            logger.info(f'Queued unversioned DOI registration for preprint {preprint._id}')
            queued_count += 1
        except Exception:
            logger.exception(f'Failed to queue unversioned DOI registration for preprint {preprint._id}')
            errored_count += 1

    logger.info(
        f'{dry_run_prefix}Unversioned DOI pass done: '
        f'{queued_count} queued, {skipped_count} skipped, {errored_count} errored'
    )

    if batch_size and not dry_run:
        handled = queued_count + skipped_count + errored_count
        remaining = max(0, total - handled)
        logger.info(
            f'Estimated unversioned remaining after this batch: ~{remaining}. Re-run until 0 preprints remain.'
        )
195+
196+
197+
@app.task(name='osf.management.commands.resync_preprint_dois_v1', max_retries=0)
def resync_preprint_dois_v1_task(batch_size=500, rate_limit=100, dry_run=False, provider_id=None):
    """Task entry point: run the v1 DOI resync, then the unversioned DOI pass.

    Both passes receive the same options. The task is registered with
    ``max_retries=0``.

    :param batch_size: forwarded to both passes.
    :param rate_limit: forwarded to both passes.
    :param dry_run: forwarded to both passes; note it defaults to False here.
    :param provider_id: forwarded to both passes.
    """
    shared_options = {
        'dry_run': dry_run,
        'batch_size': batch_size,
        'rate_limit': rate_limit,
        'provider_id': provider_id,
    }
    resync_preprint_dois_v1(**shared_options)
    register_missing_unversioned_dois(**shared_options)
211+
212+
104213
class Command(BaseCommand):
105214
help = (
106215
'Resync DOIs for version-1 preprints that are missing the versioned DOI suffix (_v1). '

website/identifiers/clients/crossref.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def build_posted_content(self, preprint, element, include_relation, doi_override
148148
preprint_versions = preprint.get_preprint_versions(
149149
versioned_guids__version__lt=preprint.version,
150150
include_rejected=False,
151-
)
151+
) if include_relation else []
152152
if preprint_versions:
153153
for previous_version in preprint_versions:
154154

website/settings/defaults.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,11 @@ class CeleryConfig:
711711
'schedule': crontab(minute=0, hour=5), # Daily 12 a.m
712712
'kwargs': {'dry_run': False},
713713
},
714+
'resync_preprint_dois_v1': {
715+
'task': 'osf.management.commands.resync_preprint_dois_v1',
716+
'schedule': crontab(minute=0, hour=5),  # Daily at 05:00 scheduler time; 12 a.m. EST / 1 a.m. EDT if the beat schedule runs in UTC
717+
'kwargs': {'dry_run': False},
718+
},
714719
}
715720

716721

0 commit comments

Comments
 (0)