Skip to content

Commit 1c049a4

Browse files
committed
Add manage command to resync preprint dois v1
1 parent f7a489b commit 1c049a4

File tree

2 files changed

+316
-0
lines changed

2 files changed

+316
-0
lines changed
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import logging
2+
import time
3+
4+
from django.contrib.contenttypes.models import ContentType
5+
from django.core.management.base import BaseCommand
6+
from django.db.models import Q
7+
8+
from osf.models import Preprint, Identifier
9+
from osf.models.base import VersionedGuidMixin
10+
from osf.management.commands.sync_doi_metadata import async_request_identifier_update
11+
12+
logger = logging.getLogger(__name__)
13+
14+
RATE_LIMIT_SLEEP = 60 * 5
15+
16+
def get_preprints_needing_v1_doi(provider_id=None):
    """Build a queryset of version-1 preprints whose DOI still lacks a version suffix.

    Includes published, public, non-deleted preprints as well as withdrawn
    preprints that were ever public; excludes preprints that already have a
    versioned DOI identifier and anything carrying the system ``qatest`` tag.

    :param provider_id: optional provider ``_id`` to restrict results to.
    :return: distinct ``Preprint`` queryset with ``provider`` pre-selected.
    """
    preprint_ct = ContentType.objects.get_for_model(Preprint)

    # object_ids of preprints whose active DOI value already contains the
    # guid version delimiter, i.e. the DOI has already been resynced.
    versioned_doi_object_ids = Identifier.objects.filter(
        content_type=preprint_ct,
        category='doi',
        deleted__isnull=True,
        value__contains=VersionedGuidMixin.GUID_VERSION_DELIMITER,
    ).values_list('object_id', flat=True)

    visible = Q(is_published=True, is_public=True, deleted__isnull=True)
    withdrawn_but_was_public = Q(date_withdrawn__isnull=False, ever_public=True)

    preprints = (
        Preprint.objects
        .filter(versioned_guids__version=1)
        .filter(visible | withdrawn_but_was_public)
        .exclude(id__in=versioned_doi_object_ids)
        # single exclude() call so both conditions must match the same tag row
        .exclude(tags__name='qatest', tags__system=True)
        .select_related('provider')
        .distinct()
    )

    if provider_id:
        preprints = preprints.filter(provider___id=provider_id)

    return preprints
44+
45+
46+
def resync_preprint_dois_v1(dry_run=True, batch_size=0, rate_limit=100, provider_id=None):
    """Queue DOI-update tasks for v1 preprints whose DOI lacks a version suffix.

    :param dry_run: when True, only log what would be queued.
    :param batch_size: maximum preprints to process (0 means no limit).
    :param rate_limit: sleep after every N records; 0 disables throttling.
    :param provider_id: optional provider ``_id`` to restrict the resync to.
    """
    prefix = '[DRY RUN] ' if dry_run else ''
    candidates = get_preprints_needing_v1_doi(provider_id=provider_id)
    suffix = f' (provider={provider_id})' if provider_id else ''
    logger.info(f'{prefix}{candidates.count()} preprints need v1 DOI resync{suffix}')

    # Slice when a batch size is requested, otherwise stream the full queryset.
    batch = candidates[:batch_size] if batch_size else candidates.iterator()

    queued_count = 0
    skipped_count = 0
    for position, preprint in enumerate(batch, 1):
        provider = preprint.provider
        if not provider.doi_prefix:
            # Cannot mint/update a DOI without a provider prefix.
            logger.warning(
                f'Skipping preprint {preprint._id}: '
                f'provider {provider._id} has no DOI prefix'
            )
            skipped_count += 1
            continue

        if dry_run:
            logger.info(f'[DRY RUN] Would resync DOI for preprint {preprint._id}')
            queued_count += 1
            continue

        # Throttle submissions: pause before every Nth record.
        if rate_limit and position % rate_limit == 0:
            logger.info(f'Rate limit reached at {position} preprints, sleeping {RATE_LIMIT_SLEEP}s')
            time.sleep(RATE_LIMIT_SLEEP)

        async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id})
        logger.info(f'Queued DOI resync for preprint {preprint._id}')
        queued_count += 1

    logger.info(f'{prefix}Done: {queued_count} preprints queued, {skipped_count} skipped (no DOI prefix)')
89+
90+
91+
class Command(BaseCommand):
    """Management command wrapper around ``resync_preprint_dois_v1``."""

    def add_arguments(self, parser):
        super().add_arguments(parser)
        parser.add_argument(
            '--dry_run',
            dest='dry_run',
            action='store_true',
            help='Log what would be done without submitting to Crossref.',
        )
        # dest resolves from the first long option, so flag order is safe.
        parser.add_argument(
            '-b',
            '--batch_size',
            type=int,
            default=0,
            help='Maximum number of preprints to process (0 = no limit).',
        )
        parser.add_argument(
            '-r',
            '--rate_limit',
            type=int,
            default=100,
            help='Sleep between Crossref submissions every N preprints.',
        )
        parser.add_argument(
            '-p',
            '--provider',
            dest='provider_id',
            type=str,
            default=None,
            help='Restrict to a single provider _id (e.g. socarxiv).',
        )

    def handle(self, *args, **options):
        # Delegate straight to the module-level entry point.
        kwargs = {
            key: options[key]
            for key in ('dry_run', 'batch_size', 'rate_limit', 'provider_id')
        }
        resync_preprint_dois_v1(**kwargs)
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import pytest
2+
from unittest import mock
3+
from django.utils import timezone
4+
5+
from osf.models import Preprint
6+
from osf_tests.factories import PreprintFactory, PreprintProviderFactory
7+
from osf.management.commands.resync_preprint_dois_v1 import (
8+
get_preprints_needing_v1_doi,
9+
resync_preprint_dois_v1,
10+
)
11+
from website import settings
12+
13+
pytestmark = pytest.mark.django_db
14+
15+
16+
@pytest.fixture()
def provider():
    """Preprint provider with a DOI prefix configured."""
    preprint_provider = PreprintProviderFactory()
    preprint_provider.doi_prefix = '10.31219'
    preprint_provider.save()
    return preprint_provider
22+
23+
24+
@pytest.fixture()
def preprint(provider):
    """Published preprint carrying a legacy (unversioned-guid) DOI."""
    published = PreprintFactory(provider=provider, is_published=True)
    legacy_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=published.get_guid()._id)
    published.set_identifier_values(doi=legacy_doi, save=True)
    return published
30+
31+
32+
@pytest.fixture()
def preprint_with_v1_doi(provider):
    """Published preprint whose DOI already uses the versioned guid (``pp._id``)."""
    versioned = PreprintFactory(provider=provider, is_published=True)
    versioned_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=versioned._id)
    versioned.set_identifier_values(doi=versioned_doi, save=True)
    return versioned
38+
39+
40+
class TestGetPreprintsNeedingV1Doi:
    """Tests for the queryset built by ``get_preprints_needing_v1_doi``.

    NOTE(review): renamed from ``TestGetPreprrintsNeedingV1Doi`` (typo, double
    'r'); pytest discovers any ``Test*`` class, so the rename is safe.
    """

    def test_includes_public_preprint_without_versioned_doi(self, preprint):
        # Published + public + legacy DOI -> needs resync.
        qs = get_preprints_needing_v1_doi()
        assert preprint in qs

    def test_excludes_preprint_with_versioned_doi(self, preprint_with_v1_doi):
        # DOI already contains the version delimiter -> already resynced.
        qs = get_preprints_needing_v1_doi()
        assert preprint_with_v1_doi not in qs

    def test_excludes_preprint_with_no_doi_if_private(self, provider):
        private_preprint = PreprintFactory(provider=provider, is_published=False)
        private_preprint.is_public = False
        private_preprint.save()
        qs = get_preprints_needing_v1_doi()
        assert private_preprint not in qs

    def test_includes_withdrawn_preprint_with_ever_public(self, provider):
        # Withdrawn preprints keep their DOI, so ever-public ones still qualify.
        pp = PreprintFactory(provider=provider, is_published=True)
        old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id)
        pp.set_identifier_values(doi=old_doi, save=True)
        pp.date_withdrawn = timezone.now()
        pp.ever_public = True
        pp.save()
        qs = get_preprints_needing_v1_doi()
        assert pp in qs

    def test_excludes_withdrawn_preprint_never_public(self, provider):
        pp = PreprintFactory(provider=provider, is_published=False)
        # Bypass model save() so withdrawal can be set without publication.
        Preprint.objects.filter(pk=pp.pk).update(date_withdrawn=timezone.now())
        qs = get_preprints_needing_v1_doi()
        assert pp not in qs

    def test_excludes_version_2_preprint(self, preprint):
        from tests.utils import capture_notifications
        with capture_notifications():
            v2 = PreprintFactory.create_version(preprint, is_published=True, set_doi=False)
        old_doi = settings.DOI_FORMAT.format(prefix=preprint.provider.doi_prefix, guid=v2.get_guid()._id)
        v2.set_identifier_values(doi=old_doi, save=True)
        qs = get_preprints_needing_v1_doi()
        assert v2 not in qs

    def test_excludes_qatest_tagged_preprint(self, preprint):
        preprint.add_system_tag('qatest')
        qs = get_preprints_needing_v1_doi()
        assert preprint not in qs

    def test_excludes_deleted_preprint(self, preprint):
        preprint.deleted = timezone.now()
        preprint.save()
        qs = get_preprints_needing_v1_doi()
        assert preprint not in qs

    def test_provider_filter_limits_results(self, preprint, provider):
        other_provider = PreprintProviderFactory()
        other_provider.doi_prefix = '10.12345'
        other_provider.save()
        other_preprint = PreprintFactory(provider=other_provider, is_published=True)
        old_doi = settings.DOI_FORMAT.format(prefix=other_provider.doi_prefix, guid=other_preprint.get_guid()._id)
        other_preprint.set_identifier_values(doi=old_doi, save=True)

        qs = get_preprints_needing_v1_doi(provider_id=provider._id)
        assert preprint in qs
        assert other_preprint not in qs

    def test_preprint_with_no_doi_identifier_is_included(self, provider):
        # A preprint with no DOI at all still needs one minted/resynced.
        pp = PreprintFactory(provider=provider, is_published=True, set_doi=False)
        qs = get_preprints_needing_v1_doi()
        assert pp in qs
109+
110+
111+
class TestResyncPreprintDoisV1:
    """Tests for the ``resync_preprint_dois_v1`` entry point."""

    @staticmethod
    def _queued_preprint_ids(mock_task):
        # Preprint ids passed to the mocked celery task via apply_async(kwargs=...).
        return [
            call.kwargs['kwargs']['preprint_id']
            for call in mock_task.apply_async.call_args_list
        ]

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_dry_run_does_not_queue_tasks(self, mock_task, preprint):
        resync_preprint_dois_v1(dry_run=True)
        mock_task.apply_async.assert_not_called()

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_live_run_queues_task_for_each_preprint(self, mock_task, preprint):
        resync_preprint_dois_v1(dry_run=False, rate_limit=0)
        mock_task.apply_async.assert_called_once_with(kwargs={'preprint_id': preprint._id})

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_batch_size_limits_processed_count(self, mock_task, provider):
        # Five eligible preprints, but only batch_size=2 should be queued.
        # (Fixed: the original accumulated the preprints in an unused list.)
        for _ in range(5):
            pp = PreprintFactory(provider=provider, is_published=True)
            old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id)
            pp.set_identifier_values(doi=old_doi, save=True)

        resync_preprint_dois_v1(dry_run=False, batch_size=2, rate_limit=0)
        assert mock_task.apply_async.call_count == 2

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_skips_provider_without_doi_prefix(self, mock_task, provider):
        no_prefix_provider = PreprintProviderFactory()
        no_prefix_provider.doi_prefix = ''
        no_prefix_provider.save()
        pp = PreprintFactory(provider=no_prefix_provider, is_published=True)
        old_doi = '10.000/old-doi'
        pp.set_identifier_values(doi=old_doi, save=True)

        resync_preprint_dois_v1(dry_run=False, rate_limit=0)
        assert pp._id not in self._queued_preprint_ids(mock_task)

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_provider_filter_is_applied(self, mock_task, preprint, provider):
        other_provider = PreprintProviderFactory()
        other_provider.doi_prefix = '10.99999'
        other_provider.save()
        other_pp = PreprintFactory(provider=other_provider, is_published=True)
        old_doi = settings.DOI_FORMAT.format(prefix=other_provider.doi_prefix, guid=other_pp.get_guid()._id)
        other_pp.set_identifier_values(doi=old_doi, save=True)

        resync_preprint_dois_v1(dry_run=False, rate_limit=0, provider_id=provider._id)

        queued_ids = self._queued_preprint_ids(mock_task)
        assert preprint._id in queued_ids
        assert other_pp._id not in queued_ids

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_already_versioned_doi_is_not_queued(self, mock_task, preprint_with_v1_doi):
        resync_preprint_dois_v1(dry_run=False, rate_limit=0)
        assert preprint_with_v1_doi._id not in self._queued_preprint_ids(mock_task)

    @mock.patch('osf.management.commands.resync_preprint_dois_v1.time.sleep')
    @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update')
    def test_rate_limit_triggers_sleep(self, mock_task, mock_sleep, provider):
        # Three eligible preprints with rate_limit=2 -> exactly one sleep.
        for _ in range(3):
            pp = PreprintFactory(provider=provider, is_published=True)
            old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id)
            pp.set_identifier_values(doi=old_doi, save=True)

        resync_preprint_dois_v1(dry_run=False, rate_limit=2)
        mock_sleep.assert_called_once()

0 commit comments

Comments
 (0)