|
5 | 5 | from django.core.management.base import BaseCommand |
6 | 6 | from django.db.models import Q |
7 | 7 |
|
| 8 | +from framework.celery_tasks import app |
8 | 9 | from osf.models import Preprint, Identifier |
9 | 10 | from osf.models.base import VersionedGuidMixin |
10 | 11 | from osf.management.commands.sync_doi_metadata import async_request_identifier_update |
@@ -101,6 +102,114 @@ def resync_preprint_dois_v1(dry_run=True, batch_size=500, rate_limit=100, provid |
101 | 102 | ) |
102 | 103 |
|
103 | 104 |
|
| 105 | +def get_preprints_needing_unversioned_doi(provider_id=None): |
| 106 | + content_type = ContentType.objects.get_for_model(Preprint) |
| 107 | + |
| 108 | + already_has_unversioned = Identifier.objects.filter( |
| 109 | + content_type=content_type, |
| 110 | + category='doi_unversioned', |
| 111 | + deleted__isnull=True, |
| 112 | + ).values_list('object_id', flat=True) |
| 113 | + |
| 114 | + has_versioned_doi = Identifier.objects.filter( |
| 115 | + content_type=content_type, |
| 116 | + category='doi', |
| 117 | + deleted__isnull=True, |
| 118 | + value__contains=VersionedGuidMixin.GUID_VERSION_DELIMITER, |
| 119 | + ).values_list('object_id', flat=True) |
| 120 | + |
| 121 | + public_query = Q(is_published=True, is_public=True, deleted__isnull=True) |
| 122 | + withdrawn_query = Q(date_withdrawn__isnull=False, ever_public=True) |
| 123 | + |
| 124 | + qs = Preprint.objects.filter( |
| 125 | + versioned_guids__version=1, |
| 126 | + id__in=has_versioned_doi, |
| 127 | + ).filter( |
| 128 | + public_query | withdrawn_query |
| 129 | + ).exclude( |
| 130 | + id__in=already_has_unversioned |
| 131 | + ).exclude( |
| 132 | + tags__name='qatest', |
| 133 | + tags__system=True, |
| 134 | + ).select_related('provider').distinct() |
| 135 | + |
| 136 | + if provider_id: |
| 137 | + qs = qs.filter(provider___id=provider_id) |
| 138 | + |
| 139 | + return qs |
| 140 | + |
| 141 | + |
| 142 | +def register_missing_unversioned_dois(dry_run=True, batch_size=500, rate_limit=100, provider_id=None): |
| 143 | + preprints_to_update = get_preprints_needing_unversioned_doi(provider_id=provider_id) |
| 144 | + |
| 145 | + total = preprints_to_update.count() |
| 146 | + logger.info( |
| 147 | + f'{"[DRY RUN] " if dry_run else ""}' |
| 148 | + f'{total} preprints need unversioned DOI registration' |
| 149 | + + (f' (provider={provider_id})' if provider_id else '') |
| 150 | + ) |
| 151 | + |
| 152 | + if batch_size: |
| 153 | + preprints_iterable = preprints_to_update[:batch_size] |
| 154 | + else: |
| 155 | + preprints_iterable = preprints_to_update.iterator() |
| 156 | + |
| 157 | + queued = 0 |
| 158 | + skipped = 0 |
| 159 | + errored = 0 |
| 160 | + for record_number, preprint in enumerate(preprints_iterable, 1): |
| 161 | + if not preprint.provider.doi_prefix: |
| 162 | + logger.warning( |
| 163 | + f'Skipping preprint {preprint._id}: ' |
| 164 | + f'provider {preprint.provider._id} has no DOI prefix' |
| 165 | + ) |
| 166 | + skipped += 1 |
| 167 | + continue |
| 168 | + |
| 169 | + if dry_run: |
| 170 | + logger.info(f'[DRY RUN] Would register unversioned DOI for preprint {preprint._id}') |
| 171 | + queued += 1 |
| 172 | + continue |
| 173 | + |
| 174 | + if rate_limit and not record_number % rate_limit: |
| 175 | + logger.info(f'Rate limit reached at {record_number} preprints, sleeping {RATE_LIMIT_SLEEP}s') |
| 176 | + time.sleep(RATE_LIMIT_SLEEP) |
| 177 | + |
| 178 | + try: |
| 179 | + async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id}) |
| 180 | + logger.info(f'Queued unversioned DOI registration for preprint {preprint._id}') |
| 181 | + queued += 1 |
| 182 | + except Exception: |
| 183 | + logger.exception(f'Failed to queue unversioned DOI registration for preprint {preprint._id}') |
| 184 | + errored += 1 |
| 185 | + |
| 186 | + logger.info( |
| 187 | + f'{"[DRY RUN] " if dry_run else ""}' |
| 188 | + f'Unversioned DOI pass done: {queued} queued, {skipped} skipped, {errored} errored' |
| 189 | + ) |
| 190 | + if not dry_run and batch_size: |
| 191 | + logger.info( |
| 192 | + f'Estimated unversioned remaining after this batch: ~{max(0, total - queued - skipped - errored)}. ' |
| 193 | + f'Re-run until 0 preprints remain.' |
| 194 | + ) |
| 195 | + |
| 196 | + |
| 197 | +@app.task(name='osf.management.commands.resync_preprint_dois_v1', max_retries=0) |
| 198 | +def resync_preprint_dois_v1_task(batch_size=500, rate_limit=100, dry_run=False, provider_id=None): |
| 199 | + resync_preprint_dois_v1( |
| 200 | + dry_run=dry_run, |
| 201 | + batch_size=batch_size, |
| 202 | + rate_limit=rate_limit, |
| 203 | + provider_id=provider_id, |
| 204 | + ) |
| 205 | + register_missing_unversioned_dois( |
| 206 | + dry_run=dry_run, |
| 207 | + batch_size=batch_size, |
| 208 | + rate_limit=rate_limit, |
| 209 | + provider_id=provider_id, |
| 210 | + ) |
| 211 | + |
| 212 | + |
104 | 213 | class Command(BaseCommand): |
105 | 214 | help = ( |
106 | 215 | 'Resync DOIs for version-1 preprints that are missing the versioned DOI suffix (_v1). ' |
|
0 commit comments