Skip to content

Commit ba251e9

Browse files
committed
Add extraction failure tracking with 10% threshold validation
1 parent: dce7db5; commit: ba251e9

1 file changed

Lines changed: 18 additions & 0 deletions

File tree

scripts/1-fetch/zenodo_fetch.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -380,6 +380,8 @@ def main():
380380

381381
session = setup_session()
382382
all_records = []
383+
extraction_failures = 0
384+
total_processed = 0
383385
license_counts = Counter()
384386
year_counts = defaultdict(Counter)
385387
type_counts = defaultdict(Counter)
@@ -440,9 +442,11 @@ def main():
440442
break
441443

442444
record_info = extract_record_info(record)
445+
total_processed += 1
443446

444447
# Skip records where extraction failed
445448
if not record_info:
449+
extraction_failures += 1
446450
continue
447451

448452
# Only include records with valid licenses (CC filtering at API)
@@ -471,6 +475,20 @@ def main():
471475
# Be respectful to the API - increased delay for rate limiting
472476
time.sleep(2.0)
473477

478+
# Check for excessive extraction failures
479+
if total_processed > 0:
480+
failure_rate = extraction_failures / total_processed
481+
if failure_rate > 0.1: # More than 10% failures
482+
raise shared.QuantifyingException(
483+
f"Too many extraction failures: {extraction_failures}/{total_processed} "
484+
f"({failure_rate:.1%}) - data quality issues detected"
485+
)
486+
elif extraction_failures > 0:
487+
LOGGER.warning(
488+
f"Extraction failures: {extraction_failures}/{total_processed} "
489+
f"({failure_rate:.1%})"
490+
)
491+
474492
if not all_records:
475493
LOGGER.warning("No CC-licensed records found")
476494
else:

0 commit comments

Comments
 (0)