@@ -380,6 +380,8 @@ def main():
380380
381381 session = setup_session ()
382382 all_records = []
383+ extraction_failures = 0
384+ total_processed = 0
383385 license_counts = Counter ()
384386 year_counts = defaultdict (Counter )
385387 type_counts = defaultdict (Counter )
@@ -440,9 +442,11 @@ def main():
440442 break
441443
442444 record_info = extract_record_info (record )
445+ total_processed += 1
443446
444447 # Skip records where extraction failed
445448 if not record_info :
449+ extraction_failures += 1
446450 continue
447451
448452 # Only include records with valid licenses (CC filtering at API)
@@ -471,6 +475,20 @@ def main():
471475 # Be respectful to the API - increased delay for rate limiting
472476 time .sleep (2.0 )
473477
478+ # Check for excessive extraction failures
479+ if total_processed > 0 :
480+ failure_rate = extraction_failures / total_processed
481+ if failure_rate > 0.1 : # More than 10% failures
482+ raise shared .QuantifyingException (
483+ f"Too many extraction failures: { extraction_failures } /{ total_processed } "
484+ f"({ failure_rate :.1%} ) - data quality issues detected"
485+ )
486+ elif extraction_failures > 0 :
487+ LOGGER .warning (
488+ f"Extraction failures: { extraction_failures } /{ total_processed } "
489+ f"({ failure_rate :.1%} )"
490+ )
491+
474492 if not all_records :
475493 LOGGER .warning ("No CC-licensed records found" )
476494 else :
0 commit comments