99
1010import logging
1111from itertools import groupby
12- from operator import attrgetter
1312
13+ from aboutcode .pipeline import LoopProgress
1414from django .db .models import Count
1515from django .db .models import Q
1616
@@ -26,54 +26,58 @@ class RemoveDuplicateAdvisoriesPipeline(VulnerableCodePipeline):
2626
@classmethod
def steps(cls):
    """
    Return the steps of this pipeline as a tuple of callables.

    ``recompute_content_ids`` is listed before ``remove_duplicates``.
    """
    ordered_steps = (cls.recompute_content_ids, cls.remove_duplicates)
    return ordered_steps
3033
def remove_duplicates(self):
    """
    Find advisories with the same content and keep only the oldest one.

    Advisories are grouped by ``unique_content_id``. Within each group the
    advisory with the earliest ``date_imported`` is kept and all the others
    are deleted. Advisories without a content ID are never treated as
    duplicates of each other and are left untouched.

    Deletion errors are logged and do not stop the pipeline.
    """
    delete_batch_size = 1000

    # groupby() only merges consecutive items, so the rows must arrive sorted
    # by the grouping key. "id" is a tie-breaker that keeps the ordering
    # stable from one page to the next.
    ordered_advisories = Advisory.objects.order_by("unique_content_id", "id").paginated()
    grouped_advisories = groupby(ordered_advisories, key=lambda adv: adv.unique_content_id)

    # The loop below advances once per group, so the progress total is the
    # number of distinct content IDs rather than the number of advisories.
    group_count = Advisory.objects.order_by().values("unique_content_id").distinct().count()
    progress = LoopProgress(total_iterations=group_count, logger=self.log)

    # Deletion is deferred until the scan is complete so that no row is
    # removed from under the paginated iteration.
    # NOTE(review): paginated() is not visible here; if it pages with
    # OFFSET/LIMIT, deleting during the scan would shift later pages.
    duplicate_ids = []
    for content_id, group in progress.iter(grouped_advisories):
        advisories = list(group)

        # An empty or missing content ID says nothing about the content, and a
        # group of one has nothing to remove.
        if not content_id or len(advisories) < 2:
            continue

        oldest = min(advisories, key=lambda adv: adv.date_imported)
        group_duplicate_ids = [adv.id for adv in advisories if adv.id != oldest.id]
        duplicate_ids.extend(group_duplicate_ids)

        self.log(
            f"Keeping advisory {oldest.id} and removing "
            f"{len(group_duplicate_ids)} duplicates for content ID {content_id}",
            level=logging.INFO,
        )

    deleted_count = 0
    for start in range(0, len(duplicate_ids), delete_batch_size):
        batch = duplicate_ids[start : start + delete_batch_size]
        try:
            Advisory.objects.filter(id__in=batch).delete()
        except Exception as e:
            # Best effort: report the failed batch and carry on with the rest.
            self.log(f"Error deleting advisories: {e}", level=logging.ERROR)
        else:
            deleted_count += len(batch)

    self.log(
        f"Removed {deleted_count} of {len(duplicate_ids)} duplicate advisories",
        level=logging.INFO,
    )
65+
def recompute_content_ids(self):
    """
    Recompute content IDs for all advisories.

    Every advisory gets a fresh ``unique_content_id`` from
    ``compute_content_id``. Changes are written with ``bulk_update`` in
    fixed-size batches, so only one batch of modified advisories is held in
    memory at a time.
    """
    batch_size = 1000

    progress = LoopProgress(
        total_iterations=Advisory.objects.count(),
        progress_step=1,
        logger=self.log,
    )

    # Iterate in primary-key order: the field being rewritten must not take
    # part in the ordering, otherwise saving a batch could move rows between
    # pages while the scan is still running.
    # NOTE(review): assumes paginated() pages lazily through the queryset.
    advisories = Advisory.objects.order_by("id").paginated()

    pending = []
    for advisory in progress.iter(advisories):
        advisory.unique_content_id = compute_content_id(advisory)
        pending.append(advisory)

        if len(pending) >= batch_size:
            Advisory.objects.bulk_update(pending, ["unique_content_id"], batch_size=batch_size)
            pending = []

    # Flush whatever is left over from the last, partially filled batch.
    if pending:
        Advisory.objects.bulk_update(pending, ["unique_content_id"], batch_size=batch_size)
0 commit comments