@@ -232,59 +232,123 @@ def are_endpoints_duplicates(new_finding, to_duplicate_finding):
232232 return False
233233
234234
def build_candidate_scope_queryset(test, mode="deduplication", service=None):
    """
    Build a queryset of candidate findings scoped for matching.

    Args:
        test: The test to scope from.
        mode: "deduplication" (candidates may come from other tests within the
            engagement/product dedupe scope) or "reimport" (candidates come
            from the same test only).
        service: Accepted for interface compatibility but NOT applied as a
            filter here. For reimport it is unnecessary because service is part
            of the hash_code calculation (HASH_CODE_FIELDS_ALWAYS = ["service"]),
            so matching by hash_code already guarantees a service match.

    Returns:
        A Finding queryset carrying the select_related/prefetch_related needed
        by the candidate-matching helpers.
    """
    if mode == "reimport":
        # Reimport only ever matches findings inside the same test.
        queryset = Finding.objects.filter(test=test)
    else:  # deduplication mode
        if test.engagement.deduplication_on_engagement:
            # Engagement-scoped dedupe: stay inside the current engagement.
            scope_q = Q(test__engagement=test.engagement)
        else:
            # Product scope limited to current product, but exclude engagements
            # that opted into engagement-scoped dedupe.
            scope_q = Q(test__engagement__product=test.engagement.product) & (
                Q(test__engagement=test.engagement)
                | Q(test__engagement__deduplication_on_engagement=False)
            )
        queryset = Finding.objects.filter(scope_q)

    # Base prefetches needed by both modes.
    prefetch_list = ["endpoints", "vulnerability_id_set", "found_by"]

    # Reimport additionally inspects per-endpoint status.
    if mode == "reimport":
        prefetch_list.extend([
            "status_finding",
            "status_finding__endpoint",
        ])

    return (
        queryset
        .select_related("test", "test__engagement", "test__test_type")
        .prefetch_related(*prefetch_list)
    )
251277
252278
def find_candidates_for_deduplication_hash(test, findings, mode="deduplication", service=None):
    """
    Find candidate findings by hash_code. Works for both deduplication and reimport.

    Args:
        test: The test to scope from.
        findings: List of findings to find candidates for.
        mode: "deduplication" or "reimport".
        service: Passed through to build_candidate_scope_queryset; not applied
            as a filter there (service is part of the hash_code).

    Returns:
        Dict mapping hash_code -> list of existing findings, ordered by id.
        Empty dict when no incoming finding has a hash_code.
    """
    base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service)
    hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None}
    if not hash_codes:
        return {}

    existing_qs = base_queryset.filter(hash_code__in=hash_codes).exclude(hash_code=None)
    if mode == "deduplication":
        # Reimport matching may match against duplicates; deduplication must not.
        existing_qs = existing_qs.exclude(duplicate=True)
    existing_qs = existing_qs.order_by("id")

    existing_by_hash = {}
    for ef in existing_qs:
        existing_by_hash.setdefault(ef.hash_code, []).append(ef)

    # Conditional suffix avoids the trailing space the previous f-string
    # emitted when the mode marker was empty.
    log_suffix = " for reimport" if mode == "reimport" else ""
    deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes{log_suffix}")
    return existing_by_hash
269307
270308
def find_candidates_for_deduplication_unique_id(test, findings, mode="deduplication", service=None):
    """
    Find candidate findings by unique_id_from_tool. Works for both deduplication and reimport.

    Args:
        test: The test to scope from.
        findings: List of findings to find candidates for.
        mode: "deduplication" or "reimport".
        service: Passed through to build_candidate_scope_queryset; not applied
            as a filter there (service is part of the hash_code).

    Returns:
        Dict mapping unique_id_from_tool -> list of existing findings, ordered
        by id. Empty dict when no incoming finding has a unique_id_from_tool.
    """
    base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service)
    unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None}
    if not unique_ids:
        return {}

    existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None)
    if mode == "deduplication":
        # Reimport matching may match against duplicates; deduplication must not.
        existing_qs = existing_qs.exclude(duplicate=True)
    # unique_id_from_tool can only apply to the same test_type because it is parser dependent
    existing_qs = existing_qs.filter(test__test_type=test.test_type).order_by("id")

    existing_by_uid = {}
    for ef in existing_qs:
        existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef)

    # Conditional suffix avoids the trailing space the previous f-string
    # emitted when the mode marker was empty.
    log_suffix = " for reimport" if mode == "reimport" else ""
    deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs{log_suffix}")
    return existing_by_uid
284338
285339
286- def find_candidates_for_deduplication_uid_or_hash (test , findings ):
287- base_queryset = build_dedupe_scope_queryset (test )
340+ def find_candidates_for_deduplication_uid_or_hash (test , findings , mode = "deduplication" , service = None ):
341+ """
342+ Find candidates by unique_id_from_tool or hash_code. Works for both deduplication and reimport.
343+
344+ Args:
345+ test: The test to scope from
346+ findings: List of findings to find candidates for
347+ mode: "deduplication" or "reimport"
348+ service: Optional service filter (for deduplication mode, not used for reimport since service is in hash)
349+
350+ """
351+ base_queryset = build_candidate_scope_queryset (test , mode = mode , service = service )
288352 hash_codes = {f .hash_code for f in findings if getattr (f , "hash_code" , None ) is not None }
289353 unique_ids = {f .unique_id_from_tool for f in findings if getattr (f , "unique_id_from_tool" , None ) is not None }
290354 if not hash_codes and not unique_ids :
@@ -298,7 +362,11 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings):
298362 uid_q = Q (unique_id_from_tool__isnull = False , unique_id_from_tool__in = unique_ids ) & Q (test__test_type = test .test_type )
299363 cond |= uid_q
300364
301- existing_qs = base_queryset .filter (cond ).exclude (duplicate = True ).order_by ("id" )
365+ existing_qs = base_queryset .filter (cond )
366+ if mode == "deduplication" :
367+ # reimport matching will match against duplicates, import/deduplication doesn't.
368+ existing_qs = existing_qs .exclude (duplicate = True )
369+ existing_qs = existing_qs .order_by ("id" )
302370
303371 existing_by_hash = {}
304372 existing_by_uid = {}
@@ -307,13 +375,15 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings):
307375 existing_by_hash .setdefault (ef .hash_code , []).append (ef )
308376 if ef .unique_id_from_tool is not None :
309377 existing_by_uid .setdefault (ef .unique_id_from_tool , []).append (ef )
310- deduplicationLogger .debug (f"Found { len (existing_by_uid )} existing findings by unique IDs" )
311- deduplicationLogger .debug (f"Found { len (existing_by_hash )} existing findings by hash codes" )
378+
379+ log_msg = "for reimport" if mode == "reimport" else ""
380+ deduplicationLogger .debug (f"Found { len (existing_by_uid )} existing findings by unique IDs { log_msg } " )
381+ deduplicationLogger .debug (f"Found { len (existing_by_hash )} existing findings by hash codes { log_msg } " )
312382 return existing_by_uid , existing_by_hash
313383
314384
315385def find_candidates_for_deduplication_legacy (test , findings ):
316- base_queryset = build_dedupe_scope_queryset (test )
386+ base_queryset = build_candidate_scope_queryset (test , mode = "deduplication" )
317387 titles = {f .title for f in findings if getattr (f , "title" , None )}
318388 cwes = {f .cwe for f in findings if getattr (f , "cwe" , 0 )}
319389 cwes .discard (0 )
@@ -335,6 +405,52 @@ def find_candidates_for_deduplication_legacy(test, findings):
335405 return by_title , by_cwe
336406
337407
# TODO: should we align this with deduplication?
def find_candidates_for_reimport_legacy(test, findings, service=None):
    """
    Find all existing findings in the test that match any of the given findings
    by title and severity. Used for batch reimport to avoid a 1+N query problem.

    Legacy reimport matches on title (case-insensitive), severity, and
    numerical_severity. It is kept separate from legacy deduplication because
    the matching logic differs fundamentally (title+severity vs title+CWE).

    The service parameter is retained for backward compatibility but unused,
    since service is already part of the hash_code.
    """
    candidate_qs = build_candidate_scope_queryset(test, mode="reimport", service=None)

    # Gather every distinct (title, severity, numerical_severity) combination.
    match_keys = set()
    for new_finding in findings:
        if not new_finding.title:
            continue
        match_keys.add((
            new_finding.title.lower(),  # case-insensitive matching
            new_finding.severity,
            Finding.get_numerical_severity(new_finding.severity),
        ))

    if not match_keys:
        return {}

    # OR together one Q per combination so everything is fetched in one query.
    match_filter = Q()
    for lowered_title, sev, num_sev in match_keys:
        match_filter |= Q(
            title__iexact=lowered_title,
            severity=sev,
            numerical_severity=num_sev,
        )

    matched_qs = candidate_qs.filter(match_filter).order_by("id")

    # Index results by (lowered title, severity) for quick lookup.
    existing_by_key = {}
    for candidate in matched_qs:
        existing_by_key.setdefault((candidate.title.lower(), candidate.severity), []).append(candidate)

    total_matched = sum(len(v) for v in existing_by_key.values())
    deduplicationLogger.debug(f"Found {total_matched} existing findings by legacy matching for reimport")
    return existing_by_key
453+
338454def _is_candidate_older (new_finding , candidate ):
339455 # Ensure the newer finding is marked as duplicate of the older finding
340456 is_older = candidate .id < new_finding .id
0 commit comments