@@ -412,38 +412,30 @@ def find_close_pairs(similarity_matrix, min_similarity=0.9):
412412 matching_idxs = np .where (similarity_matrix >= min_similarity )
413413 return list (zip (matching_idxs [0 ], matching_idxs [1 ], similarity_matrix [matching_idxs ]))
414414
415- if __name__ == "__main__" :
415+ def review_close_pairs (close_pairs : list [tuple [int , int , float ]]):
416+ """
417+ Iterates through close pairs and prompts the user for decisions.
418+ Handles skipping files that were already moved or marked distinct.
419+ """
416420 import random
417421 import gdrive
418- from clean_google_drive import remove_duplicate_files
419- print ("Cleaning up files that are MD5 identical first..." )
420- remove_duplicate_files ()
421- print ("Loading the latest PDF embeddings..." )
422- load (True )
423- print ("Successfully loaded the latest PDF embeddings!" )
424- if not prompt ("Would you like to review the embeddings for any duplicates?" ):
425- print ("Okay then :)" )
426- exit (0 )
427- min_similarity = - 1
428- while min_similarity < 0 or min_similarity > 1 :
429- min_similarity = float (input_with_prefill ("Min similarity: " , "0.93" , float ))
430- print ("Loading the similarity matrix..." )
431- similarity_matrix = calculate_similarity_matrix ()
432- print ("Selecting close neighboring pairs..." )
433- close_pairs = find_close_pairs (similarity_matrix , min_similarity = min_similarity )
434- del similarity_matrix
422+
435423 MAX_TO_CONSIDER = 1000
436424 if len (close_pairs ) > MAX_TO_CONSIDER :
437425 print (f"INFO: Restricting to { MAX_TO_CONSIDER } pairs of the { len (close_pairs )} found" )
438426 close_pairs = random .sample (close_pairs , MAX_TO_CONSIDER )
439427 else :
440428 random .shuffle (close_pairs )
429+
441430 all_decisions = []
442431 if DECISION_HISTORY_FILE .exists ():
443432 all_decisions = joblib .load (DECISION_HISTORY_FILE )
444433 assert isinstance (all_decisions , list ), "Expected DECISION_HISTORY to be a list"
434+
445435 distinctions = gdrive .FileDistinctionManager ()
446- close_pairs = [
436+
437+ # Initial filter
438+ pairs_to_review = [
447439 (
448440 google_files [idx ],
449441 google_files [jdx ],
@@ -454,26 +446,58 @@ def find_close_pairs(similarity_matrix, min_similarity=0.9):
454446 google_files [jdx ]['parent_id' ] != gdrive .OLD_VERSIONS_FOLDER_ID and
455447 not distinctions .are_distinct (google_files [idx ]['id' ], google_files [jdx ]['id' ])
456448 ]
457- print (f"Found { len (close_pairs )} close pairs that need review..." )
458- if len (close_pairs ) > 0 :
449+
450+ print (f"Found { len (pairs_to_review )} close pairs that need review..." )
451+ if len (pairs_to_review ) > 0 :
459452 print ("Don't forget to open your browser!" )
453+
460454 done = 0
461- for gfa , gfb , sim in close_pairs :
455+ for gfa , gfb , sim in pairs_to_review :
462456 done += 1
463- print (f"\n ---{ done } /{ len (close_pairs )} \n " )
457+ print (f"\n ---{ done } /{ len (pairs_to_review )} \n " )
458+
464459 # handle_close_pair_decision can sometimes mark other pairs distinct
465460 # or even move other files to the graveyard, so we have to recheck here
466461 if distinctions .are_distinct (gfa ['id' ], gfb ['id' ]):
467462 print ("Already merged as distinct :)" )
468463 continue
464+
465+ # Refresh from cache to get latest parent_id
469466 gfa = gdrive .gcache .get_item (gfa ['id' ])
470467 gfb = gdrive .gcache .get_item (gfb ['id' ])
468+
469+ if not gfa or not gfb :
470+ print ("One of the files is missing from cache, skipping..." )
471+ continue
472+
471473 if gdrive .OLD_VERSIONS_FOLDER_ID in [gfa ['parent_id' ], gfb ['parent_id' ]]:
472474 print ("Already moved to Old Versions :)" )
473475 continue
476+
474477 decision = gdrive .is_duplicate_prompt (gfa , gfb , similariy = sim )
475478 all_decisions .append ((decision , gfa , gfb , sim ))
476479 joblib .dump (all_decisions , DECISION_HISTORY_FILE , compress = 2 )
477480 distinctions .handle_close_pair_decision (decision , gfa , gfb )
478481
if __name__ == "__main__":
    # Script entry point: clean up exact duplicates, reload the embeddings,
    # ask for a similarity threshold, then hand the matching pairs to the
    # interactive review.

    # Only needed when this file is run as a script.
    from clean_google_drive import remove_duplicate_files

    print("Cleaning up files that are MD5 identical first...")
    remove_duplicate_files()

    print("Loading the latest PDF embeddings...")
    load(True)
    print("Successfully loaded the latest PDF embeddings!")

    if not prompt("Would you like to review the embeddings for any duplicates?"):
        print("Okay then :)")
        # The bare `exit` builtin is injected by the `site` module and is not
        # available under `python -S` or in frozen builds; raising SystemExit
        # has the same effect and always works.
        raise SystemExit(0)

    # Keep asking until the threshold lies in [0, 1]. The chained comparison
    # also rejects NaN, which the two separate `<` / `>` checks would have
    # silently accepted (every comparison against NaN is False).
    min_similarity = -1
    while not 0 <= min_similarity <= 1:
        min_similarity = float(input_with_prefill("Min similarity: ", "0.93", float))

    print("Loading the similarity matrix...")
    similarity_matrix = calculate_similarity_matrix()
    print("Selecting close neighboring pairs...")
    close_pairs = find_close_pairs(similarity_matrix, min_similarity=min_similarity)
    # The matrix is no longer needed once the pairs are extracted; drop the
    # reference before the interactive review (presumably to free memory).
    del similarity_matrix

    review_close_pairs(close_pairs)
502+
479503
0 commit comments