buddhist-uni
diff --git a/scripts/gdrive.py b/scripts/gdrive.py
Lines changed: 15 additions & 16 deletions
diff --git a/scripts/nearestpdf.py b/scripts/nearestpdf.py
Lines changed: 47 additions & 23 deletions
diff --git a/scripts/test_fixtures/drive.sqlite b/scripts/test_fixtures/drive.sqlite
76 KB
@@ -11,6 +11,7 @@
 #   (see get_gfolders_for_course for how those slugs are parsed)
 ########
 
+from enum import unique
 import requests
 import enum
 from datetime import datetime
@@ -477,9 +478,6 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
   """
 
   import website
-  if not website.content:
-    with yaspin(text="Loading website..."):
-      website.load()
   UNIMPORTANT_SLUGS = [
     'to-go-through',
     'to-split',
@@ -504,22 +502,20 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
   TAG_ORDER = {
     str(tf).removesuffix('.md'): idx+1
     for idx, tf in enumerate(website.config['collections']['tags']['order'])
+    # `website.config` is available on import, so no `website.load()` call is needed here
   }
   LO_PRI = len(TAG_ORDER)+1000
 
   #####
-  # If only one is in a slugged folder, keep that one
+  # If only one (unique) slug is represented among important folders, keep all files in that slug
   ####
   slugs = [folder_slugs.get(f['parents'][0]) for f in files]
-  filter_list = []
-  for unimportant in UNIMPORTANT_SLUGS:
-    filter_list.append(unimportant)
-    important_slugs = [slug for slug in slugs if slug not in filter_list]
-    num_slugs = len(important_slugs)
-    if num_slugs == 1:
-      # if there's only one file in a slugged folder, keep that one
-      # no need to even check for permissions
-      return [files[slugs.index(important_slugs[0])]['id']], IDSelectionReason.TAG_FOLDER
+  slugs = [s if s not in UNIMPORTANT_SLUGS else None for s in slugs]
+  unique_slugs = set(slugs)
+  unique_slugs.discard(None)
+  if len(unique_slugs) == 1:
+    important_slug_indexes = [i for i, slug in enumerate(slugs) if slug == list(unique_slugs)[0]]
+    return [files[i]['id'] for i in important_slug_indexes], IDSelectionReason.TAG_FOLDER
 
   #####
   # Don't trash any publicly-launched files
@@ -781,8 +777,10 @@ def _print_shortcuts(shortcuts: list[dict]):
     selected_to_keep = None
     selected_to_not = None
     if len(would_keep) > 1 and decision == ClosePairDecision.THEY_ARE_THE_SAME:
-      assert reason == IDSelectionReason.IS_PUBLIC
-      print("Both files are publicly launched!")
+      if reason == IDSelectionReason.IS_PUBLIC:
+        print("Both files are publicly launched!")
+      else:
+        print(f"Heuristics say to keep both files (Reason: {reason})")
       print("Please handle manually and select one of these to keep:")
       choice = radio_dial([
         DRIVE_LINK.format(actual_file_a['id']),
@@ -837,7 +835,7 @@ def _print_shortcuts(shortcuts: list[dict]):
         # marked as the same, ergo not distinct
         assert selected_to_not['id'] not in self.fileid_to_distinct_neighbors[selected_to_keep['id']]
         # since these two are marked the same, we should merge their clusters into a super-cluster
-        super_cluster = self.fileid_to_distinct_neighbors[select_ids_to_keep['id']] | \
+        super_cluster = self.fileid_to_distinct_neighbors[selected_to_keep['id']] | \
           self.fileid_to_distinct_neighbors[selected_to_not['id']]
         super_cluster.add(selected_to_keep['id'])
         points_to_not = fetch_distinct_file_pointing_to(self.gcache, selected_to_not['id'])
@@ -849,6 +847,7 @@ def _print_shortcuts(shortcuts: list[dict]):
           nn = super_cluster.copy()
           nn.remove(n)
           self.fileid_to_distinct_neighbors[n] = nn
+        del self.fileid_to_distinct_neighbors[selected_to_not['id']]
     # else: # the one we've marked for removal isn't part of the distinctions graph, so nothing to do here
     # Now, all that's left is to handle the marking!
     print(f"[Action] Moving old version to Old Versions...")
 
@@ -412,38 +412,30 @@ def find_close_pairs(similarity_matrix, min_similarity=0.9):
   matching_idxs = np.where(similarity_matrix >= min_similarity)
   return list(zip(matching_idxs[0], matching_idxs[1], similarity_matrix[matching_idxs]))
 
-if __name__ == "__main__":
+def review_close_pairs(close_pairs: list[tuple[int, int, float]]):
+  """
+  Iterates through close pairs and prompts the user for decisions.
+  Handles skipping files that were already moved or marked distinct.
+  """
   import random
   import gdrive
-  from clean_google_drive import remove_duplicate_files
-  print("Cleaning up files that are MD5 identical first...")
-  remove_duplicate_files()
-  print("Loading the latest PDF embeddings...")
-  load(True)
-  print("Successfully loaded the latest PDF embeddings!")
-  if not prompt("Would you like to review the embeddings for any duplicates?"):
-    print("Okay then :)")
-    exit(0)
-  min_similarity = -1
-  while min_similarity < 0 or min_similarity > 1:
-    min_similarity = float(input_with_prefill("Min similarity: ", "0.93", float))
-  print("Loading the similarity matrix...")
-  similarity_matrix = calculate_similarity_matrix()
-  print("Selecting close neighboring pairs...")
-  close_pairs = find_close_pairs(similarity_matrix, min_similarity=min_similarity)
-  del similarity_matrix
+  
   MAX_TO_CONSIDER = 1000
   if len(close_pairs) > MAX_TO_CONSIDER:
     print(f"INFO: Restricting to {MAX_TO_CONSIDER} pairs of the {len(close_pairs)} found")
     close_pairs = random.sample(close_pairs, MAX_TO_CONSIDER)
   else:
     random.shuffle(close_pairs)
+  
   all_decisions = []
   if DECISION_HISTORY_FILE.exists():
     all_decisions = joblib.load(DECISION_HISTORY_FILE)
     assert isinstance(all_decisions, list), "Expected DECISION_HISTORY to be a list"
+  
   distinctions = gdrive.FileDistinctionManager()
-  close_pairs = [
+  
+  # Initial filter
+  pairs_to_review = [
     (
       google_files[idx],
       google_files[jdx],
@@ -454,26 +446,58 @@ def find_close_pairs(similarity_matrix, min_similarity=0.9):
     google_files[jdx]['parent_id'] != gdrive.OLD_VERSIONS_FOLDER_ID and
     not distinctions.are_distinct(google_files[idx]['id'], google_files[jdx]['id'])
   ]
-  print(f"Found {len(close_pairs)} close pairs that need review...")
-  if len(close_pairs) > 0:
+  
+  print(f"Found {len(pairs_to_review)} close pairs that need review...")
+  if len(pairs_to_review) > 0:
     print("Don't forget to open your browser!")
+    
   done = 0
-  for gfa, gfb, sim in close_pairs:
+  for gfa, gfb, sim in pairs_to_review:
       done += 1
-      print(f"\n---{done}/{len(close_pairs)}\n")
+      print(f"\n---{done}/{len(pairs_to_review)}\n")
+      
       # handle_close_pair_decisions can sometimes mark other pairs distinct
       # or even move other files to the graveyard, so have to recheck here
       if distinctions.are_distinct(gfa['id'], gfb['id']):
         print("Already merged as distinct :)")
         continue
+        
+      # Refresh from cache to get latest parent_id
       gfa = gdrive.gcache.get_item(gfa['id'])
       gfb = gdrive.gcache.get_item(gfb['id'])
+      
+      if not gfa or not gfb:
+          print("One of the files is missing from cache, skipping...")
+          continue
+
       if gdrive.OLD_VERSIONS_FOLDER_ID in [gfa['parent_id'], gfb['parent_id']]:
         print("Already moved to Old Versions :)")
         continue
+        
       decision = gdrive.is_duplicate_prompt(gfa, gfb, similariy=sim)    
       all_decisions.append((decision, gfa, gfb, sim))
       joblib.dump(all_decisions, DECISION_HISTORY_FILE, compress=2)
       distinctions.handle_close_pair_decision(decision, gfa, gfb)
 
+if __name__ == "__main__":
+  from clean_google_drive import remove_duplicate_files
+  print("Cleaning up files that are MD5 identical first...")
+  remove_duplicate_files()
+  print("Loading the latest PDF embeddings...")
+  load(True)
+  print("Successfully loaded the latest PDF embeddings!")
+  if not prompt("Would you like to review the embeddings for any duplicates?"):
+    print("Okay then :)")
+    exit(0)
+  min_similarity = -1
+  while min_similarity < 0 or min_similarity > 1:
+    min_similarity = float(input_with_prefill("Min similarity: ", "0.93", float))
+  print("Loading the similarity matrix...")
+  similarity_matrix = calculate_similarity_matrix()
+  print("Selecting close neighboring pairs...")
+  close_pairs = find_close_pairs(similarity_matrix, min_similarity=min_similarity)
+  del similarity_matrix
+  
+  review_close_pairs(close_pairs)
+