Skip to content

Commit bfc1e2c

Browse files
committed
Add gdrive and nearestpdf tests
[skip ci]
1 parent a4454b7 commit bfc1e2c

5 files changed

Lines changed: 946 additions & 39 deletions

File tree

scripts/gdrive.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# (see get_gfolders_for_course for how those slugs are parsed)
1212
########
1313

14+
from enum import unique
1415
import requests
1516
import enum
1617
from datetime import datetime
@@ -477,9 +478,6 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
477478
"""
478479

479480
import website
480-
if not website.content:
481-
with yaspin(text="Loading website..."):
482-
website.load()
483481
UNIMPORTANT_SLUGS = [
484482
'to-go-through',
485483
'to-split',
@@ -504,22 +502,20 @@ def select_ids_to_keep(files: list[dict[str, any]], folder_slugs: dict[str, str]
504502
TAG_ORDER = {
505503
str(tf).removesuffix('.md'): idx+1
506504
for idx, tf in enumerate(website.config['collections']['tags']['order'])
505+
# `website.config` is accessible without needing to `load` it
507506
}
508507
LO_PRI = len(TAG_ORDER)+1000
509508

510509
#####
511-
# If only one is in a slugged folder, keep that one
510+
# If only one (unique) slug is represented among important folders, keep all files in that slug
512511
####
513512
slugs = [folder_slugs.get(f['parents'][0]) for f in files]
514-
filter_list = []
515-
for unimportant in UNIMPORTANT_SLUGS:
516-
filter_list.append(unimportant)
517-
important_slugs = [slug for slug in slugs if slug not in filter_list]
518-
num_slugs = len(important_slugs)
519-
if num_slugs == 1:
520-
# if there's only one file in a slugged folder, keep that one
521-
# no need to even check for permissions
522-
return [files[slugs.index(important_slugs[0])]['id']], IDSelectionReason.TAG_FOLDER
513+
slugs = [s if s not in UNIMPORTANT_SLUGS else None for s in slugs]
514+
unique_slugs = set(slugs)
515+
unique_slugs.discard(None)
516+
if len(unique_slugs) == 1:
517+
important_slug_indexes = [i for i, slug in enumerate(slugs) if slug == list(unique_slugs)[0]]
518+
return [files[i]['id'] for i in important_slug_indexes], IDSelectionReason.TAG_FOLDER
523519

524520
#####
525521
# Don't trash any publicly-launched files
@@ -781,8 +777,10 @@ def _print_shortcuts(shortcuts: list[dict]):
781777
selected_to_keep = None
782778
selected_to_not = None
783779
if len(would_keep) > 1 and decision == ClosePairDecision.THEY_ARE_THE_SAME:
784-
assert reason == IDSelectionReason.IS_PUBLIC
785-
print("Both files are publicly launched!")
780+
if reason == IDSelectionReason.IS_PUBLIC:
781+
print("Both files are publicly launched!")
782+
else:
783+
print(f"Heuristics say to keep both files (Reason: {reason})")
786784
print("Please handle manually and select one of these to keep:")
787785
choice = radio_dial([
788786
DRIVE_LINK.format(actual_file_a['id']),
@@ -837,7 +835,7 @@ def _print_shortcuts(shortcuts: list[dict]):
837835
# marked as the same, ergo not distinct
838836
assert selected_to_not['id'] not in self.fileid_to_distinct_neighbors[selected_to_keep['id']]
839837
# since these two are marked the same, we should merge their clusters into a super-cluster
840-
super_cluster = self.fileid_to_distinct_neighbors[select_ids_to_keep['id']] | \
838+
super_cluster = self.fileid_to_distinct_neighbors[selected_to_keep['id']] | \
841839
self.fileid_to_distinct_neighbors[selected_to_not['id']]
842840
super_cluster.add(selected_to_keep['id'])
843841
points_to_not = fetch_distinct_file_pointing_to(self.gcache, selected_to_not['id'])
@@ -849,6 +847,7 @@ def _print_shortcuts(shortcuts: list[dict]):
849847
nn = super_cluster.copy()
850848
nn.remove(n)
851849
self.fileid_to_distinct_neighbors[n] = nn
850+
del self.fileid_to_distinct_neighbors[selected_to_not['id']]
852851
# else: # the one we've marked for removal isn't part of the distinctions graph, so nothing to do here
853852
# Now, all that's left is to handle the marking!
854853
print(f"[Action] Moving old version to Old Versions...")

scripts/nearestpdf.py

Lines changed: 47 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -412,38 +412,30 @@ def find_close_pairs(similarity_matrix, min_similarity=0.9):
412412
matching_idxs = np.where(similarity_matrix >= min_similarity)
413413
return list(zip(matching_idxs[0], matching_idxs[1], similarity_matrix[matching_idxs]))
414414

415-
if __name__ == "__main__":
415+
def review_close_pairs(close_pairs: list[tuple[int, int, float]]):
416+
"""
417+
Iterates through close pairs and prompts the user for decisions.
418+
Handles skipping files that were already moved or marked distinct.
419+
"""
416420
import random
417421
import gdrive
418-
from clean_google_drive import remove_duplicate_files
419-
print("Cleaning up files that are MD5 identical first...")
420-
remove_duplicate_files()
421-
print("Loading the latest PDF embeddings...")
422-
load(True)
423-
print("Successfully loaded the latest PDF embeddings!")
424-
if not prompt("Would you like to review the embeddings for any duplicates?"):
425-
print("Okay then :)")
426-
exit(0)
427-
min_similarity = -1
428-
while min_similarity < 0 or min_similarity > 1:
429-
min_similarity = float(input_with_prefill("Min similarity: ", "0.93", float))
430-
print("Loading the similarity matrix...")
431-
similarity_matrix = calculate_similarity_matrix()
432-
print("Selecting close neighboring pairs...")
433-
close_pairs = find_close_pairs(similarity_matrix, min_similarity=min_similarity)
434-
del similarity_matrix
422+
435423
MAX_TO_CONSIDER = 1000
436424
if len(close_pairs) > MAX_TO_CONSIDER:
437425
print(f"INFO: Restricting to {MAX_TO_CONSIDER} pairs of the {len(close_pairs)} found")
438426
close_pairs = random.sample(close_pairs, MAX_TO_CONSIDER)
439427
else:
440428
random.shuffle(close_pairs)
429+
441430
all_decisions = []
442431
if DECISION_HISTORY_FILE.exists():
443432
all_decisions = joblib.load(DECISION_HISTORY_FILE)
444433
assert isinstance(all_decisions, list), "Expected DECISION_HISTORY to be a list"
434+
445435
distinctions = gdrive.FileDistinctionManager()
446-
close_pairs = [
436+
437+
# Initial filter
438+
pairs_to_review = [
447439
(
448440
google_files[idx],
449441
google_files[jdx],
@@ -454,26 +446,58 @@ def find_close_pairs(similarity_matrix, min_similarity=0.9):
454446
google_files[jdx]['parent_id'] != gdrive.OLD_VERSIONS_FOLDER_ID and
455447
not distinctions.are_distinct(google_files[idx]['id'], google_files[jdx]['id'])
456448
]
457-
print(f"Found {len(close_pairs)} close pairs that need review...")
458-
if len(close_pairs) > 0:
449+
450+
print(f"Found {len(pairs_to_review)} close pairs that need review...")
451+
if len(pairs_to_review) > 0:
459452
print("Don't forget to open your browser!")
453+
460454
done = 0
461-
for gfa, gfb, sim in close_pairs:
455+
for gfa, gfb, sim in pairs_to_review:
462456
done += 1
463-
print(f"\n---{done}/{len(close_pairs)}\n")
457+
print(f"\n---{done}/{len(pairs_to_review)}\n")
458+
464459
# handle_close_pair_decision can sometimes mark other pairs distinct
465460
# or even move other files to the graveyard, so have to recheck here
466461
if distinctions.are_distinct(gfa['id'], gfb['id']):
467462
print("Already merged as distinct :)")
468463
continue
464+
465+
# Refresh from cache to get latest parent_id
469466
gfa = gdrive.gcache.get_item(gfa['id'])
470467
gfb = gdrive.gcache.get_item(gfb['id'])
468+
469+
if not gfa or not gfb:
470+
print("One of the files is missing from cache, skipping...")
471+
continue
472+
471473
if gdrive.OLD_VERSIONS_FOLDER_ID in [gfa['parent_id'], gfb['parent_id']]:
472474
print("Already moved to Old Versions :)")
473475
continue
476+
474477
decision = gdrive.is_duplicate_prompt(gfa, gfb, similariy=sim)
475478
all_decisions.append((decision, gfa, gfb, sim))
476479
joblib.dump(all_decisions, DECISION_HISTORY_FILE, compress=2)
477480
distinctions.handle_close_pair_decision(decision, gfa, gfb)
478481

482+
if __name__ == "__main__":
483+
from clean_google_drive import remove_duplicate_files
484+
print("Cleaning up files that are MD5 identical first...")
485+
remove_duplicate_files()
486+
print("Loading the latest PDF embeddings...")
487+
load(True)
488+
print("Successfully loaded the latest PDF embeddings!")
489+
if not prompt("Would you like to review the embeddings for any duplicates?"):
490+
print("Okay then :)")
491+
exit(0)
492+
min_similarity = -1
493+
while min_similarity < 0 or min_similarity > 1:
494+
min_similarity = float(input_with_prefill("Min similarity: ", "0.93", float))
495+
print("Loading the similarity matrix...")
496+
similarity_matrix = calculate_similarity_matrix()
497+
print("Selecting close neighboring pairs...")
498+
close_pairs = find_close_pairs(similarity_matrix, min_similarity=min_similarity)
499+
del similarity_matrix
500+
501+
review_close_pairs(close_pairs)
502+
479503

scripts/test_fixtures/drive.sqlite

76 KB
Binary file not shown.

0 commit comments

Comments
 (0)