-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrepo_sim.py
More file actions
88 lines (73 loc) · 2.27 KB
/
repo_sim.py
File metadata and controls
88 lines (73 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import argparse
import os
import pickle
from itertools import combinations
from pathlib import Path
import pandas as pd
import torch
from torch.nn.functional import cosine_similarity
from transformers import pipeline
def cossim(l1: list, l2: list):
    """Return the cosine similarity between two embedding vectors.

    Either argument may be None (a repo with no extracted embeddings), in
    which case None is returned instead of a score.
    """
    if l1 is None or l2 is None:
        return None
    vec_a = torch.tensor(l1, dtype=torch.float32)
    vec_b = torch.tensor(l2, dtype=torch.float32)
    # Compare the two 1-D vectors directly along their only axis.
    return cosine_similarity(vec_a, vec_b, dim=0).item()
def main():
    """CLI entry point: embed the given repositories with the RepoSim
    pipeline, pickle the raw output, and optionally write a CSV of pairwise
    cosine similarities between all repository combinations.

    Reads the GITHUB_TOKEN environment variable (if set) to authenticate
    API calls made by the pipeline.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-i",
        "--input",
        nargs="+",
        help="Input repositories",
        required=True,
    )
    parser.add_argument("-o", "--output", help="Output directory", required=True)
    parser.add_argument(
        "-e",
        "--eval",
        help="Evaluate cosine similarities between all repository combinations",
        action="store_true",
    )
    args = parser.parse_args()

    output_dir = Path(args.output)
    # parents=True: a nested output path (e.g. "results/run1") previously
    # raised FileNotFoundError because intermediate dirs were not created.
    output_dir.mkdir(parents=True, exist_ok=True)

    model = pipeline(
        model="Lazyhope/RepoSim",
        trust_remote_code=True,
        device_map="auto",
        github_token=os.environ.get("GITHUB_TOKEN"),
    )

    REPOS = args.input
    # The pipeline returns one info dict per repository (name, topics,
    # mean code/doc embeddings, ...).
    output = model(tuple(REPOS))
    with open(output_dir / "output.pkl", "wb") as f:
        pickle.dump(output, f)

    if not args.eval:
        return
    if len(REPOS) < 2:
        print("[-] At least 2 repositories are required for evaluation.")
        return

    # Evaluation: score every unordered pair of repositories.
    rows_list = []
    for info1, info2 in combinations(output, 2):
        rows_list.append(
            {
                "repo1": info1["name"],
                "repo2": info2["name"],
                "topics1": info1["topics"],
                "topics2": info2["topics"],
                "code_sim": cossim(
                    info1["mean_code_embedding"], info2["mean_code_embedding"]
                ),
                "doc_sim": cossim(
                    info1["mean_doc_embedding"], info2["mean_doc_embedding"]
                ),
            }
        )
    df = pd.DataFrame(rows_list)
    df.to_csv(output_dir / "eval_res.csv", index=False)
    print(f"[+] Evaluation results saved to {output_dir}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()