Skip to content

Commit b97c189

Browse files
initialize-semantic-issue-similarity-analysis-and-duplicate-detection-automation-workflow (#1175)
* Create duplicate_issue_detector.yaml * Update .github/workflows/duplicate_issue_detector.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * Code rabbit follow-up * Update duplicate_issue_detector.yaml --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent fde9a06 commit b97c189

1 file changed

Lines changed: 179 additions & 0 deletions

File tree

Lines changed: 179 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,179 @@
1+
# Workflow: compares each newly opened issue against the issues of
# AOSSIE-Org/PictoPy using sentence embeddings, then comments and labels.
name: Smart Duplicate Issue Detector (Semantic)
2+
3+
# Trigger: runs only when an issue is opened.
on:
4+
issues:
5+
types: [opened]
6+
7+
# Grants the workflow token write access to issues; the final step uses it
# to post a comment and apply a label.
permissions:
8+
issues: write
9+
10+
jobs:
11+
detect-duplicates:
12+
runs-on: ubuntu-latest
13+
14+
steps:
15+
# Python runtime for the similarity analysis step.
- name: Set up Python
16+
uses: actions/setup-python@v5
17+
with:
18+
python-version: '3.11'
19+
20+
# NOTE(review): both packages are installed without version pins, so
# separate runs may resolve different versions — consider pinning.
- name: Install dependencies
21+
run: |
22+
pip install --no-cache-dir sentence-transformers scikit-learn
23+
24+
# Writes the triggering issue and the upstream issues to issues.json,
# which the similarity step reads.
- name: Fetch upstream issues (AOSSIE-Org/PictoPy)
25+
uses: actions/github-script@v7
26+
with:
27+
script: |
28+
// Collect the issue that triggered this run together with every issue of the
// upstream repository, and write both to issues.json for the similarity step.
const fs = require('fs');

const UPSTREAM = { owner: "AOSSIE-Org", repo: "PictoPy" };
const issue = context.payload.issue;

// When this workflow runs inside the upstream repository itself, the issue
// that was just opened is part of the listing below. Comparing it with itself
// would always report a ~100% "duplicate", so it has to be left out.
// In a fork the issue numbers are unrelated, so nothing is excluded there.
const sameRepo =
  context.repo.owner.toLowerCase() === UPSTREAM.owner.toLowerCase() &&
  context.repo.repo.toLowerCase() === UPSTREAM.repo.toLowerCase();

// paginate() walks every result page; state "all" includes closed issues.
const upstreamIssues = await github.paginate(
  github.rest.issues.listForRepo,
  {
    ...UPSTREAM,
    state: "all",
    per_page: 100,
  }
);

const data = {
  current: {
    number: issue.number,
    title: issue.title,
    // The body is null when the author left it empty.
    body: issue.body || "",
  },
  others: upstreamIssues
    // Entries carrying a `pull_request` key are pull requests, not issues.
    .filter((i) => !i.pull_request)
    .filter((i) => !(sameRepo && i.number === issue.number))
    .map((i) => ({
      number: i.number,
      title: i.title,
      body: i.body || "",
      url: i.html_url,
      state: i.state,
    })),
};

fs.writeFileSync("issues.json", JSON.stringify(data));
59+
60+
# Reads issues.json, scores the upstream issues against the new issue and
# writes the matches to matches.json. The quoted 'EOF' delimiter keeps the
# shell from expanding anything inside the Python source.
- name: Run semantic similarity analysis
61+
run: |
62+
python << 'EOF'
63+
"""Rank upstream issues by semantic similarity to the newly opened issue.

Reads ``issues.json`` (``{"current": {...}, "others": [...]}``) and writes
``matches.json``: at most MAX_RESULTS upstream issues whose cosine similarity
to the current issue is at least THRESHOLD, best match first. An empty list
is written when there is nothing to compare against or nothing qualifies.
"""
import json

THRESHOLD = 0.82
MAX_RESULTS = 3


def text(issue):
    """Return the issue's title and body joined by a space, stripped."""
    return f"{issue['title']} {issue['body']}".strip()


def select_matches(others, sims, threshold=THRESHOLD, max_results=MAX_RESULTS):
    """Pick the best-scoring issues at or above ``threshold``.

    Args:
        others: issue dicts with "number", "title", "url" and "state" keys.
        sims: similarity scores, one per entry of ``others`` in the same order.
        threshold: minimum similarity (inclusive) for an issue to be reported.
        max_results: maximum number of entries returned.

    Returns:
        A list of dicts (number, title, url, state, score) sorted by score in
        descending order; ``score`` is the similarity as a percentage rounded
        to one decimal place.
    """
    matches = [
        {
            "number": issue["number"],
            "title": issue["title"],
            "url": issue["url"],
            "state": issue["state"],
            # float() turns a numpy scalar into a JSON-serialisable number.
            "score": round(float(score) * 100, 1),
        }
        for issue, score in zip(others, sims)
        if score >= threshold
    ]
    matches.sort(key=lambda m: m["score"], reverse=True)
    return matches[:max_results]


def main():
    """Load issues.json, score the upstream issues and write matches.json."""
    with open("issues.json") as f:
        data = json.load(f)

    others = data["others"]
    matches = []

    if others:
        # Imported here so the model libraries are only loaded when there is
        # at least one upstream issue to compare against.
        from sentence_transformers import SentenceTransformer
        from sklearn.metrics.pairwise import cosine_similarity

        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(
            [text(data["current"])] + [text(i) for i in others],
            normalize_embeddings=True,
        )
        # Row 0 is the new issue; the remaining rows line up with `others`.
        sims = cosine_similarity([embeddings[0]], embeddings[1:])[0]
        matches = select_matches(others, sims)

    with open("matches.json", "w") as f:
        json.dump(matches, f)


if __name__ == "__main__":
    main()
111+
EOF
112+
113+
# Reads matches.json; when it is non-empty, posts a comment listing the
# matches and applies the "possible-duplicate" label. Failed write calls are
# reported as notices by the script instead of failing the job.
- name: Comment and soft-label in fork (non-blocking)
114+
uses: actions/github-script@v7
115+
with:
116+
script: |
117+
const fs = require("fs");
118+
const matches = JSON.parse(fs.readFileSync("matches.json", "utf8"));
119+
120+
if (matches.length === 0) {
121+
core.notice("No semantic duplicates found.");
122+
return;
123+
}
124+
125+
const list = matches.map(
126+
(m, i) =>
127+
`${i + 1}. **${m.title}** (#${m.number}, ${m.state})\n` +
128+
` ${m.url}\n` +
129+
` Similarity: ${m.score}%`
130+
).join("\n\n");
131+
132+
const safe = async (fn) => {
133+
try { await fn(); }
134+
catch (e) { core.notice(`Skipped write action: ${e.message}`); }
135+
};
136+
137+
await safe(() =>
138+
github.rest.issues.createComment({
139+
owner: context.repo.owner,
140+
repo: context.repo.repo,
141+
issue_number: context.payload.issue.number,
142+
body:
143+
`⚠️ **Potential Duplicate Issue (Semantic Match)**\n\n` +
144+
`This issue appears semantically similar to the following issues in AOSSIE-Org/PictoPy:\n\n` +
145+
`${list}\n\n` +
146+
`Please review before proceeding.`
147+
})
148+
);
149+
150+
const labelName = "possible-duplicate";
151+
152+
try {
153+
await github.rest.issues.getLabel({
154+
owner: context.repo.owner,
155+
repo: context.repo.repo,
156+
name: labelName
157+
});
158+
} catch (e) {
159+
if (e.status === 404) {
160+
await safe(() =>
161+
github.rest.issues.createLabel({
162+
owner: context.repo.owner,
163+
repo: context.repo.repo,
164+
name: labelName,
165+
color: "FBCA04",
166+
description: "Potential semantic duplicate (upstream comparison)"
167+
})
168+
);
169+
}
170+
}
171+
172+
await safe(() =>
173+
github.rest.issues.addLabels({
174+
owner: context.repo.owner,
175+
repo: context.repo.repo,
176+
issue_number: context.payload.issue.number,
177+
labels: [labelName]
178+
})
179+
);

0 commit comments

Comments
 (0)