Skip to content

Commit ef5cacf

Browse files
authored
feat: impl jaccardSimilarity for improved cross-post detection (#15)
1 parent 88a9f4e commit ef5cacf

5 files changed

Lines changed: 68 additions & 5 deletions

File tree

src/events/spam-detection/constants.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ import { rules } from "./rules-config.js";
33
// Largest `timeframe` value among all configured rules whose `type` is not "contentBased".
// NOTE(review): if every rule is "contentBased" the spread is empty and Math.max() yields -Infinity.
export const MAX_RULE_TIMEFRAME = Math.max(
44
...rules.filter((rule) => rule.type !== "contentBased").map((rule) => rule.timeframe)
55
);
6+
7+
// Jaccard similarity score (0..1) that two messages must strictly exceed to be treated
// as duplicates by `isDuplicate`; a score of exactly 0.8 does not count as a duplicate.
export const MESSAGE_SIMILARITY_THRESHOLD = 0.8;

src/events/spam-detection/detectors.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import type { Message } from "discord.js";
2-
import { replaceSpoilerHack, stripCode } from "../../utils/messages.js";
2+
import { jaccardSimilarity, replaceSpoilerHack, stripCode } from "../../utils/messages.js";
3+
import { MESSAGE_SIMILARITY_THRESHOLD } from "./constants.js";
34

45
export const containsLink = (message: Message): boolean => {
56
const withoutCode = stripCode(message.content);
@@ -24,11 +25,18 @@ export const containsSpoilerHack = (message: Message) => {
2425
};
2526

2627
// Reports whether `message` repeats `oldMessage`. Both contents are lower-cased and trimmed;
// they count as duplicates when the results are identical or when their word-level Jaccard
// similarity is strictly greater than MESSAGE_SIMILARITY_THRESHOLD.
export const isDuplicate = (message: Message, oldMessage: Message) => {
27-
return message.content.toLowerCase().trim() === oldMessage.content.toLowerCase().trim();
28+
// cheaper comparison first: an exact match needs no tokenizing or set building
29+
const a = message.content.toLowerCase().trim();
30+
const b = oldMessage.content.toLowerCase().trim();
31+
if (a === b) {
32+
return true;
33+
}
34+
// followed by jaccard for catching reordered/slightly altered messages with high similarity
35+
return jaccardSimilarity(a, b) > MESSAGE_SIMILARITY_THRESHOLD;
2836
};
2937

3038
// Reports whether `message` is a duplicate of `oldMessage` posted in a different channel.
// The channel ids are compared first, so the `&&` short-circuit skips the more expensive
// `isDuplicate` check whenever both messages are in the same channel.
export const isCrossPost = (message: Message, oldMessage: Message) => {
31-
return isDuplicate(message, oldMessage) && message.channelId !== oldMessage.channelId;
39+
return message.channelId !== oldMessage.channelId && isDuplicate(message, oldMessage);
3240
};
3341

3442
// Takes no arguments and always returns true.
export const anyMessage = () => true;

src/utils/messages.test.ts

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import assert from "node:assert";
22
import { describe, it } from "node:test";
3-
import { replaceSpoilerHack, stripCode, stripEmoji } from "./messages.js";
3+
import { replaceSpoilerHack, stripCode, stripEmoji, jaccardSimilarity } from "./messages.js";
44

55
describe("utils/messages -> stripCode", () => {
66
it("should remove inline code blocks", () => {
@@ -71,3 +71,36 @@ describe("utils/messages -> replaceSpoilerHack", () => {
7171
assert.strictEqual(actual, expected);
7272
});
7373
});
74+
75+
// Exercises jaccardSimilarity with message pairs typical of cross-post spam: identical text,
// punctuation-only differences, reordered sentences, and a pair that must score below 0.8.
describe("jaccardSimilarity - crosspost detection", () => {
76+
it("catches identical self-promotion spam", () => {
77+
const msg1 = "Check out my new portfolio website! Built with React and Tailwind";
78+
const msg2 = "Check out my new portfolio website! Built with React and Tailwind";
79+
const actual = jaccardSimilarity(msg1, msg2);
80+
81+
assert.strictEqual(actual, 1);
82+
});
83+
84+
// Punctuation is stripped during normalization, so both messages yield the same word set.
it("catches copy-paste spam with minor punctuation differences", () => {
85+
const msg1 = "hey guys check out my new website!";
86+
const msg2 = "hey guys, check out my new website";
87+
const actual = jaccardSimilarity(msg1, msg2);
88+
89+
assert.strictEqual(actual, 1);
90+
});
91+
92+
// Word order does not matter because the comparison is made on sets of words.
it("catches reordered messages", () => {
93+
const msg1 = "I just launched my SaaS app! Check it out and let me know what you think";
94+
const msg2 = "Check it out and let me know what you think! I just launched my SaaS app";
95+
const actual = jaccardSimilarity(msg1, msg2);
96+
assert.strictEqual(actual, 1);
97+
});
98+
99+
it("does not flag similar but different questions", () => {
100+
const msg1 = "How do I center a div in CSS?";
101+
const msg2 = "How do I align a div to the right in CSS?";
102+
const actual = jaccardSimilarity(msg1, msg2); // 7 shared words / 12 distinct words = 0.5833333333333334
103+
104+
// The 0.8 upper bound equals MESSAGE_SIMILARITY_THRESHOLD in spam-detection/constants.ts.
assert.ok(actual > 0.5 && actual < 0.8);
105+
});
106+
});

src/utils/messages.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,23 @@ export const stripEmoji = (content: string): string => content.replace(/:\w+:/g,
2424
// Replaces every run of one or more consecutive `||\u200b||` sequences (a pair of `||`
// markers around a zero-width space, U+200B) with `replacement` (default "[...]").
// A null `messageContent` is treated as the empty string.
export function replaceSpoilerHack(messageContent: string | null, replacement = "[...]") {
2525
return (messageContent ?? "").replace(/(\|\|\u200b\|\|)+/g, replacement);
2626
}
27+
28+
// https://en.wikipedia.org/wiki/Jaccard_index
29+
/**
 * Computes the Jaccard index of the word sets of two texts: the number of
 * distinct words the texts share divided by the number of distinct words in
 * either text.
 *
 * Words are compared case-insensitively, ignoring punctuation, word order and
 * repetition (see `normalizeText`).
 *
 * @param text1 - First text to compare.
 * @param text2 - Second text to compare.
 * @returns A value in the range 0..1, where 1 means both texts contain exactly
 *   the same words. Returns 0 when neither text contains any word.
 */
export function jaccardSimilarity(text1: string, text2: string): number {
  const words1 = new Set(normalizeText(text1));
  const words2 = new Set(normalizeText(text2));

  // Count shared words by probing the larger set with the members of the
  // smaller one. This avoids Set.prototype.intersection/union, which only
  // exist in very recent runtimes, and skips allocating two extra sets.
  const [smaller, larger] = words1.size <= words2.size ? [words1, words2] : [words2, words1];
  let sharedCount = 0;
  for (const word of smaller) {
    if (larger.has(word)) {
      sharedCount += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = words1.size + words2.size - sharedCount;

  return unionSize === 0 ? 0 : sharedCount / unionSize;
}

/**
 * Splits a text into lower-cased words, dropping punctuation and symbols.
 *
 * Letters, combining marks and digits of every script are kept (not only
 * ASCII), so non-Latin messages are not reduced to their ASCII remnants.
 *
 * @param text - Raw message text.
 * @returns The words of `text` in order of appearance; empty when the text
 *   contains no word characters.
 */
function normalizeText(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, "") // Remove punctuation & symbols, keep letters/digits of any script
    .trim()
    .split(/\s+/)
    .filter(Boolean);
}

tsconfig.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"compilerOptions": {
33
"target": "ES2022",
4-
"lib": ["ES2022"],
4+
"lib": ["ESNext"],
55
"module": "nodenext",
66
"moduleResolution": "nodenext",
77
"allowSyntheticDefaultImports": true,

0 commit comments

Comments
 (0)