/**
 * Tells whether `message` repeats `oldMessage`.
 *
 * Both contents are lower-cased and trimmed. An exact match short-circuits;
 * otherwise the pair counts as a duplicate when their Jaccard similarity is
 * strictly greater than MESSAGE_SIMILARITY_THRESHOLD, which catches reordered
 * or slightly altered copies.
 */
export const isDuplicate = (message: Message, oldMessage: Message) => {
  const current = message.content.toLowerCase().trim();
  const previous = oldMessage.content.toLowerCase().trim();

  // `||` keeps the cheap equality check first; the similarity score is only computed on a mismatch
  return current === previous || jaccardSimilarity(current, previous) > MESSAGE_SIMILARITY_THRESHOLD;
};

/**
 * Tells whether `message` is a duplicate of `oldMessage` posted in a different channel.
 */
export const isCrossPost = (message: Message, oldMessage: Message) => {
  // Same channel can never be a cross-post, so skip the content comparison entirely
  if (message.channelId === oldMessage.channelId) {
    return false;
  }
  return isDuplicate(message, oldMessage);
};

/** Predicate that accepts every message. */
export const anyMessage = () => true;
b/src/utils/messages.test.ts index f3b162f..86edecf 100644 --- a/src/utils/messages.test.ts +++ b/src/utils/messages.test.ts @@ -1,6 +1,6 @@ import assert from "node:assert"; import { describe, it } from "node:test"; -import { replaceSpoilerHack, stripCode, stripEmoji } from "./messages.js"; +import { replaceSpoilerHack, stripCode, stripEmoji, jaccardSimilarity } from "./messages.js"; describe("utils/messages -> stripCode", () => { it("should remove inline code blocks", () => { @@ -71,3 +71,36 @@ describe("utils/messages -> replaceSpoilerHack", () => { assert.strictEqual(actual, expected); }); }); + +describe("jaccardSimilarity - crosspost detection", () => { + it("catches identical self-promotion spam", () => { + const msg1 = "Check out my new portfolio website! Built with React and Tailwind"; + const msg2 = "Check out my new portfolio website! Built with React and Tailwind"; + const actual = jaccardSimilarity(msg1, msg2); + + assert.strictEqual(actual, 1); + }); + + it("catches copy-paste spam with minor punctuation differences", () => { + const msg1 = "hey guys check out my new website!"; + const msg2 = "hey guys, check out my new website"; + const actual = jaccardSimilarity(msg1, msg2); + + assert.strictEqual(actual, 1); + }); + + it("catches reordered messages", () => { + const msg1 = "I just launched my SaaS app! Check it out and let me know what you think"; + const msg2 = "Check it out and let me know what you think! 
/**
 * Computes the Jaccard index of the word sets of two texts:
 * |A ∩ B| / |A ∪ B|.
 *
 * The texts are compared as sets of normalized words, so word order, repeated
 * words, letter case and punctuation do not influence the score.
 *
 * @see https://en.wikipedia.org/wiki/Jaccard_index
 * @param text1 - first text to compare
 * @param text2 - second text to compare
 * @returns a value in [0, 1]; 1 when both texts contain the same words,
 *          0 when they share none or when neither text contains any word
 */
export function jaccardSimilarity(text1: string, text2: string): number {
  const words1 = new Set(normalizeText(text1));
  const words2 = new Set(normalizeText(text2));

  // Counted by hand rather than via Set.prototype.intersection/union (ES2025),
  // so this also works on runtimes that do not ship those methods.
  let shared = 0;
  for (const word of words1) {
    if (words2.has(word)) {
      shared++;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = words1.size + words2.size - shared;

  return unionSize === 0 ? 0 : shared / unionSize;
}

/**
 * Splits a text into lower-cased words with punctuation and symbols removed.
 *
 * @param text - raw text
 * @returns the words of `text`, in order; empty when the text has no word characters
 */
function normalizeText(text: string): string[] {
  return (
    text
      .toLowerCase()
      // Keep letters, combining marks and digits of every script (plus "_", as `\w` did).
      // An ASCII-only `\w` would delete all non-Latin words and make unrelated messages look alike.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, "")
      .trim()
      .split(/\s+/)
      .filter(Boolean)
  );
}