Split "matched_questions" into "keywords" and "questions" (#24)

ButterscotchV · web-flow · commit adee9d07021c · 2025-06-30T21:32:28.000-04:00
* Split "matched_questions" into "keywords" and "questions" to separate data better

* Update code for "matched_questions" split

* Update docker images

* Update GitHub Actions workflows

* Fix missing questions field

* Filter non-letter characters (everything except ASCII letters, including digits and whitespace) from dictionary matching

* Add keywords to dictionary matching
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -32,7 +32,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        language: [ 'csharp', 'javascript' ]
+        language: [ 'csharp', 'javascript', 'python' ]
         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
         # Use only 'java' to analyze code written in Java, Kotlin or both
         # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
diff --git a/.github/workflows/frontend.yml b/.github/workflows/frontend.yml
@@ -38,7 +38,7 @@ jobs:
       - name: Use Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: 20
+          node-version: 22
 
       - name: Setup Pages
         uses: actions/configure-pages@v5
diff --git a/BingusApi/config/faq_config.json b/BingusApi/config/faq_config.json
diff --git a/BingusLib/FaqHandling/FaqConfig.cs b/BingusLib/FaqHandling/FaqConfig.cs
@@ -14,7 +14,10 @@ public record FaqConfigEntry
             [JsonPropertyName("answer")]
             public string Answer { get; set; } = "";
 
-            [JsonPropertyName("matched_questions")]
+            [JsonPropertyName("keywords")]
+            public string[] Keywords { get; set; } = [];
+
+            [JsonPropertyName("questions")]
             public string[] Questions { get; set; } = [];
 
             public FaqConfigEntry() { }
@@ -25,9 +28,14 @@ public FaqConfigEntry(string answer)
                 Answer = answer;
             }
 
-            public FaqConfigEntry(string answer, IEnumerable<string> questions)
+            public FaqConfigEntry(
+                string answer,
+                IEnumerable<string> keywords,
+                IEnumerable<string> questions
+            )
                 : this(answer)
             {
+                Keywords = keywords.ToArray();
                 Questions = questions.ToArray();
             }
         }
@@ -50,6 +58,11 @@ public FaqConfigEntry(string answer, IEnumerable<string> questions)
         {
             foreach (var entry in FaqEntries)
             {
+                foreach (var keyword in entry.Keywords)
+                {
+                    yield return (entry.Title, keyword, entry.Answer);
+                }
+
                 foreach (var question in entry.Questions)
                 {
                     yield return (entry.Title, question, entry.Answer);
diff --git a/BingusLib/FaqHandling/FaqDict.cs b/BingusLib/FaqHandling/FaqDict.cs
@@ -1,8 +1,9 @@
+using System.Text.RegularExpressions;
 using static BingusLib.FaqHandling.FaqConfig;
 
 namespace BingusLib.FaqHandling
 {
-    public class FaqDict
+    public partial class FaqDict
     {
         private readonly Dictionary<string, FaqEntry> _faqDict = [];
 
@@ -20,6 +21,16 @@ public FaqDict(IEnumerable<FaqConfigEntry> faqConfigEntries)
                     Answer = entry.Answer,
                 };
 
+                foreach (var keyword in entry.Keywords)
+                {
+                    _faqDict[CleanQuery(keyword)] = new FaqEntry()
+                    {
+                        Title = entry.Title,
+                        Question = keyword,
+                        Answer = entry.Answer,
+                    };
+                }
+
                 foreach (var question in entry.Questions)
                 {
                     _faqDict[CleanQuery(question)] = new FaqEntry()
@@ -32,11 +43,15 @@ public FaqDict(IEnumerable<FaqConfigEntry> faqConfigEntries)
             }
         }
 
-        private static string CleanQuery(string query) => query.Trim().ToLowerInvariant();
+        private static string CleanQuery(string query) =>
+            QueryFilterRegex().Replace(query.ToLowerInvariant(), "");
 
         public FaqEntry? Search(string query)
         {
             return _faqDict.TryGetValue(CleanQuery(query), out var entry) ? entry : null;
         }
+
+        [GeneratedRegex("[^A-Za-z]")]
+        private static partial Regex QueryFilterRegex();
     }
 }
diff --git a/bingus-bot/src/commands/ask.ts b/bingus-bot/src/commands/ask.ts
@@ -9,7 +9,7 @@ import { EmbedList, fetchBingus, fetchBingusData } from "../util.js";
 
 async function getFaqConfig() {
   return (await fetchBingusData()).faqs.flatMap((x) =>
-    x.matched_questions.filter((x) => x.length > 0 && x.length <= 100),
+    x.keywords.filter((x) => x.length > 0 && x.length <= 100),
   );
 }
 
diff --git a/bingus-bot/src/util.ts b/bingus-bot/src/util.ts
@@ -168,11 +168,11 @@ export class EmbedList {
 }
 
 export interface FaqConfig {
-  average_questions: boolean;
   faqs: {
     title: string;
     answer: string;
-    matched_questions: string[];
+    keywords: string[];
+    questions: string[];
   }[];
 }
 
diff --git a/bingus-python-encoder/Dockerfile b/bingus-python-encoder/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-slim
+FROM python:3.14-slim
 
 WORKDIR /usr/src/app
 
diff --git a/bingus-python-encoder/data_utils.py b/bingus-python-encoder/data_utils.py
@@ -69,7 +69,8 @@ def random_typo(str_err: StrErrer, random: Random) -> StrErrer:
 class FaqEntry(BaseModel):
     title: str | None
     answer: str
-    matched_questions: list[str]
+    keywords: list[str]
+    questions: list[str]
 
 
 class FaqConfig(BaseModel):
@@ -102,21 +103,21 @@ def iterate_answers(self):
 
     def iterate_questions(self):
         for faq in self.faqs:
-            for question in faq.matched_questions:
+            for question in faq.questions:
                 yield question
 
     def question_count(self):
-        return sum((len(faq.matched_questions) for faq in self.faqs))
+        return sum((len(faq.questions) for faq in self.faqs))
 
     def filter_short_questions(self, min_words: int):
         """
         Filters out questions shorter than min_words and removes empty entries.
         """
         for faq in self.faqs:
-            faq.matched_questions = [
-                q for q in faq.matched_questions if len(q.split()) >= min_words]
+            faq.questions = [
+                q for q in faq.questions if len(q.split()) >= min_words]
         self.faqs = [faq for faq in self.faqs if len(
-            faq.matched_questions) > 0]
+            faq.questions) > 0]
 
     def make_typos(
             self,
@@ -149,7 +150,7 @@ def make_typos(
         for faq in self.faqs:
             new_qs: list[str] = []
 
-            for question in faq.matched_questions:
+            for question in faq.questions:
                 q_min_typos = min_typos
                 q_max_typos = max_typos
                 if scale_max_per_word:
@@ -168,7 +169,7 @@ def make_typos(
                     new_qs.append(typo_q.result)
                     typo_count += num_typos
 
-            faq.matched_questions.extend(new_qs)
+            faq.questions.extend(new_qs)
             typo_entry_count += len(new_qs)
 
         return typo_entry_count, typo_count
@@ -178,7 +179,7 @@ def make_question_pairs(self) -> Dataset:
         Makes question-to-question pairs from the FAQs, where each question is paired with all
         other questions in its set (positive samples) and from other sets (negative sample).
         """
-        return make_entry_pairs([faq.matched_questions for faq in self.faqs])
+        return make_entry_pairs([faq.questions for faq in self.faqs])
 
     def make_question_answer_pairs(self) -> Dataset:
         """
@@ -188,7 +189,7 @@ def make_question_answer_pairs(self) -> Dataset:
         questions, answers, scores = [], [], []
 
         for faq in self.faqs:
-            for question in faq.matched_questions:
+            for question in faq.questions:
                 # Positive sample (correct answer)
                 questions.append(question)
                 answers.append(faq.answer)
@@ -212,7 +213,7 @@ def make_everything_pairs(self) -> Dataset:
         Makes pairs of titles, answers, and questions from the FAQs, where each set is paired with its correct
         answer (positive sample) and other incorrect answers (negative samples).
         """
-        return make_entry_pairs([[faq.title, faq.answer, *faq.matched_questions] for faq in self.faqs])
+        return make_entry_pairs([[faq.title, faq.answer, *faq.questions] for faq in self.faqs])
 
 
 def make_wiki_qa_dataset(faqs: FaqConfig, max_count: int = -1) -> Dataset:
diff --git a/bot.Dockerfile b/bot.Dockerfile
@@ -1,4 +1,4 @@
-FROM node:20-slim AS base
+FROM node:22-slim AS base
 
 COPY ./bingus-bot/ /app/bingus-bot/
 COPY ./package*.json /app/

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,10 @@ public record FaqConfigEntry`
`14`	`14`	`[JsonPropertyName("answer")]`
`15`	`15`	`public string Answer { get; set; } = "";`
`16`	`16`
`17`		`- [JsonPropertyName("matched_questions")]`
	`17`	`+ [JsonPropertyName("keywords")]`
	`18`	`+ public string[] Keywords { get; set; } = [];`
	`19`	`+`
	`20`	`+ [JsonPropertyName("questions")]`
`18`	`21`	`public string[] Questions { get; set; } = [];`
`19`	`22`
`20`	`23`	`public FaqConfigEntry() { }`
`@@ -25,9 +28,14 @@ public FaqConfigEntry(string answer)`
`25`	`28`	`Answer = answer;`
`26`	`29`	`}`
`27`	`30`
`28`		`- public FaqConfigEntry(string answer, IEnumerable<string> questions)`
	`31`	`+ public FaqConfigEntry(`
	`32`	`+ string answer,`
	`33`	`+ IEnumerable<string> keywords,`
	`34`	`+ IEnumerable<string> questions`
	`35`	`+ )`
`29`	`36`	`: this(answer)`
`30`	`37`	`{`
	`38`	`+ Keywords = keywords.ToArray();`
`31`	`39`	`Questions = questions.ToArray();`
`32`	`40`	`}`
`33`	`41`	`}`
`@@ -50,6 +58,11 @@ public FaqConfigEntry(string answer, IEnumerable<string> questions)`
`50`	`58`	`{`
`51`	`59`	`foreach (var entry in FaqEntries)`
`52`	`60`	`{`
	`61`	`+ foreach (var keyword in entry.Keywords)`
	`62`	`+ {`
	`63`	`+ yield return (entry.Title, keyword, entry.Answer);`
	`64`	`+ }`
	`65`	`+`
`53`	`66`	`foreach (var question in entry.Questions)`
`54`	`67`	`{`
`55`	`68`	`yield return (entry.Title, question, entry.Answer);`
Original file line number	Diff line number	Diff line change
`@@ -1,8 +1,9 @@`
	`1`	`+using System.Text.RegularExpressions;`
`1`	`2`	`using static BingusLib.FaqHandling.FaqConfig;`
`2`	`3`
`3`	`4`	`namespace BingusLib.FaqHandling`
`4`	`5`	`{`
`5`		`- public class FaqDict`
	`6`	`+ public partial class FaqDict`
`6`	`7`	`{`
`7`	`8`	`private readonly Dictionary<string, FaqEntry> _faqDict = [];`
`8`	`9`
`@@ -20,6 +21,16 @@ public FaqDict(IEnumerable<FaqConfigEntry> faqConfigEntries)`
`20`	`21`	`Answer = entry.Answer,`
`21`	`22`	`};`
`22`	`23`
	`24`	`+ foreach (var keyword in entry.Keywords)`
	`25`	`+ {`
	`26`	`+ _faqDict[CleanQuery(keyword)] = new FaqEntry()`
	`27`	`+ {`
	`28`	`+ Title = entry.Title,`
	`29`	`+ Question = keyword,`
	`30`	`+ Answer = entry.Answer,`
	`31`	`+ };`
	`32`	`+ }`
	`33`	`+`
`23`	`34`	`foreach (var question in entry.Questions)`
`24`	`35`	`{`
`25`	`36`	`_faqDict[CleanQuery(question)] = new FaqEntry()`
`@@ -32,11 +43,15 @@ public FaqDict(IEnumerable<FaqConfigEntry> faqConfigEntries)`
`32`	`43`	`}`
`33`	`44`	`}`
`34`	`45`
`35`		`- private static string CleanQuery(string query) => query.Trim().ToLowerInvariant();`
	`46`	`+ private static string CleanQuery(string query) =>`
	`47`	`+ QueryFilterRegex().Replace(query.ToLowerInvariant(), "");`
`36`	`48`
`37`	`49`	`public FaqEntry? Search(string query)`
`38`	`50`	`{`
`39`	`51`	`return _faqDict.TryGetValue(CleanQuery(query), out var entry) ? entry : null;`
`40`	`52`	`}`
	`53`	`+`
	`54`	`+ [GeneratedRegex("[^A-Za-z]")]`
	`55`	`+ private static partial Regex QueryFilterRegex();`
`41`	`56`	`}`
`42`	`57`	`}`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ import { EmbedList, fetchBingus, fetchBingusData } from "../util.js";`
`9`	`9`
`10`	`10`	`async function getFaqConfig() {`
`11`	`11`	`return (await fetchBingusData()).faqs.flatMap((x) =>`
`12`		`- x.matched_questions.filter((x) => x.length > 0 && x.length <= 100),`
	`12`	`+ x.keywords.filter((x) => x.length > 0 && x.length <= 100),`
`13`	`13`	`);`
`14`	`14`	`}`
`15`	`15`
Original file line number	Diff line number	Diff line change
`@@ -168,11 +168,11 @@ export class EmbedList {`
`168`	`168`	`}`
`169`	`169`
`170`	`170`	`export interface FaqConfig {`
`171`		`- average_questions: boolean;`
`172`	`171`	`faqs: {`
`173`	`172`	`title: string;`
`174`	`173`	`answer: string;`
`175`		`- matched_questions: string[];`
	`174`	`+ keywords: string[];`
	`175`	`+ questions: string[];`
`176`	`176`	`}[];`
`177`	`177`	`}`
`178`	`178`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM python:3.11-slim`
	`1`	`+FROM python:3.14-slim`
`2`	`2`
`3`	`3`	`WORKDIR /usr/src/app`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM node:20-slim AS base`
	`1`	`+FROM node:22-slim AS base`
`2`	`2`
`3`	`3`	`COPY ./bingus-bot/ /app/bingus-bot/`
`4`	`4`	`COPY ./package*.json /app/`