Skip to content

Commit 2ef2e61

Browse files
committed
feat: account for common words not needed for FTS
1 parent 6dd96e0 commit 2ef2e61

3 files changed

Lines changed: 60 additions & 1 deletion

File tree

internal/appstream/parse.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,13 +192,17 @@ type component struct {
192192
parts []string
193193
}
194194

195-
// dedupeWords joins fragments and drops duplicate tokens (case-insensitive) for FTS.
195+
// dedupeWords joins fragments, removes English/German stop words, and deduplicates
196+
// tokens (case-insensitive) for FTS.
196197
func dedupeWords(parts []string) string {
197198
seen := make(map[string]struct{})
198199
var b strings.Builder
199200
for _, part := range parts {
200201
for _, w := range strings.Fields(part) {
201202
key := strings.ToLower(w)
203+
if _, ok := stopword[key]; ok {
204+
continue
205+
}
202206
if _, ok := seen[key]; ok {
203207
continue
204208
}

internal/appstream/stopwords.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package appstream
2+
3+
// stopword is a small English + German closed-class word set (articles,
// conjunctions, common prepositions, auxiliaries, pronouns). It trims noise for
// FTS without pulling in NLP dependencies. Extend deliberately: short words like
// "go" or "c" are omitted because they double as names.
//
// Keys are lowercase; callers are expected to lowercase a token before the
// lookup. The set is built at declaration time rather than in an init function,
// so it is already populated if another package-level initializer reads it.
//
// NOTE(review): this is a single set with no notion of the text's language, so
// German entries such as "war", "hat", "man", "die" or "bin" presumably also
// match English text (and vice versa) — confirm that is acceptable for search.
var stopword = newStopwordSet()

// newStopwordSet returns the stop-word lookup set. Words that occur in both
// language lists (for example "am", "an", "in", "so", "was", "will") are listed
// under each language so either list can be read on its own; the map collapses
// them into one entry.
func newStopwordSet() map[string]struct{} {
	words := []string{
		// English
		"a", "about", "after", "again", "all", "am", "an", "and", "any", "are", "as", "at",
		"be", "been", "before", "being", "between", "both", "but", "by",
		"can", "could",
		"did", "do", "does", "doing", "done", "during",
		"each", "few", "for", "from", "further",
		"had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
		"i", "if", "in", "into", "is", "it", "its", "itself",
		"just",
		"me", "more", "most", "my", "myself",
		"no", "nor", "not",
		"of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own",
		"same", "she", "should", "so", "some", "such",
		"than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too",
		"under", "until", "up",
		"very",
		"was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would",
		"you", "your", "yours", "yourself", "yourselves",
		// German
		"als", "am", "an", "auch", "auf", "aus",
		"bei", "bin", "bis", "bist",
		"da", "das", "dass", "dein", "deine", "dem", "den", "der", "des", "dich", "die", "dir", "doch", "du", "durch",
		"ein", "eine", "einem", "einen", "einer", "eines", "er", "es", "euch", "euer", "eure",
		"für",
		"hab", "habe", "haben", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "hier",
		"ich", "ihm", "ihn", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "ist",
		"ja", "jede", "jedem", "jeden", "jeder", "jedes",
		"kann", "kannst", "können", "könnt",
		"machen", "man", "mein", "meine", "mich", "mir", "mit", "muss", "musst",
		"nach", "nicht", "noch", "nun", "nur",
		"ob", "oder", "ohne",
		"seid", "sein", "seine", "seinem", "seinen", "seiner", "seines", "sich", "sie", "sind", "so", "soll", "sollen", "sollst", "sollt", "sonst", "sowie",
		"um", "und", "uns", "unser", "unsere", "unter",
		"vom", "von", "vor",
		"war", "waren", "warst", "wart", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "wer", "werde", "werden", "werdet", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wohin", "wollen", "wollt", "würde", "würden",
		"zu", "zum", "zur",
		"über",
	}
	// len(words) slightly over-estimates the final size because of the
	// cross-language duplicates; that only affects the capacity hint.
	set := make(map[string]struct{}, len(words))
	for _, w := range words {
		set[w] = struct{}{}
	}
	return set
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package appstream
2+
3+
import "testing"
4+
5+
func TestDedupeWords_Stopwords(t *testing.T) {
6+
got := dedupeWords([]string{"The cat and the dog in a box"})
7+
want := "cat dog box"
8+
if got != want {
9+
t.Fatalf("got %q want %q", got, want)
10+
}
11+
}
12+
13+
func TestDedupeWords_GermanStopwords(t *testing.T) {
14+
got := dedupeWords([]string{"der schnelle braune Fuchs"})
15+
want := "schnelle braune Fuchs"
16+
if got != want {
17+
t.Fatalf("got %q want %q", got, want)
18+
}
19+
}

0 commit comments

Comments
 (0)