Skip to content

Commit 466f89c

Browse files
committed
refactor: index only AppStream <keyword> data as keywords, because other data (summary, description, etc.) was too messy and skewed search rankings
1 parent 2ef2e61 commit 466f89c

4 files changed

Lines changed: 131 additions & 75 deletions

File tree

ARCHITECTURE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ Six CLI subcommands fetch data from external sources, invoked by external system
5252

5353
## Search
5454

55-
FTS5 indexes: name, base, description, groups, provides (denormalized from package relations). Hyphenated queries are split into individual terms for tokenizer compatibility.
55+
FTS5 indexes: name, base, description, groups, provides, keywords (AppStream `<keyword>` inside `<keywords>` blocks whose `xml:lang` is absent or en/de). Hyphenated queries are split into individual terms for tokenizer compatibility.
5656

57-
Ranking: exact name match first, then `bm25(10,5,1,1,3) - ln(1+popularity)` — name-weighted with log-scaled popularity boost.
57+
Ranking: exact name match first, then `bm25` with higher weights on name/description than on keywords (so long keyword blobs do not bury short pacman descriptions), minus `ln(1+popularity)`.
5858

5959
## Middleware Stack
6060

internal/appstream/parse.go

Lines changed: 42 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and
33
// builds per-pkgname search text for SQLite FTS.
44
//
5-
// Parsing model: encoding/xml streams tokens (start/end/CharData); we keep one
6-
// open component in docParser.cur. stack + muteLeaf track the path so
7-
// text is attributed to the right element; name/summary with xml:lang outside
8-
// en/de are skipped. flush runs at </component> (and before the next <component>)
9-
// to emit (pkgname, parts); the caller merges duplicate pkgnames.
5+
// Parsing model: encoding/xml streams tokens; we keep one open <component> in
6+
// docParser.cur. Only <pkgname> and <keywords>/<keyword> text are read; other
7+
// elements (name, summary, description, categories) are ignored for indexing.
8+
// AppStream puts xml:lang on <keywords> blocks (not each <keyword>). Only blocks
9+
// with no lang or en/de are indexed; per-<keyword> xml:lang is also respected when present.
10+
// flush runs at </component> (and before the next <component>) to emit (pkgname, parts).
1011
package appstream
1112

1213
import (
@@ -22,8 +23,8 @@ const (
2223
)
2324

2425
// ParseComponentsXML streams the decoder and calls fn once per completed
25-
// <component> (same pkgname may appear many times). fn receives raw text
26-
// fragments in parts; dedupeWords runs in the caller after merge.
26+
// <component> (same pkgname may appear many times). parts contains only
27+
// <keyword> text; dedupeWords runs in the caller after merge.
2728
func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error {
2829
d := xml.NewDecoder(r)
2930
d.Strict = false
@@ -52,18 +53,18 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err
5253
}
5354

5455
// docParser holds decoder state between tokens.
55-
// stack/muteLeaf are parallel: element names and whether that leaf skips CharData (non-en/de name/summary).
56-
// inKeywords/inKeyword/inCats/inDesc gate text from nested sections. cur is the open <component> or nil.
56+
// keywordsBlockSkip drops a whole <keywords xml:lang="…"> block when not en/de/neutral.
57+
// keywordSkip does the same for a single <keyword> when it carries xml:lang.
58+
// cur is the open <component> or nil.
5759
type docParser struct {
58-
fn func(string, []string) error
59-
dec *xml.Decoder
60-
stack []string
61-
muteLeaf []bool
62-
inKeywords bool
63-
inKeyword bool
64-
inCats bool
65-
inDesc int
66-
cur *component
60+
fn func(string, []string) error
61+
dec *xml.Decoder
62+
stack []string
63+
inKeywords bool
64+
keywordsBlockSkip bool
65+
inKeyword bool
66+
keywordSkip bool
67+
cur *component
6768
}
6869

6970
// flush emits cur via fn and clears it. EOF calls flush for the last component.
@@ -80,23 +81,10 @@ func (p *docParser) flush() error {
8081
return p.fn(name, parts)
8182
}
8283

83-
// startElement pushes stack/muteLeaf; on <component> flushes the previous component then starts a new cur.
84+
// startElement pushes stack; on <component> flushes the previous component then starts a new cur.
8485
func (p *docParser) startElement(t xml.StartElement) error {
8586
local := t.Name.Local
8687
p.stack = append(p.stack, local)
87-
muted := false
88-
if local == "name" || local == "summary" {
89-
for _, a := range t.Attr {
90-
if a.Name.Local != "lang" || a.Value == "" {
91-
continue
92-
}
93-
if a.Value != "en" && a.Value != "de" {
94-
muted = true
95-
break
96-
}
97-
}
98-
}
99-
p.muteLeaf = append(p.muteLeaf, muted)
10088

10189
switch local {
10290
case "component":
@@ -106,56 +94,43 @@ func (p *docParser) startElement(t xml.StartElement) error {
10694
p.cur = &component{}
10795
case "keywords":
10896
p.inKeywords = true
97+
p.keywordsBlockSkip = !keywordLangAccepted(t.Attr)
10998
case elKeyword:
11099
if p.inKeywords {
111100
p.inKeyword = true
101+
p.keywordSkip = !keywordLangAccepted(t.Attr)
112102
}
113-
case "categories":
114-
p.inCats = true
115-
case "description":
116-
p.inDesc++
117103
}
118104
return nil
119105
}
120106

121-
// endElement pops stack/muteLeaf; on </component> flushes the finished component.
107+
// endElement pops stack; on </component> flushes the finished component.
122108
func (p *docParser) endElement(t xml.EndElement) error {
123109
local := t.Name.Local
124110
if len(p.stack) == 0 {
125111
return nil
126112
}
127113
p.stack = p.stack[:len(p.stack)-1]
128-
if len(p.muteLeaf) > 0 {
129-
p.muteLeaf = p.muteLeaf[:len(p.muteLeaf)-1]
130-
}
131114

132115
switch local {
133116
case "component":
134117
return p.flush()
135118
case "keywords":
136119
p.inKeywords = false
137120
p.inKeyword = false
121+
p.keywordsBlockSkip = false
138122
case elKeyword:
139123
p.inKeyword = false
140-
case "categories":
141-
p.inCats = false
142-
case "description":
143-
if p.inDesc > 0 {
144-
p.inDesc--
145-
}
124+
p.keywordSkip = false
146125
}
147126
return nil
148127
}
149128

150-
// charData routes text to pkgname or parts by parent element name (stack tip).
129+
// charData collects pkgname and <keyword> text only.
151130
func (p *docParser) charData(t xml.CharData) {
152131
if p.cur == nil {
153132
return
154133
}
155-
muted := len(p.muteLeaf) > 0 && p.muteLeaf[len(p.muteLeaf)-1]
156-
if muted {
157-
return
158-
}
159134
text := strings.TrimSpace(string(t))
160135
if text == "" {
161136
return
@@ -169,21 +144,27 @@ func (p *docParser) charData(t xml.CharData) {
169144
switch parent {
170145
case "pkgname":
171146
p.cur.pkgname += text
172-
case "name", "summary":
173-
p.cur.parts = append(p.cur.parts, text)
174-
case "category":
175-
if p.inCats {
176-
p.cur.parts = append(p.cur.parts, text)
177-
}
178147
case elKeyword:
179-
if p.inKeyword {
148+
if p.inKeyword && !p.keywordsBlockSkip && !p.keywordSkip {
180149
p.cur.parts = append(p.cur.parts, text)
181150
}
182-
case "p":
183-
if p.inDesc > 0 {
184-
p.cur.parts = append(p.cur.parts, text)
151+
}
152+
}
153+
154+
// keywordLangAccepted is used for both <keywords> and <keyword> start tags: true if
155+
// there is no xml:lang, or its primary language subtag is en/de (so region-qualified BCP 47 tags like de-DE are accepted).
156+
func keywordLangAccepted(attrs []xml.Attr) bool {
157+
for _, a := range attrs {
158+
if a.Name.Local != "lang" || a.Value == "" {
159+
continue
160+
}
161+
v := strings.ToLower(strings.TrimSpace(a.Value))
162+
if i := strings.IndexByte(v, '-'); i > 0 {
163+
v = v[:i]
185164
}
165+
return v == "en" || v == "de"
186166
}
167+
return true
187168
}
188169

189170
// component is one AppStream <component> being accumulated until flush.

internal/appstream/parse_test.go

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,26 @@
11
package appstream
22

33
import (
4+
"encoding/xml"
45
"strings"
56
"testing"
67
)
78

8-
func TestParseComponentsXML(t *testing.T) {
9+
func TestParseComponentsXML_KeywordsOnly(t *testing.T) {
910
const xml = `<?xml version="1.0"?>
1011
<components>
1112
<component type="desktop-application">
1213
<pkgname>firefox</pkgname>
1314
<name>Firefox</name>
1415
<summary>Web browser</summary>
15-
<summary xml:lang="de">Webbrowser</summary>
16-
<summary xml:lang="fr">Navigateur</summary>
1716
<description><p>Free software web browser.</p></description>
18-
<categories><category>WebBrowser</category><category>Network</category></categories>
17+
<categories><category>WebBrowser</category></categories>
1918
<keywords><keyword>internet</keyword><keyword>www</keyword></keywords>
2019
</component>
2120
<component type="desktop-application">
2221
<pkgname>firefox</pkgname>
2322
<name>Firefox ESR</name>
24-
<summary>Extended support</summary>
23+
<keywords><keyword>mozilla</keyword></keywords>
2524
</component>
2625
</components>`
2726

@@ -37,13 +36,71 @@ func TestParseComponentsXML(t *testing.T) {
3736
if got == "" {
3837
t.Fatal("expected merged keywords for firefox")
3938
}
40-
if !strings.Contains(got, "Webbrowser") {
41-
t.Errorf("expected German summary term, got %q", got)
39+
if !strings.Contains(got, "internet") || !strings.Contains(got, "www") || !strings.Contains(got, "mozilla") {
40+
t.Errorf("expected AppStream <keyword> terms only, got %q", got)
4241
}
43-
if strings.Contains(strings.ToLower(got), "navigateur") {
44-
t.Errorf("did not expect French summary, got %q", got)
42+
if strings.Contains(strings.ToLower(got), "browser") || strings.Contains(got, "WebBrowser") {
43+
t.Errorf("did not expect description/name/category text in keywords, got %q", got)
4544
}
46-
if !strings.Contains(got, "Network") || !strings.Contains(got, "internet") {
47-
t.Errorf("expected category and keyword terms, got %q", got)
45+
}
46+
47+
func TestParseComponentsXML_KeywordLangFilter(t *testing.T) {
48+
// AppStream often sets xml:lang on <keywords>, not on each <keyword>.
49+
const xml = `<?xml version="1.0"?>
50+
<components>
51+
<component>
52+
<pkgname>demo</pkgname>
53+
<keywords>
54+
<keyword>neutral</keyword>
55+
</keywords>
56+
<keywords xml:lang="en">
57+
<keyword>english</keyword>
58+
</keywords>
59+
<keywords xml:lang="de">
60+
<keyword>deutsch</keyword>
61+
</keywords>
62+
<keywords xml:lang="de-DE">
63+
<keyword>deutsch2</keyword>
64+
</keywords>
65+
<keywords xml:lang="fr">
66+
<keyword>francais</keyword>
67+
</keywords>
68+
</component>
69+
</components>`
70+
acc := make(map[string][]string)
71+
err := ParseComponentsXML(strings.NewReader(xml), func(name string, parts []string) error {
72+
acc[name] = append(acc[name], parts...)
73+
return nil
74+
})
75+
if err != nil {
76+
t.Fatal(err)
77+
}
78+
got := dedupeWords(acc["demo"])
79+
for _, need := range []string{"neutral", "english", "deutsch", "deutsch2"} {
80+
if !strings.Contains(got, need) {
81+
t.Errorf("missing %q in %q", need, got)
82+
}
83+
}
84+
if strings.Contains(got, "francais") {
85+
t.Errorf("did not want fr keyword, got %q", got)
86+
}
87+
}
88+
89+
func TestKeywordLangAccepted(t *testing.T) {
90+
tests := []struct {
91+
attrs []xml.Attr
92+
want bool
93+
}{
94+
{nil, true},
95+
{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "en"}}, true},
96+
{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "de"}}, true},
97+
{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "de-AT"}}, true},
98+
{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "fr"}}, false},
99+
{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "pl"}}, false},
100+
}
101+
for _, tt := range tests {
102+
if got := keywordLangAccepted(tt.attrs); got != tt.want {
103+
t.Errorf("keywordLangAccepted(%v) = %v, want %v", tt.attrs, got, tt.want)
104+
}
48105
}
49106
}

internal/packages/repository.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,25 @@ import (
44
"context"
55
"database/sql"
66
"errors"
7+
"fmt"
78
"strings"
89

910
fts "archded/internal/search"
1011
)
1112

13+
// The bm25* constants are the per-column FTS5 weights, in column order: name, base, description, groups, provides, keywords.
14+
// Long AppStream keyword fields increase BM25 document length; description (short pacman
15+
// text) must stay heavily weighted so queries like "browser" still rank packages that
16+
// only match strongly there.
17+
const (
18+
bm25Name = 12
19+
bm25Base = 5
20+
bm25Description = 10
21+
bm25Groups = 1
22+
bm25Provides = 3
23+
bm25Keywords = 0.5
24+
)
25+
1226
type PackageSummary struct {
1327
Repository string
1428
Architecture string
@@ -93,8 +107,12 @@ func (r *Repository) Search(ctx context.Context, search, repo, arch string, limi
93107
}
94108

95109
countQuery = `SELECT COUNT(*) ` + baseWhere
110+
bm25 := fmt.Sprintf(
111+
"bm25(package_fts, %d, %d, %d, %d, %d, %g)",
112+
bm25Name, bm25Base, bm25Description, bm25Groups, bm25Provides, bm25Keywords,
113+
)
96114
dataQuery = `SELECT r.name, r.architecture, p.name, p.version, p.description, p.build_date, p.popularity_recent, r.testing
97-
` + baseWhere + ` ORDER BY (p.name = ?) DESC, bm25(package_fts, 10, 5, 1, 1, 3, 2) - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? OFFSET ?`
115+
` + baseWhere + ` ORDER BY (p.name = ?) DESC, ` + bm25 + ` - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? OFFSET ?`
98116
dataArgs = append(dataArgs, search, limit, offset)
99117
} else {
100118
baseWhere := `FROM package p

0 commit comments

Comments
 (0)