From 5e210b732d88cd81a6c48287cf200263f5164974 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 15:24:22 +0200 Subject: [PATCH 01/18] feat: initial implementation for appstream data fetcher in Golang --- internal/appstream/parse.go | 186 ++++++++++++++++++ internal/appstream/parse_test.go | 49 +++++ internal/appstream/update.go | 161 +++++++++++++++ internal/config/config.go | 18 +- .../000004_package_keywords.down.sql | 10 + .../migrations/000004_package_keywords.up.sql | 10 + internal/packagedetail/handler_test.go | 4 +- internal/packages/repository.go | 2 +- internal/packages/repository_test.go | 4 +- main.go | 6 + 10 files changed, 437 insertions(+), 13 deletions(-) create mode 100644 internal/appstream/parse.go create mode 100644 internal/appstream/parse_test.go create mode 100644 internal/appstream/update.go create mode 100644 internal/database/migrations/000004_package_keywords.down.sql create mode 100644 internal/database/migrations/000004_package_keywords.up.sql diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go new file mode 100644 index 00000000..8811d61a --- /dev/null +++ b/internal/appstream/parse.go @@ -0,0 +1,186 @@ +// Package appstream parses Arch Linux AppStream component XML (from +// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and +// builds per-pkgname search text for SQLite FTS. +package appstream + +import ( + "encoding/xml" + "io" + "strings" +) + +// ParseComponentsXML reads a Components-*.xml stream and calls fn for each +// , as soon as the element is complete. Multiple components with the +// same produce multiple invocations; the caller merges by name. This +// matches the streaming style of pacmandb.Parse: only one component is held in +// memory at a time. 
+func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error { + d := xml.NewDecoder(r) + d.Strict = false + + var ( + stack []string + muteLeaf []bool + inKeywords bool + inKeyword bool + inCats bool + inDesc int + ) + var cur *component + + flush := func() error { + if cur == nil { + return nil + } + name := strings.TrimSpace(cur.pkgname) + parts := append([]string(nil), cur.parts...) + cur = nil + if name == "" { + return nil + } + return fn(name, parts) + } + + for { + tok, err := d.Token() + if err == io.EOF { + if err := flush(); err != nil { + return err + } + break + } + if err != nil { + return err + } + + switch t := tok.(type) { + case xml.StartElement: + local := t.Name.Local + stack = append(stack, local) + muted := false + if local == "name" || local == "summary" { + for _, a := range t.Attr { + if a.Name.Local != "lang" || a.Value == "" { + continue + } + if a.Value != "en" && a.Value != "de" { + muted = true + break + } + } + } + muteLeaf = append(muteLeaf, muted) + + switch local { + case "component": + if err := flush(); err != nil { + return err + } + cur = &component{} + case "keywords": + inKeywords = true + case "keyword": + if inKeywords { + inKeyword = true + } + case "categories": + inCats = true + case "description": + inDesc++ + case "p": + // paragraph inside description + } + + case xml.EndElement: + local := t.Name.Local + if len(stack) == 0 { + continue + } + stack = stack[:len(stack)-1] + if len(muteLeaf) > 0 { + muteLeaf = muteLeaf[:len(muteLeaf)-1] + } + + switch local { + case "component": + if err := flush(); err != nil { + return err + } + case "keywords": + inKeywords = false + inKeyword = false + case "keyword": + inKeyword = false + case "categories": + inCats = false + case "description": + if inDesc > 0 { + inDesc-- + } + } + + case xml.CharData: + if cur == nil { + continue + } + muted := len(muteLeaf) > 0 && muteLeaf[len(muteLeaf)-1] + if muted { + continue + } + text := 
strings.TrimSpace(string(t)) + if text == "" { + continue + } + + parent := "" + if len(stack) > 0 { + parent = stack[len(stack)-1] + } + + switch parent { + case "pkgname": + cur.pkgname += text + case "name", "summary": + cur.parts = append(cur.parts, text) + case "category": + if inCats { + cur.parts = append(cur.parts, text) + } + case "keyword": + if inKeyword { + cur.parts = append(cur.parts, text) + } + case "p": + if inDesc > 0 { + cur.parts = append(cur.parts, text) + } + } + } + } + + return nil +} + +type component struct { + pkgname string + parts []string +} + +func dedupeWords(parts []string) string { + seen := make(map[string]struct{}) + var b strings.Builder + for _, p := range parts { + for _, w := range strings.Fields(p) { + key := strings.ToLower(w) + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + if b.Len() > 0 { + b.WriteByte(' ') + } + b.WriteString(w) + } + } + return b.String() +} diff --git a/internal/appstream/parse_test.go b/internal/appstream/parse_test.go new file mode 100644 index 00000000..5dc67f0b --- /dev/null +++ b/internal/appstream/parse_test.go @@ -0,0 +1,49 @@ +package appstream + +import ( + "strings" + "testing" +) + +func TestParseComponentsXML(t *testing.T) { + const xml = ` + + + firefox + Firefox + Web browser + Webbrowser + Navigateur +

Free software web browser.

+ WebBrowserNetwork + internetwww +
+ + firefox + Firefox ESR + Extended support + +
` + + acc := make(map[string][]string) + err := ParseComponentsXML(strings.NewReader(xml), func(name string, parts []string) error { + acc[name] = append(acc[name], parts...) + return nil + }) + if err != nil { + t.Fatal(err) + } + got := dedupeWords(acc["firefox"]) + if got == "" { + t.Fatal("expected merged keywords for firefox") + } + if !strings.Contains(got, "Webbrowser") { + t.Errorf("expected German summary term, got %q", got) + } + if strings.Contains(strings.ToLower(got), "navigateur") { + t.Errorf("did not expect French summary, got %q", got) + } + if !strings.Contains(got, "Network") || !strings.Contains(got, "internet") { + t.Errorf("expected category and keyword terms, got %q", got) + } +} diff --git a/internal/appstream/update.go b/internal/appstream/update.go new file mode 100644 index 00000000..79f36b51 --- /dev/null +++ b/internal/appstream/update.go @@ -0,0 +1,161 @@ +package appstream + +import ( + "compress/gzip" + "context" + "database/sql" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "strings" + "time" +) + +// DefaultSourcesBase is the directory listing published by Arch that contains +// versioned snapshots (e.g. …/20260326/{core,extra,multilib}/Components-x86_64.xml.gz). +const DefaultSourcesBase = "https://sources.archlinux.org/other/packages/archlinux-appstream-data/" + +const archlinuxPackageJSON = "https://archlinux.org/packages/extra/any/archlinux-appstream-data/json/" + +var componentRepos = []string{"core", "extra", "multilib"} + +type pkgJSON struct { + Pkgver string `json:"pkgver"` +} + +// LatestRelease returns the snapshot directory name (e.g. "20260326") matching +// the current extra/any archlinux-appstream-data package in the official repos. 
+func LatestRelease(ctx context.Context, client *http.Client) (string, error) { + if client == nil { + client = &http.Client{Timeout: 2 * time.Minute} + } + req, err := http.NewRequestWithContext(ctx, http.MethodGet, archlinuxPackageJSON, nil) + if err != nil { + return "", err + } + req.Header.Set("User-Agent", "archded/1.0 (+https://www.archlinux.de)") + + resp, err := client.Do(req) + if err != nil { + return "", fmt.Errorf("fetch package json: %w", err) + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("fetch package json: status %d", resp.StatusCode) + } + var p pkgJSON + if err := json.NewDecoder(resp.Body).Decode(&p); err != nil { + return "", fmt.Errorf("decode package json: %w", err) + } + if p.Pkgver == "" { + return "", fmt.Errorf("empty pkgver in package json") + } + return p.Pkgver, nil +} + +// Update downloads AppStream component XML for core, extra, and multilib from +// sourcesBase, merges keywords by package name, writes the keywords column, +// and rebuilds the FTS index. +func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase string) error { + if client == nil { + client = &http.Client{Timeout: 15 * time.Minute} + } + sourcesBase = strings.TrimSuffix(sourcesBase, "/") + "/" + + version, err := LatestRelease(ctx, client) + if err != nil { + return err + } + slog.Info("appstream snapshot", "version", version) + + acc := make(map[string][]string) + for _, repo := range componentRepos { + var components int + err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, parts []string) error { + components++ + acc[name] = append(acc[name], parts...) 
+ return nil + }) + if err != nil { + return fmt.Errorf("repo %s: %w", repo, err) + } + slog.Info("appstream components parsed", "repo", repo, "components", components) + } + + merged := make(map[string]string, len(acc)) + for name, parts := range acc { + merged[name] = dedupeWords(parts) + } + + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return err + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.ExecContext(ctx, `UPDATE package SET keywords = ''`); err != nil { + return fmt.Errorf("clear keywords: %w", err) + } + + stmt, err := tx.PrepareContext(ctx, `UPDATE package SET keywords = ? WHERE name = ?`) + if err != nil { + return fmt.Errorf("prepare keyword update: %w", err) + } + defer func() { _ = stmt.Close() }() + + var updated int64 + for name, kw := range merged { + if kw == "" { + continue + } + res, err := stmt.ExecContext(ctx, kw, name) + if err != nil { + return fmt.Errorf("update keywords for %q: %w", name, err) + } + n, err := res.RowsAffected() + if err != nil { + return err + } + updated += n + } + + if err := tx.Commit(); err != nil { + return err + } + + slog.Info("appstream keywords applied", "distinct_names", len(merged), "package_rows", updated) + + if _, err := db.ExecContext(ctx, `INSERT INTO package_fts(package_fts) VALUES('rebuild')`); err != nil { + return fmt.Errorf("rebuild fts: %w", err) + } + + return nil +} + +func fetchRepoComponents(ctx context.Context, client *http.Client, base, version, repo string, fn func(string, []string) error) error { + u := fmt.Sprintf("%s%s/%s/Components-x86_64.xml.gz", base, version, repo) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return err + } + req.Header.Set("User-Agent", "archded/1.0 (+https://www.archlinux.de)") + + resp, err := client.Do(req) + if err != nil { + return err + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("GET %s: status %d", u, resp.StatusCode) + } + + gz, err 
:= gzip.NewReader(resp.Body) + if err != nil { + return fmt.Errorf("gzip %s: %w", u, err) + } + defer func() { _ = gz.Close() }() + + return ParseComponentsXML(io.Reader(gz), fn) +} diff --git a/internal/config/config.go b/internal/config/config.go index 401cb44d..836ec993 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -6,18 +6,20 @@ import ( ) type Config struct { - Database string - Port string - PackagesMirror string - DefaultMirror string + Database string + Port string + PackagesMirror string + DefaultMirror string + AppStreamSourcesBase string } func Load() (Config, error) { cfg := Config{ - Database: getEnv("DATABASE", ""), - Port: getEnv("PORT", "8080"), - PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), - DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + Database: getEnv("DATABASE", ""), + Port: getEnv("PORT", "8080"), + PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), + DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + AppStreamSourcesBase: getEnv("APPSTREAM_SOURCES_BASE", "https://sources.archlinux.org/other/packages/archlinux-appstream-data/"), } if cfg.Database == "" { diff --git a/internal/database/migrations/000004_package_keywords.down.sql b/internal/database/migrations/000004_package_keywords.down.sql new file mode 100644 index 00000000..4479aae3 --- /dev/null +++ b/internal/database/migrations/000004_package_keywords.down.sql @@ -0,0 +1,10 @@ +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); + +ALTER TABLE package DROP COLUMN keywords; diff --git a/internal/database/migrations/000004_package_keywords.up.sql b/internal/database/migrations/000004_package_keywords.up.sql new file mode 100644 index 00000000..d030841e --- /dev/null +++ 
b/internal/database/migrations/000004_package_keywords.up.sql @@ -0,0 +1,10 @@ +ALTER TABLE package ADD COLUMN keywords TEXT NOT NULL DEFAULT ''; + +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, keywords, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); diff --git a/internal/packagedetail/handler_test.go b/internal/packagedetail/handler_test.go index 82781896..9ebd9369 100644 --- a/internal/packagedetail/handler_test.go +++ b/internal/packagedetail/handler_test.go @@ -44,8 +44,8 @@ func setupHandlerDB(t *testing.T) *sql.DB { usr/share/bash/bash_completion')`, // Populate FTS - `INSERT INTO package_fts (rowid, name, base, description, groups, provides) - SELECT id, name, base, description, groups, provides FROM package`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords) + SELECT id, name, base, description, groups, provides, keywords FROM package`, } { if _, err := db.Exec(stmt); err != nil { t.Fatalf("setup: %v", err) diff --git a/internal/packages/repository.go b/internal/packages/repository.go index 825ba699..1204e15e 100644 --- a/internal/packages/repository.go +++ b/internal/packages/repository.go @@ -94,7 +94,7 @@ func (r *Repository) Search(ctx context.Context, search, repo, arch string, limi countQuery = `SELECT COUNT(*) ` + baseWhere dataQuery = `SELECT r.name, r.architecture, p.name, p.version, p.description, p.build_date, p.popularity_recent, r.testing - ` + baseWhere + ` ORDER BY (p.name = ?) DESC, bm25(package_fts, 10, 5, 1, 1, 3) - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? OFFSET ?` + ` + baseWhere + ` ORDER BY (p.name = ?) DESC, bm25(package_fts, 10, 5, 1, 1, 3, 2) - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? 
OFFSET ?` dataArgs = append(dataArgs, search, limit, offset) } else { baseWhere := `FROM package p diff --git a/internal/packages/repository_test.go b/internal/packages/repository_test.go index a0041923..f37209ff 100644 --- a/internal/packages/repository_test.go +++ b/internal/packages/repository_test.go @@ -31,8 +31,8 @@ func setupTestDB(t *testing.T) *sql.DB { (4, 3, 'linux', 'linux', '6.7-rc1', 'The Linux kernel (testing)', 1700400000, 'Jan', 0.0)`, // Populate FTS - `INSERT INTO package_fts (rowid, name, base, description, groups, provides) - SELECT id, name, base, description, groups, provides FROM package`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords) + SELECT id, name, base, description, groups, provides, keywords FROM package`, } { if _, err := db.Exec(stmt); err != nil { t.Fatalf("setup: %s...: %v", stmt[:40], err) diff --git a/main.go b/main.go index aed50dd9..c577024e 100644 --- a/main.go +++ b/main.go @@ -7,6 +7,7 @@ import ( "os" "time" + "archded/internal/appstream" "archded/internal/config" "archded/internal/database" "archded/internal/legacy" @@ -85,6 +86,11 @@ func runCommand(cmd string, cfg config.Config) int { slog.Error("update-mirror-popularities failed", "error", err) return 1 } + case "update-appstream": + if err := appstream.Update(ctx, db, nil, cfg.AppStreamSourcesBase); err != nil { + slog.Error("update-appstream failed", "error", err) + return 1 + } default: slog.Error("unknown command", "command", cmd) //nolint:gosec // cmd is from os.Args return 1 From 601206f6ed3359bd9b2859738bc3bc8fdbd022c7 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 15:27:44 +0200 Subject: [PATCH 02/18] fix errors --- internal/appstream/parse.go | 262 ++++++++++++++++++----------------- internal/appstream/update.go | 12 +- 2 files changed, 147 insertions(+), 127 deletions(-) diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index 8811d61a..8f2f58d9 100644 --- 
a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -5,10 +5,16 @@ package appstream import ( "encoding/xml" + "errors" "io" "strings" ) +// XML element names referenced more than once in the decoder. +const ( + elKeyword = "keyword" +) + // ParseComponentsXML reads a Components-*.xml stream and calls fn for each // , as soon as the element is complete. Multiple components with the // same produce multiple invocations; the caller merges by name. This @@ -17,150 +23,158 @@ import ( func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error { d := xml.NewDecoder(r) d.Strict = false - - var ( - stack []string - muteLeaf []bool - inKeywords bool - inKeyword bool - inCats bool - inDesc int - ) - var cur *component - - flush := func() error { - if cur == nil { - return nil - } - name := strings.TrimSpace(cur.pkgname) - parts := append([]string(nil), cur.parts...) - cur = nil - if name == "" { - return nil - } - return fn(name, parts) - } - + p := &docParser{fn: fn, dec: d} for { - tok, err := d.Token() - if err == io.EOF { - if err := flush(); err != nil { - return err - } - break + tok, err := p.dec.Token() + if errors.Is(err, io.EOF) { + return p.flush() } if err != nil { return err } - switch t := tok.(type) { case xml.StartElement: - local := t.Name.Local - stack = append(stack, local) - muted := false - if local == "name" || local == "summary" { - for _, a := range t.Attr { - if a.Name.Local != "lang" || a.Value == "" { - continue - } - if a.Value != "en" && a.Value != "de" { - muted = true - break - } - } - } - muteLeaf = append(muteLeaf, muted) - - switch local { - case "component": - if err := flush(); err != nil { - return err - } - cur = &component{} - case "keywords": - inKeywords = true - case "keyword": - if inKeywords { - inKeyword = true - } - case "categories": - inCats = true - case "description": - inDesc++ - case "p": - // paragraph inside description + if err := p.startElement(t); err != nil { + return err 
} - case xml.EndElement: - local := t.Name.Local - if len(stack) == 0 { - continue - } - stack = stack[:len(stack)-1] - if len(muteLeaf) > 0 { - muteLeaf = muteLeaf[:len(muteLeaf)-1] + if err := p.endElement(t); err != nil { + return err } + case xml.CharData: + p.charData(t) + } + } +} - switch local { - case "component": - if err := flush(); err != nil { - return err - } - case "keywords": - inKeywords = false - inKeyword = false - case "keyword": - inKeyword = false - case "categories": - inCats = false - case "description": - if inDesc > 0 { - inDesc-- - } - } +type docParser struct { + fn func(string, []string) error + dec *xml.Decoder + stack []string + muteLeaf []bool + inKeywords bool + inKeyword bool + inCats bool + inDesc int + cur *component +} - case xml.CharData: - if cur == nil { - continue - } - muted := len(muteLeaf) > 0 && muteLeaf[len(muteLeaf)-1] - if muted { - continue - } - text := strings.TrimSpace(string(t)) - if text == "" { +func (p *docParser) flush() error { + if p.cur == nil { + return nil + } + name := strings.TrimSpace(p.cur.pkgname) + parts := append([]string(nil), p.cur.parts...) 
+ p.cur = nil + if name == "" { + return nil + } + return p.fn(name, parts) +} + +func (p *docParser) startElement(t xml.StartElement) error { + local := t.Name.Local + p.stack = append(p.stack, local) + muted := false + if local == "name" || local == "summary" { + for _, a := range t.Attr { + if a.Name.Local != "lang" || a.Value == "" { continue } - - parent := "" - if len(stack) > 0 { - parent = stack[len(stack)-1] + if a.Value != "en" && a.Value != "de" { + muted = true + break } + } + } + p.muteLeaf = append(p.muteLeaf, muted) - switch parent { - case "pkgname": - cur.pkgname += text - case "name", "summary": - cur.parts = append(cur.parts, text) - case "category": - if inCats { - cur.parts = append(cur.parts, text) - } - case "keyword": - if inKeyword { - cur.parts = append(cur.parts, text) - } - case "p": - if inDesc > 0 { - cur.parts = append(cur.parts, text) - } - } + switch local { + case "component": + if err := p.flush(); err != nil { + return err + } + p.cur = &component{} + case "keywords": + p.inKeywords = true + case elKeyword: + if p.inKeywords { + p.inKeyword = true } + case "categories": + p.inCats = true + case "description": + p.inDesc++ + } + return nil +} + +func (p *docParser) endElement(t xml.EndElement) error { + local := t.Name.Local + if len(p.stack) == 0 { + return nil + } + p.stack = p.stack[:len(p.stack)-1] + if len(p.muteLeaf) > 0 { + p.muteLeaf = p.muteLeaf[:len(p.muteLeaf)-1] } + switch local { + case "component": + return p.flush() + case "keywords": + p.inKeywords = false + p.inKeyword = false + case elKeyword: + p.inKeyword = false + case "categories": + p.inCats = false + case "description": + if p.inDesc > 0 { + p.inDesc-- + } + } return nil } +func (p *docParser) charData(t xml.CharData) { + if p.cur == nil { + return + } + muted := len(p.muteLeaf) > 0 && p.muteLeaf[len(p.muteLeaf)-1] + if muted { + return + } + text := strings.TrimSpace(string(t)) + if text == "" { + return + } + + parent := "" + if len(p.stack) > 0 { + 
parent = p.stack[len(p.stack)-1] + } + + switch parent { + case "pkgname": + p.cur.pkgname += text + case "name", "summary": + p.cur.parts = append(p.cur.parts, text) + case "category": + if p.inCats { + p.cur.parts = append(p.cur.parts, text) + } + case elKeyword: + if p.inKeyword { + p.cur.parts = append(p.cur.parts, text) + } + case "p": + if p.inDesc > 0 { + p.cur.parts = append(p.cur.parts, text) + } + } +} + type component struct { pkgname string parts []string @@ -169,8 +183,8 @@ type component struct { func dedupeWords(parts []string) string { seen := make(map[string]struct{}) var b strings.Builder - for _, p := range parts { - for _, w := range strings.Fields(p) { + for _, part := range parts { + for _, w := range strings.Fields(part) { key := strings.ToLower(w) if _, ok := seen[key]; ok { continue diff --git a/internal/appstream/update.go b/internal/appstream/update.go index 79f36b51..36643ceb 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -5,6 +5,7 @@ import ( "context" "database/sql" "encoding/json" + "errors" "fmt" "io" "log/slog" @@ -19,6 +20,11 @@ const DefaultSourcesBase = "https://sources.archlinux.org/other/packages/archlin const archlinuxPackageJSON = "https://archlinux.org/packages/extra/any/archlinux-appstream-data/json/" +const ( + httpClientTimeoutRelease = 2 * time.Minute + httpClientTimeoutUpdate = 15 * time.Minute +) + var componentRepos = []string{"core", "extra", "multilib"} type pkgJSON struct { @@ -29,7 +35,7 @@ type pkgJSON struct { // the current extra/any archlinux-appstream-data package in the official repos. 
func LatestRelease(ctx context.Context, client *http.Client) (string, error) { if client == nil { - client = &http.Client{Timeout: 2 * time.Minute} + client = &http.Client{Timeout: httpClientTimeoutRelease} } req, err := http.NewRequestWithContext(ctx, http.MethodGet, archlinuxPackageJSON, nil) if err != nil { @@ -50,7 +56,7 @@ func LatestRelease(ctx context.Context, client *http.Client) (string, error) { return "", fmt.Errorf("decode package json: %w", err) } if p.Pkgver == "" { - return "", fmt.Errorf("empty pkgver in package json") + return "", errors.New("empty pkgver in package json") } return p.Pkgver, nil } @@ -60,7 +66,7 @@ func LatestRelease(ctx context.Context, client *http.Client) (string, error) { // and rebuilds the FTS index. func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase string) error { if client == nil { - client = &http.Client{Timeout: 15 * time.Minute} + client = &http.Client{Timeout: httpClientTimeoutUpdate} } sourcesBase = strings.TrimSuffix(sourcesBase, "/") + "/" From e73e135e8065243103c3d92d7cc9f69e811aa57e Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 15:36:43 +0200 Subject: [PATCH 03/18] dev: add just recipes for appstream data for convenience and consistency --- justfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/justfile b/justfile index 32fe5afe..ff75886a 100644 --- a/justfile +++ b/justfile @@ -98,12 +98,16 @@ update-pnpm: update: update-go update-pnpm # fetch all external data into the local database -update-data: update-packages update-news update-mirrors update-releases update-package-popularities update-mirror-popularities +update-data: update-packages update-appstream update-news update-mirrors update-releases update-package-popularities update-mirror-popularities # fetch package data from Arch Linux repositories update-packages: go run . update-packages +# fetch AppStream component metadata (keywords for package search) +update-appstream: + go run . 
update-appstream + # fetch news from archlinux.org update-news: go run . update-news From a5e0628811cd8bef487316c3ec8179750d3dc3b9 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 15:45:18 +0200 Subject: [PATCH 04/18] dev: document how the code works --- internal/appstream/parse.go | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index 8f2f58d9..694d9b16 100644 --- a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -1,6 +1,12 @@ // Package appstream parses Arch Linux AppStream component XML (from // https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and // builds per-pkgname search text for SQLite FTS. +// +// Parsing model: encoding/xml streams tokens (start/end/CharData); we keep one +// open component in docParser.cur. stack + muteLeaf track the path so +// text is attributed to the right element; name/summary with xml:lang outside +// en/de are skipped. flush runs at (and before the next ) +// to emit (pkgname, parts); the caller merges duplicate pkgnames. package appstream import ( @@ -15,11 +21,9 @@ const ( elKeyword = "keyword" ) -// ParseComponentsXML reads a Components-*.xml stream and calls fn for each -// , as soon as the element is complete. Multiple components with the -// same produce multiple invocations; the caller merges by name. This -// matches the streaming style of pacmandb.Parse: only one component is held in -// memory at a time. +// ParseComponentsXML streams the decoder and calls fn once per completed +// (same pkgname may appear many times). fn receives raw text +// fragments in parts; dedupeWords runs in the caller after merge. 
func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error { d := xml.NewDecoder(r) d.Strict = false @@ -47,6 +51,9 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err } } +// docParser holds decoder state between tokens. +// stack/muteLeaf are parallel: element names and whether that leaf skips CharData (non-en/de name/summary). +// inKeywords/inKeyword/inCats/inDesc gate text from nested sections. cur is the open or nil. type docParser struct { fn func(string, []string) error dec *xml.Decoder @@ -59,6 +66,7 @@ type docParser struct { cur *component } +// flush emits cur via fn and clears it. EOF calls flush for the last component. func (p *docParser) flush() error { if p.cur == nil { return nil @@ -72,6 +80,7 @@ func (p *docParser) flush() error { return p.fn(name, parts) } +// startElement pushes stack/muteLeaf; on flushes the previous component then starts a new cur. func (p *docParser) startElement(t xml.StartElement) error { local := t.Name.Local p.stack = append(p.stack, local) @@ -109,6 +118,7 @@ func (p *docParser) startElement(t xml.StartElement) error { return nil } +// endElement pops stack/muteLeaf; on flushes the finished component. func (p *docParser) endElement(t xml.EndElement) error { local := t.Name.Local if len(p.stack) == 0 { @@ -137,6 +147,7 @@ func (p *docParser) endElement(t xml.EndElement) error { return nil } +// charData routes text to pkgname or parts by parent element name (stack tip). func (p *docParser) charData(t xml.CharData) { if p.cur == nil { return @@ -175,11 +186,13 @@ func (p *docParser) charData(t xml.CharData) { } } +// component is one AppStream being accumulated until flush. type component struct { pkgname string parts []string } +// dedupeWords joins fragments and drops duplicate tokens (case-insensitive) for FTS. 
func dedupeWords(parts []string) string { seen := make(map[string]struct{}) var b strings.Builder From 1c2c8e499eeda5e60dcef4d508d48ec853f2a4c4 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 15:48:31 +0200 Subject: [PATCH 05/18] feat: account for common words not needed for FTS --- internal/appstream/parse.go | 6 ++++- internal/appstream/stopwords.go | 36 ++++++++++++++++++++++++++++ internal/appstream/stopwords_test.go | 19 +++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 internal/appstream/stopwords.go create mode 100644 internal/appstream/stopwords_test.go diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index 694d9b16..98b6ee98 100644 --- a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -192,13 +192,17 @@ type component struct { parts []string } -// dedupeWords joins fragments and drops duplicate tokens (case-insensitive) for FTS. +// dedupeWords joins fragments, removes English/German stop words, and deduplicates +// tokens (case-insensitive) for FTS. func dedupeWords(parts []string) string { seen := make(map[string]struct{}) var b strings.Builder for _, part := range parts { for _, w := range strings.Fields(part) { key := strings.ToLower(w) + if _, ok := stopword[key]; ok { + continue + } if _, ok := seen[key]; ok { continue } diff --git a/internal/appstream/stopwords.go b/internal/appstream/stopwords.go new file mode 100644 index 00000000..83d3d471 --- /dev/null +++ b/internal/appstream/stopwords.go @@ -0,0 +1,36 @@ +package appstream + +// stopword is a small English + German closed-class word set (articles, +// conjunctions, common prepositions, auxiliaries, pronouns). It trims noise for +// FTS without pulling in NLP dependencies. Extend deliberately: short words like +// "go" or "c" are omitted because they double as names. 
+var stopword map[string]struct{} + +func init() { + words := []string{ + // English + "a", "about", "after", "again", "all", "am", "an", "and", "any", "are", "as", "at", + "be", "been", "before", "being", "between", "both", "but", "by", + "can", "could", + "did", "do", "does", "doing", "done", "during", + "each", "few", "for", "from", "further", + "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", + "i", "if", "in", "into", "is", "it", "its", "itself", + "just", + "me", "more", "most", "my", "myself", + "no", "nor", "not", + "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", + "same", "she", "should", "so", "some", "such", + "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", + "under", "until", "up", + "very", + "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", + "you", "your", "yours", "yourself", "yourselves", + // German + "als", "am", "an", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "das", "dass", "dein", "deine", "dem", "den", "der", "des", "dich", "die", "dir", "doch", "du", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "er", "es", "euch", "euer", "eure", "für", "hab", "habe", "haben", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "hier", "ich", "ihm", "ihn", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "ist", "ja", "jede", "jedem", "jeden", "jeder", "jedes", "kann", "kannst", "können", "könnt", "machen", "man", "mein", "meine", "mich", "mir", "mit", "muss", "musst", "nach", "nicht", "noch", "nun", "nur", "ob", "oder", "ohne", "seid", "sein", "seine", "seinem", "seinen", "seiner", "seines", "sich", "sie", "sind", "so", "soll", "sollen", "sollst", "sollt", "sonst", "sowie", "um", "und", "uns", "unser", "unsere", "unter", "vom", "von", "vor", "war", "waren", 
"warst", "wart", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "wer", "werde", "werden", "werdet", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wohin", "wollen", "wollt", "würde", "würden", "zu", "zum", "zur", "über", + } + stopword = make(map[string]struct{}, len(words)) + for _, w := range words { + stopword[w] = struct{}{} + } +} diff --git a/internal/appstream/stopwords_test.go b/internal/appstream/stopwords_test.go new file mode 100644 index 00000000..6bb51771 --- /dev/null +++ b/internal/appstream/stopwords_test.go @@ -0,0 +1,19 @@ +package appstream + +import "testing" + +func TestDedupeWords_Stopwords(t *testing.T) { + got := dedupeWords([]string{"The cat and the dog in a box"}) + want := "cat dog box" + if got != want { + t.Fatalf("got %q want %q", got, want) + } +} + +func TestDedupeWords_GermanStopwords(t *testing.T) { + got := dedupeWords([]string{"der schnelle braune Fuchs"}) + want := "schnelle braune Fuchs" + if got != want { + t.Fatalf("got %q want %q", got, want) + } +} From a2387108a1da6b4f15eafba0eb81db537e0c3f46 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 16:01:39 +0200 Subject: [PATCH 06/18] refactor: use appstream data keywords only for keywords bc. other data (summary, description etc.) was too messy and thus messed up search rankings --- ARCHITECTURE.md | 4 +- internal/appstream/parse.go | 103 +++++++++++++------------------ internal/appstream/parse_test.go | 79 ++++++++++++++++++++---- internal/packages/repository.go | 20 +++++- 4 files changed, 131 insertions(+), 75 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 6ab435aa..498e6369 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -52,9 +52,9 @@ Six CLI subcommands fetch data from external sources, invoked by external system ## Search -FTS5 indexes: name, base, description, groups, provides (denormalized from package relations). 
Hyphenated queries are split into individual terms for tokenizer compatibility. +FTS5 indexes: name, base, description, groups, provides, keywords (AppStream `` inside `` blocks whose `xml:lang` is absent or en/de). Hyphenated queries are split into individual terms for tokenizer compatibility. -Ranking: exact name match first, then `bm25(10,5,1,1,3) - ln(1+popularity)` — name-weighted with log-scaled popularity boost. +Ranking: exact name match first, then `bm25` with higher weights on name/description than on keywords (so long keyword blobs do not bury short pacman descriptions), minus `ln(1+popularity)`. ## Middleware Stack diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index 98b6ee98..d3ffa674 100644 --- a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -2,11 +2,12 @@ // https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and // builds per-pkgname search text for SQLite FTS. // -// Parsing model: encoding/xml streams tokens (start/end/CharData); we keep one -// open component in docParser.cur. stack + muteLeaf track the path so -// text is attributed to the right element; name/summary with xml:lang outside -// en/de are skipped. flush runs at (and before the next ) -// to emit (pkgname, parts); the caller merges duplicate pkgnames. +// Parsing model: encoding/xml streams tokens; we keep one open in +// docParser.cur. Only and / text are read; other +// elements (name, summary, description, categories) are ignored for indexing. +// AppStream puts xml:lang on blocks (not each ). Only blocks +// with no lang or en/de are indexed; per- xml:lang is also respected when present. +// flush runs at (and before the next ) to emit (pkgname, parts). package appstream import ( @@ -22,8 +23,8 @@ const ( ) // ParseComponentsXML streams the decoder and calls fn once per completed -// (same pkgname may appear many times). fn receives raw text -// fragments in parts; dedupeWords runs in the caller after merge. 
+// (same pkgname may appear many times). parts contains only +// text; dedupeWords runs in the caller after merge. func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error { d := xml.NewDecoder(r) d.Strict = false @@ -52,18 +53,18 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err } // docParser holds decoder state between tokens. -// stack/muteLeaf are parallel: element names and whether that leaf skips CharData (non-en/de name/summary). -// inKeywords/inKeyword/inCats/inDesc gate text from nested sections. cur is the open or nil. +// keywordsBlockSkip drops a whole block when not en/de/neutral. +// keywordSkip does the same for a single when it carries xml:lang. +// cur is the open or nil. type docParser struct { - fn func(string, []string) error - dec *xml.Decoder - stack []string - muteLeaf []bool - inKeywords bool - inKeyword bool - inCats bool - inDesc int - cur *component + fn func(string, []string) error + dec *xml.Decoder + stack []string + inKeywords bool + keywordsBlockSkip bool + inKeyword bool + keywordSkip bool + cur *component } // flush emits cur via fn and clears it. EOF calls flush for the last component. @@ -80,23 +81,10 @@ func (p *docParser) flush() error { return p.fn(name, parts) } -// startElement pushes stack/muteLeaf; on flushes the previous component then starts a new cur. +// startElement pushes stack; on flushes the previous component then starts a new cur. 
func (p *docParser) startElement(t xml.StartElement) error { local := t.Name.Local p.stack = append(p.stack, local) - muted := false - if local == "name" || local == "summary" { - for _, a := range t.Attr { - if a.Name.Local != "lang" || a.Value == "" { - continue - } - if a.Value != "en" && a.Value != "de" { - muted = true - break - } - } - } - p.muteLeaf = append(p.muteLeaf, muted) switch local { case "component": @@ -106,28 +94,23 @@ func (p *docParser) startElement(t xml.StartElement) error { p.cur = &component{} case "keywords": p.inKeywords = true + p.keywordsBlockSkip = !keywordLangAccepted(t.Attr) case elKeyword: if p.inKeywords { p.inKeyword = true + p.keywordSkip = !keywordLangAccepted(t.Attr) } - case "categories": - p.inCats = true - case "description": - p.inDesc++ } return nil } -// endElement pops stack/muteLeaf; on flushes the finished component. +// endElement pops stack; on flushes the finished component. func (p *docParser) endElement(t xml.EndElement) error { local := t.Name.Local if len(p.stack) == 0 { return nil } p.stack = p.stack[:len(p.stack)-1] - if len(p.muteLeaf) > 0 { - p.muteLeaf = p.muteLeaf[:len(p.muteLeaf)-1] - } switch local { case "component": @@ -135,27 +118,19 @@ func (p *docParser) endElement(t xml.EndElement) error { case "keywords": p.inKeywords = false p.inKeyword = false + p.keywordsBlockSkip = false case elKeyword: p.inKeyword = false - case "categories": - p.inCats = false - case "description": - if p.inDesc > 0 { - p.inDesc-- - } + p.keywordSkip = false } return nil } -// charData routes text to pkgname or parts by parent element name (stack tip). +// charData collects pkgname and text only. 
func (p *docParser) charData(t xml.CharData) { if p.cur == nil { return } - muted := len(p.muteLeaf) > 0 && p.muteLeaf[len(p.muteLeaf)-1] - if muted { - return - } text := strings.TrimSpace(string(t)) if text == "" { return @@ -169,21 +144,27 @@ func (p *docParser) charData(t xml.CharData) { switch parent { case "pkgname": p.cur.pkgname += text - case "name", "summary": - p.cur.parts = append(p.cur.parts, text) - case "category": - if p.inCats { - p.cur.parts = append(p.cur.parts, text) - } case elKeyword: - if p.inKeyword { + if p.inKeyword && !p.keywordsBlockSkip && !p.keywordSkip { p.cur.parts = append(p.cur.parts, text) } - case "p": - if p.inDesc > 0 { - p.cur.parts = append(p.cur.parts, text) + } +} + +// keywordLangAccepted is used for both and start tags: true if +// there is no xml:lang, or it is en/de (including BCP47 prefixes like de-DE). +func keywordLangAccepted(attrs []xml.Attr) bool { + for _, a := range attrs { + if a.Name.Local != "lang" || a.Value == "" { + continue + } + v := strings.ToLower(strings.TrimSpace(a.Value)) + if i := strings.IndexByte(v, '-'); i > 0 { + v = v[:i] } + return v == "en" || v == "de" } + return true } // component is one AppStream being accumulated until flush. diff --git a/internal/appstream/parse_test.go b/internal/appstream/parse_test.go index 5dc67f0b..d5097d43 100644 --- a/internal/appstream/parse_test.go +++ b/internal/appstream/parse_test.go @@ -1,27 +1,26 @@ package appstream import ( + "encoding/xml" "strings" "testing" ) -func TestParseComponentsXML(t *testing.T) { +func TestParseComponentsXML_KeywordsOnly(t *testing.T) { const xml = ` firefox Firefox Web browser - Webbrowser - Navigateur

Free software web browser.

- WebBrowserNetwork + WebBrowser internetwww
firefox Firefox ESR - Extended support + mozilla
` @@ -37,13 +36,71 @@ func TestParseComponentsXML(t *testing.T) { if got == "" { t.Fatal("expected merged keywords for firefox") } - if !strings.Contains(got, "Webbrowser") { - t.Errorf("expected German summary term, got %q", got) + if !strings.Contains(got, "internet") || !strings.Contains(got, "www") || !strings.Contains(got, "mozilla") { + t.Errorf("expected AppStream terms only, got %q", got) } - if strings.Contains(strings.ToLower(got), "navigateur") { - t.Errorf("did not expect French summary, got %q", got) + if strings.Contains(strings.ToLower(got), "browser") || strings.Contains(got, "WebBrowser") { + t.Errorf("did not expect description/name/category text in keywords, got %q", got) } - if !strings.Contains(got, "Network") || !strings.Contains(got, "internet") { - t.Errorf("expected category and keyword terms, got %q", got) +} + +func TestParseComponentsXML_KeywordLangFilter(t *testing.T) { + // AppStream often sets xml:lang on , not on each . + const xml = ` + + + demo + + neutral + + + english + + + deutsch + + + deutsch2 + + + francais + + +` + acc := make(map[string][]string) + err := ParseComponentsXML(strings.NewReader(xml), func(name string, parts []string) error { + acc[name] = append(acc[name], parts...) 
+ return nil + }) + if err != nil { + t.Fatal(err) + } + got := dedupeWords(acc["demo"]) + for _, need := range []string{"neutral", "english", "deutsch", "deutsch2"} { + if !strings.Contains(got, need) { + t.Errorf("missing %q in %q", need, got) + } + } + if strings.Contains(got, "francais") { + t.Errorf("did not want fr keyword, got %q", got) + } +} + +func TestKeywordLangAccepted(t *testing.T) { + tests := []struct { + attrs []xml.Attr + want bool + }{ + {nil, true}, + {[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "en"}}, true}, + {[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "de"}}, true}, + {[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "de-AT"}}, true}, + {[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "fr"}}, false}, + {[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "pl"}}, false}, + } + for _, tt := range tests { + if got := keywordLangAccepted(tt.attrs); got != tt.want { + t.Errorf("keywordLangAccepted(%v) = %v, want %v", tt.attrs, got, tt.want) + } } } diff --git a/internal/packages/repository.go b/internal/packages/repository.go index 1204e15e..b94e0729 100644 --- a/internal/packages/repository.go +++ b/internal/packages/repository.go @@ -4,11 +4,25 @@ import ( "context" "database/sql" "errors" + "fmt" "strings" fts "archded/internal/search" ) +// bm25Weights are per FTS5 column: name, base, description, groups, provides, keywords. +// Long AppStream keyword fields increase BM25 document length; description (short pacman +// text) must stay heavily weighted so queries like "browser" still rank packages that +// only match strongly there. 
+const ( + bm25Name = 12 + bm25Base = 5 + bm25Description = 10 + bm25Groups = 1 + bm25Provides = 3 + bm25Keywords = 0.5 +) + type PackageSummary struct { Repository string Architecture string @@ -93,8 +107,12 @@ func (r *Repository) Search(ctx context.Context, search, repo, arch string, limi } countQuery = `SELECT COUNT(*) ` + baseWhere + bm25 := fmt.Sprintf( + "bm25(package_fts, %d, %d, %d, %d, %d, %g)", + bm25Name, bm25Base, bm25Description, bm25Groups, bm25Provides, bm25Keywords, + ) dataQuery = `SELECT r.name, r.architecture, p.name, p.version, p.description, p.build_date, p.popularity_recent, r.testing - ` + baseWhere + ` ORDER BY (p.name = ?) DESC, bm25(package_fts, 10, 5, 1, 1, 3, 2) - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? OFFSET ?` + ` + baseWhere + ` ORDER BY (p.name = ?) DESC, ` + bm25 + ` - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? OFFSET ?` dataArgs = append(dataArgs, search, limit, offset) } else { baseWhere := `FROM package p From 812b7e9ba33d836f3cca8804b1a3516225d8c402 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 16:08:14 +0200 Subject: [PATCH 07/18] feat: add support for appstream data categories --- ARCHITECTURE.md | 2 +- internal/appstream/parse.go | 84 ++++++++++++------- internal/appstream/parse_test.go | 59 ++++++++++--- internal/appstream/update.go | 41 +++++---- .../000005_package_categories.down.sql | 10 +++ .../000005_package_categories.up.sql | 10 +++ internal/packagedetail/handler_test.go | 4 +- internal/packages/repository.go | 12 +-- internal/packages/repository_test.go | 4 +- 9 files changed, 157 insertions(+), 69 deletions(-) create mode 100644 internal/database/migrations/000005_package_categories.down.sql create mode 100644 internal/database/migrations/000005_package_categories.up.sql diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 498e6369..411a36d8 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -52,7 +52,7 @@ Six CLI subcommands fetch data from external sources, invoked by 
external system ## Search -FTS5 indexes: name, base, description, groups, provides, keywords (AppStream `` inside `` blocks whose `xml:lang` is absent or en/de). Hyphenated queries are split into individual terms for tokenizer compatibility. +FTS5 indexes: name, base, description, groups, provides, keywords, categories (AppStream `` / `` inside blocks whose `xml:lang` is absent or en/de). Hyphenated queries are split into individual terms for tokenizer compatibility. Ranking: exact name match first, then `bm25` with higher weights on name/description than on keywords (so long keyword blobs do not bury short pacman descriptions), minus `ln(1+popularity)`. diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index d3ffa674..273185ff 100644 --- a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -3,11 +3,9 @@ // builds per-pkgname search text for SQLite FTS. // // Parsing model: encoding/xml streams tokens; we keep one open in -// docParser.cur. Only and / text are read; other -// elements (name, summary, description, categories) are ignored for indexing. -// AppStream puts xml:lang on blocks (not each ). Only blocks -// with no lang or en/de are indexed; per- xml:lang is also respected when present. -// flush runs at (and before the next ) to emit (pkgname, parts). +// docParser.cur. We read , /, and /. +// AppStream puts xml:lang on parent blocks; only neutral or en/de blocks are indexed. +// flush runs at (and before the next ) to emit pkgname + terms. package appstream import ( @@ -19,13 +17,19 @@ import ( // XML element names referenced more than once in the decoder. const ( - elKeyword = "keyword" + elKeyword = "keyword" + elCategory = "category" ) +// IndexTerms holds text extracted from one for FTS (merged by pkgname in update.go). +type IndexTerms struct { + Keywords []string + Categories []string +} + // ParseComponentsXML streams the decoder and calls fn once per completed -// (same pkgname may appear many times). 
parts contains only -// text; dedupeWords runs in the caller after merge. -func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error { +// (same pkgname may appear many times). dedupeWords runs in the caller after merge. +func ParseComponentsXML(r io.Reader, fn func(pkgname string, terms IndexTerms) error) error { d := xml.NewDecoder(r) d.Strict = false p := &docParser{fn: fn, dec: d} @@ -53,18 +57,19 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err } // docParser holds decoder state between tokens. -// keywordsBlockSkip drops a whole block when not en/de/neutral. -// keywordSkip does the same for a single when it carries xml:lang. -// cur is the open or nil. type docParser struct { - fn func(string, []string) error - dec *xml.Decoder - stack []string - inKeywords bool - keywordsBlockSkip bool - inKeyword bool - keywordSkip bool - cur *component + fn func(string, IndexTerms) error + dec *xml.Decoder + stack []string + inKeywords bool + keywordsBlockSkip bool + inKeyword bool + keywordSkip bool + inCategories bool + categoriesBlockSkip bool + inCategory bool + categorySkip bool + cur *component } // flush emits cur via fn and clears it. EOF calls flush for the last component. @@ -73,12 +78,15 @@ func (p *docParser) flush() error { return nil } name := strings.TrimSpace(p.cur.pkgname) - parts := append([]string(nil), p.cur.parts...) + terms := IndexTerms{ + Keywords: append([]string(nil), p.cur.keywords...), + Categories: append([]string(nil), p.cur.categories...), + } p.cur = nil if name == "" { return nil } - return p.fn(name, parts) + return p.fn(name, terms) } // startElement pushes stack; on flushes the previous component then starts a new cur. 
@@ -100,6 +108,14 @@ func (p *docParser) startElement(t xml.StartElement) error { p.inKeyword = true p.keywordSkip = !keywordLangAccepted(t.Attr) } + case "categories": + p.inCategories = true + p.categoriesBlockSkip = !keywordLangAccepted(t.Attr) + case elCategory: + if p.inCategories { + p.inCategory = true + p.categorySkip = !keywordLangAccepted(t.Attr) + } } return nil } @@ -122,11 +138,18 @@ func (p *docParser) endElement(t xml.EndElement) error { case elKeyword: p.inKeyword = false p.keywordSkip = false + case "categories": + p.inCategories = false + p.inCategory = false + p.categoriesBlockSkip = false + case elCategory: + p.inCategory = false + p.categorySkip = false } return nil } -// charData collects pkgname and text only. +// charData collects pkgname, , and text. func (p *docParser) charData(t xml.CharData) { if p.cur == nil { return @@ -146,13 +169,17 @@ func (p *docParser) charData(t xml.CharData) { p.cur.pkgname += text case elKeyword: if p.inKeyword && !p.keywordsBlockSkip && !p.keywordSkip { - p.cur.parts = append(p.cur.parts, text) + p.cur.keywords = append(p.cur.keywords, text) + } + case elCategory: + if p.inCategory && !p.categoriesBlockSkip && !p.categorySkip { + p.cur.categories = append(p.cur.categories, text) } } } -// keywordLangAccepted is used for both and start tags: true if -// there is no xml:lang, or it is en/de (including BCP47 prefixes like de-DE). +// keywordLangAccepted is used for , , , and +// start tags: true if there is no xml:lang, or it is en/de (including BCP47 prefixes like de-DE). func keywordLangAccepted(attrs []xml.Attr) bool { for _, a := range attrs { if a.Name.Local != "lang" || a.Value == "" { @@ -169,8 +196,9 @@ func keywordLangAccepted(attrs []xml.Attr) bool { // component is one AppStream being accumulated until flush. 
type component struct { - pkgname string - parts []string + pkgname string + keywords []string + categories []string } // dedupeWords joins fragments, removes English/German stop words, and deduplicates diff --git a/internal/appstream/parse_test.go b/internal/appstream/parse_test.go index d5097d43..a2e6ac4e 100644 --- a/internal/appstream/parse_test.go +++ b/internal/appstream/parse_test.go @@ -6,7 +6,7 @@ import ( "testing" ) -func TestParseComponentsXML_KeywordsOnly(t *testing.T) { +func TestParseComponentsXML_KeywordsAndCategories(t *testing.T) { const xml = ` @@ -14,7 +14,7 @@ func TestParseComponentsXML_KeywordsOnly(t *testing.T) { Firefox Web browser

Free software web browser.

- WebBrowser + NetworkWebBrowser internetwww
@@ -24,23 +24,29 @@ func TestParseComponentsXML_KeywordsOnly(t *testing.T) {
` - acc := make(map[string][]string) - err := ParseComponentsXML(strings.NewReader(xml), func(name string, parts []string) error { - acc[name] = append(acc[name], parts...) + accKW := make(map[string][]string) + accCat := make(map[string][]string) + err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error { + accKW[name] = append(accKW[name], terms.Keywords...) + accCat[name] = append(accCat[name], terms.Categories...) return nil }) if err != nil { t.Fatal(err) } - got := dedupeWords(acc["firefox"]) - if got == "" { + gotKW := dedupeWords(accKW["firefox"]) + gotCat := dedupeWords(accCat["firefox"]) + if gotKW == "" { t.Fatal("expected merged keywords for firefox") } - if !strings.Contains(got, "internet") || !strings.Contains(got, "www") || !strings.Contains(got, "mozilla") { - t.Errorf("expected AppStream terms only, got %q", got) + if !strings.Contains(gotKW, "internet") || !strings.Contains(gotKW, "www") || !strings.Contains(gotKW, "mozilla") { + t.Errorf("expected keyword terms, got %q", gotKW) + } + if strings.Contains(strings.ToLower(gotKW), "browser") { + t.Errorf("did not expect description text in keywords, got %q", gotKW) } - if strings.Contains(strings.ToLower(got), "browser") || strings.Contains(got, "WebBrowser") { - t.Errorf("did not expect description/name/category text in keywords, got %q", got) + if !strings.Contains(gotCat, "Network") || !strings.Contains(gotCat, "WebBrowser") { + t.Errorf("expected category terms, got %q", gotCat) } } @@ -68,8 +74,8 @@ func TestParseComponentsXML_KeywordLangFilter(t *testing.T) {
` acc := make(map[string][]string) - err := ParseComponentsXML(strings.NewReader(xml), func(name string, parts []string) error { - acc[name] = append(acc[name], parts...) + err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error { + acc[name] = append(acc[name], terms.Keywords...) return nil }) if err != nil { @@ -86,6 +92,33 @@ func TestParseComponentsXML_KeywordLangFilter(t *testing.T) { } } +func TestParseComponentsXML_CategoriesLangFilter(t *testing.T) { + const xml = ` + + + demo + NeutralCat + DeutschCat + FrCat + +` + acc := make(map[string][]string) + err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error { + acc[name] = append(acc[name], terms.Categories...) + return nil + }) + if err != nil { + t.Fatal(err) + } + got := dedupeWords(acc["demo"]) + if !strings.Contains(got, "NeutralCat") || !strings.Contains(got, "DeutschCat") { + t.Errorf("want neutral+de categories, got %q", got) + } + if strings.Contains(got, "FrCat") { + t.Errorf("did not want fr category, got %q", got) + } +} + func TestKeywordLangAccepted(t *testing.T) { tests := []struct { attrs []xml.Attr diff --git a/internal/appstream/update.go b/internal/appstream/update.go index 36643ceb..be34d5ce 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -62,7 +62,7 @@ func LatestRelease(ctx context.Context, client *http.Client) (string, error) { } // Update downloads AppStream component XML for core, extra, and multilib from -// sourcesBase, merges keywords by package name, writes the keywords column, +// sourcesBase, merges keywords and categories by package name, writes both columns, // and rebuilds the FTS index. 
func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase string) error { if client == nil { @@ -76,12 +76,14 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st } slog.Info("appstream snapshot", "version", version) - acc := make(map[string][]string) + accKW := make(map[string][]string) + accCat := make(map[string][]string) for _, repo := range componentRepos { var components int - err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, parts []string) error { + err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, terms IndexTerms) error { components++ - acc[name] = append(acc[name], parts...) + accKW[name] = append(accKW[name], terms.Keywords...) + accCat[name] = append(accCat[name], terms.Categories...) return nil }) if err != nil { @@ -90,9 +92,12 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st slog.Info("appstream components parsed", "repo", repo, "components", components) } - merged := make(map[string]string, len(acc)) - for name, parts := range acc { - merged[name] = dedupeWords(parts) + names := make(map[string]struct{}) + for k := range accKW { + names[k] = struct{}{} + } + for k := range accCat { + names[k] = struct{}{} } tx, err := db.BeginTx(ctx, nil) @@ -101,24 +106,26 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st } defer func() { _ = tx.Rollback() }() - if _, err := tx.ExecContext(ctx, `UPDATE package SET keywords = ''`); err != nil { - return fmt.Errorf("clear keywords: %w", err) + if _, err := tx.ExecContext(ctx, `UPDATE package SET keywords = '', categories = ''`); err != nil { + return fmt.Errorf("clear appstream columns: %w", err) } - stmt, err := tx.PrepareContext(ctx, `UPDATE package SET keywords = ? WHERE name = ?`) + stmt, err := tx.PrepareContext(ctx, `UPDATE package SET keywords = ?, categories = ? 
WHERE name = ?`) if err != nil { - return fmt.Errorf("prepare keyword update: %w", err) + return fmt.Errorf("prepare appstream update: %w", err) } defer func() { _ = stmt.Close() }() var updated int64 - for name, kw := range merged { - if kw == "" { + for name := range names { + kw := dedupeWords(accKW[name]) + cat := dedupeWords(accCat[name]) + if kw == "" && cat == "" { continue } - res, err := stmt.ExecContext(ctx, kw, name) + res, err := stmt.ExecContext(ctx, kw, cat, name) if err != nil { - return fmt.Errorf("update keywords for %q: %w", name, err) + return fmt.Errorf("update appstream fields for %q: %w", name, err) } n, err := res.RowsAffected() if err != nil { @@ -131,7 +138,7 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st return err } - slog.Info("appstream keywords applied", "distinct_names", len(merged), "package_rows", updated) + slog.Info("appstream fields applied", "distinct_names", len(names), "package_rows", updated) if _, err := db.ExecContext(ctx, `INSERT INTO package_fts(package_fts) VALUES('rebuild')`); err != nil { return fmt.Errorf("rebuild fts: %w", err) @@ -140,7 +147,7 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st return nil } -func fetchRepoComponents(ctx context.Context, client *http.Client, base, version, repo string, fn func(string, []string) error) error { +func fetchRepoComponents(ctx context.Context, client *http.Client, base, version, repo string, fn func(string, IndexTerms) error) error { u := fmt.Sprintf("%s%s/%s/Components-x86_64.xml.gz", base, version, repo) req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) if err != nil { diff --git a/internal/database/migrations/000005_package_categories.down.sql b/internal/database/migrations/000005_package_categories.down.sql new file mode 100644 index 00000000..fe6a1dc1 --- /dev/null +++ b/internal/database/migrations/000005_package_categories.down.sql @@ -0,0 +1,10 @@ +DROP TABLE package_fts; + 
+CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, keywords, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); + +ALTER TABLE package DROP COLUMN categories; diff --git a/internal/database/migrations/000005_package_categories.up.sql b/internal/database/migrations/000005_package_categories.up.sql new file mode 100644 index 00000000..344d1bc9 --- /dev/null +++ b/internal/database/migrations/000005_package_categories.up.sql @@ -0,0 +1,10 @@ +ALTER TABLE package ADD COLUMN categories TEXT NOT NULL DEFAULT ''; + +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, keywords, categories, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); diff --git a/internal/packagedetail/handler_test.go b/internal/packagedetail/handler_test.go index 9ebd9369..d0effe26 100644 --- a/internal/packagedetail/handler_test.go +++ b/internal/packagedetail/handler_test.go @@ -44,8 +44,8 @@ func setupHandlerDB(t *testing.T) *sql.DB { usr/share/bash/bash_completion')`, // Populate FTS - `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords) - SELECT id, name, base, description, groups, provides, keywords FROM package`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords, categories) + SELECT id, name, base, description, groups, provides, keywords, categories FROM package`, } { if _, err := db.Exec(stmt); err != nil { t.Fatalf("setup: %v", err) diff --git a/internal/packages/repository.go b/internal/packages/repository.go index b94e0729..c0c733be 100644 --- a/internal/packages/repository.go +++ b/internal/packages/repository.go @@ -10,10 +10,9 @@ import ( fts "archded/internal/search" ) -// bm25Weights are per FTS5 column: name, base, description, groups, provides, keywords. 
-// Long AppStream keyword fields increase BM25 document length; description (short pacman -// text) must stay heavily weighted so queries like "browser" still rank packages that -// only match strongly there. +// bm25Weights are per FTS5 column: name, base, description, groups, provides, keywords, categories. +// Long AppStream fields increase BM25 document length; description (short pacman text) must +// stay heavily weighted so queries like "browser" still rank packages that only match there. const ( bm25Name = 12 bm25Base = 5 @@ -21,6 +20,7 @@ const ( bm25Groups = 1 bm25Provides = 3 bm25Keywords = 0.5 + bm25Categories = 0.5 ) type PackageSummary struct { @@ -108,8 +108,8 @@ func (r *Repository) Search(ctx context.Context, search, repo, arch string, limi countQuery = `SELECT COUNT(*) ` + baseWhere bm25 := fmt.Sprintf( - "bm25(package_fts, %d, %d, %d, %d, %d, %g)", - bm25Name, bm25Base, bm25Description, bm25Groups, bm25Provides, bm25Keywords, + "bm25(package_fts, %d, %d, %d, %d, %d, %g, %g)", + bm25Name, bm25Base, bm25Description, bm25Groups, bm25Provides, bm25Keywords, bm25Categories, ) dataQuery = `SELECT r.name, r.architecture, p.name, p.version, p.description, p.build_date, p.popularity_recent, r.testing ` + baseWhere + ` ORDER BY (p.name = ?) DESC, ` + bm25 + ` - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? 
OFFSET ?` diff --git a/internal/packages/repository_test.go b/internal/packages/repository_test.go index f37209ff..35cbb262 100644 --- a/internal/packages/repository_test.go +++ b/internal/packages/repository_test.go @@ -31,8 +31,8 @@ func setupTestDB(t *testing.T) *sql.DB { (4, 3, 'linux', 'linux', '6.7-rc1', 'The Linux kernel (testing)', 1700400000, 'Jan', 0.0)`, // Populate FTS - `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords) - SELECT id, name, base, description, groups, provides, keywords FROM package`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords, categories) + SELECT id, name, base, description, groups, provides, keywords, categories FROM package`, } { if _, err := db.Exec(stmt); err != nil { t.Fatalf("setup: %s...: %v", stmt[:40], err) From 71fec7d024c767ffa2f85996bd943ba4d76c9fef Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 16:23:57 +0200 Subject: [PATCH 08/18] refactor: remove odd duplication of appstream data base url --- internal/appstream/update.go | 11 +++++------ internal/config/config.go | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/appstream/update.go b/internal/appstream/update.go index be34d5ce..97f902c9 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -14,10 +14,9 @@ import ( "time" ) -// DefaultSourcesBase is the directory listing published by Arch that contains -// versioned snapshots (e.g. …/20260326/{core,extra,multilib}/Components-x86_64.xml.gz). -const DefaultSourcesBase = "https://sources.archlinux.org/other/packages/archlinux-appstream-data/" - +// archlinuxPackageJSON is the official package metadata used to resolve the +// appstream-data snapshot directory name (pkgver). The XML base URL is passed +// into Update as sourcesBase (from config.APPSTREAM_SOURCES_BASE / CLI). 
const archlinuxPackageJSON = "https://archlinux.org/packages/extra/any/archlinux-appstream-data/json/" const ( @@ -62,8 +61,8 @@ func LatestRelease(ctx context.Context, client *http.Client) (string, error) { } // Update downloads AppStream component XML for core, extra, and multilib from -// sourcesBase, merges keywords and categories by package name, writes both columns, -// and rebuilds the FTS index. +// sourcesBase (e.g. config.AppStreamSourcesBase), merges keywords and categories +// by package name, writes both columns, and rebuilds the FTS index. func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase string) error { if client == nil { client = &http.Client{Timeout: httpClientTimeoutUpdate} diff --git a/internal/config/config.go b/internal/config/config.go index 836ec993..b14637ca 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -15,10 +15,11 @@ type Config struct { func Load() (Config, error) { cfg := Config{ - Database: getEnv("DATABASE", ""), - Port: getEnv("PORT", "8080"), - PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), - DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + Database: getEnv("DATABASE", ""), + Port: getEnv("PORT", "8080"), + PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), + DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + // Arch appstream-data snapshot root (override for tests or mirrors). 
AppStreamSourcesBase: getEnv("APPSTREAM_SOURCES_BASE", "https://sources.archlinux.org/other/packages/archlinux-appstream-data/"), } From 87f2ac86fa199bb5fb471580a5c87176c8405b32 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 16:24:27 +0200 Subject: [PATCH 09/18] chore: simplify & beautify --- internal/appstream/stopwords.go | 11 ++++++++++- internal/appstream/update.go | 2 +- internal/config/config.go | 1 - 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/internal/appstream/stopwords.go b/internal/appstream/stopwords.go index 83d3d471..57e31bea 100644 --- a/internal/appstream/stopwords.go +++ b/internal/appstream/stopwords.go @@ -27,7 +27,16 @@ func init() { "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "you", "your", "yours", "yourself", "yourselves", // German - "als", "am", "an", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "das", "dass", "dein", "deine", "dem", "den", "der", "des", "dich", "die", "dir", "doch", "du", "durch", "ein", "eine", "einem", "einen", "einer", "eines", "er", "es", "euch", "euer", "eure", "für", "hab", "habe", "haben", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "hier", "ich", "ihm", "ihn", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "ist", "ja", "jede", "jedem", "jeden", "jeder", "jedes", "kann", "kannst", "können", "könnt", "machen", "man", "mein", "meine", "mich", "mir", "mit", "muss", "musst", "nach", "nicht", "noch", "nun", "nur", "ob", "oder", "ohne", "seid", "sein", "seine", "seinem", "seinen", "seiner", "seines", "sich", "sie", "sind", "so", "soll", "sollen", "sollst", "sollt", "sonst", "sowie", "um", "und", "uns", "unser", "unsere", "unter", "vom", "von", "vor", "war", "waren", "warst", "wart", "was", "weg", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "wer", "werde", "werden", "werdet", "wie", "wieder", "will", "wir", "wird", "wirst", "wo", "wohin", 
"wollen", "wollt", "würde", "würden", "zu", "zum", "zur", "über", + "als", "am", "an", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "das", "dass", "dein", "deine", + "dem", "den", "der", "des", "dich", "die", "dir", "doch", "du", "durch", "ein", "eine", "einem", "einen", "einer", + "eines", "er", "es", "euch", "euer", "eure", "für", "hab", "habe", "haben", "hast", "hat", "hatte", "hatten", "hattest", + "hattet", "hier", "ich", "ihm", "ihn", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "ist", "ja", "jede", + "jedem", "jeden", "jeder", "jedes", "kann", "kannst", "können", "könnt", "machen", "man", "mein", "meine", "mich", "mir", + "mit", "muss", "musst", "nach", "nicht", "noch", "nun", "nur", "ob", "oder", "ohne", "seid", "sein", "seine", "seinem", + "seinen", "seiner", "seines", "sich", "sie", "sind", "so", "soll", "sollen", "sollst", "sollt", "sonst", "sowie", "um", + "und", "uns", "unser", "unsere", "unter", "vom", "von", "vor", "war", "waren", "warst", "wart", "was", "weg", "weil", + "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "wer", "werde", "werden", "werdet", "wie", + "wieder", "will", "wir", "wird", "wirst", "wo", "wohin", "wollen", "wollt", "würde", "würden", "zu", "zum", "zur", "über", } stopword = make(map[string]struct{}, len(words)) for _, w := range words { diff --git a/internal/appstream/update.go b/internal/appstream/update.go index 97f902c9..2c90485a 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -61,7 +61,7 @@ func LatestRelease(ctx context.Context, client *http.Client) (string, error) { } // Update downloads AppStream component XML for core, extra, and multilib from -// sourcesBase (e.g. config.AppStreamSourcesBase), merges keywords and categories +// sourcesBase (see config.go), merges keywords and categories // by package name, writes both columns, and rebuilds the FTS index. 
func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase string) error { if client == nil { diff --git a/internal/config/config.go b/internal/config/config.go index b14637ca..4f2e0e00 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -19,7 +19,6 @@ func Load() (Config, error) { Port: getEnv("PORT", "8080"), PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), - // Arch appstream-data snapshot root (override for tests or mirrors). AppStreamSourcesBase: getEnv("APPSTREAM_SOURCES_BASE", "https://sources.archlinux.org/other/packages/archlinux-appstream-data/"), } From 86582dbdb3dce33c9a1b0e8d43bc458aaa8a5824 Mon Sep 17 00:00:00 2001 From: Miriam Date: Sun, 12 Apr 2026 16:29:43 +0200 Subject: [PATCH 10/18] dev: improve ci runs; run once, either on a push to main or, when a pull request into main exists, on pushes added to the branch with the open PR piggyback: improve ci setup; account for duplicate runs tryout: fix duplicate ci runs the other way --- .github/workflows/ci.yml | 4 ++++ internal/config/config.go | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f5b6e08..f8a8158d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,11 @@ name: CI on: push: + branches: + - main pull_request: + branches: + - main workflow_dispatch: jobs: diff --git a/internal/config/config.go b/internal/config/config.go index 4f2e0e00..836ec993 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -15,10 +15,10 @@ type Config struct { func Load() (Config, error) { cfg := Config{ - Database: getEnv("DATABASE", ""), - Port: getEnv("PORT", "8080"), - PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), - DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + Database: 
getEnv("DATABASE", ""), + Port: getEnv("PORT", "8080"), + PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), + DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), AppStreamSourcesBase: getEnv("APPSTREAM_SOURCES_BASE", "https://sources.archlinux.org/other/packages/archlinux-appstream-data/"), } From e93aa06aeec034425539f8adeb8789dfd0906e00 Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:05:00 +0200 Subject: [PATCH 11/18] appstream: drop redundant slice copy and non-strict XML mode flush already hands ownership off to the caller (p.cur is nilled), so append-clone of keywords/categories was pure overhead per . Strict=false hid real malformed-input errors; the upstream feed is well-formed XML. --- internal/appstream/parse.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index 273185ff..d65411cb 100644 --- a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -30,9 +30,7 @@ type IndexTerms struct { // ParseComponentsXML streams the decoder and calls fn once per completed // (same pkgname may appear many times). dedupeWords runs in the caller after merge. 
func ParseComponentsXML(r io.Reader, fn func(pkgname string, terms IndexTerms) error) error { - d := xml.NewDecoder(r) - d.Strict = false - p := &docParser{fn: fn, dec: d} + p := &docParser{fn: fn, dec: xml.NewDecoder(r)} for { tok, err := p.dec.Token() if errors.Is(err, io.EOF) { @@ -78,10 +76,7 @@ func (p *docParser) flush() error { return nil } name := strings.TrimSpace(p.cur.pkgname) - terms := IndexTerms{ - Keywords: append([]string(nil), p.cur.keywords...), - Categories: append([]string(nil), p.cur.categories...), - } + terms := IndexTerms{Keywords: p.cur.keywords, Categories: p.cur.categories} p.cur = nil if name == "" { return nil From 10d30edcf1c376bef10e412337dc24eb49379183 Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:05:25 +0200 Subject: [PATCH 12/18] appstream: merge per-name accumulators into one map Two parallel maps keyed by pkgname plus a third union map to iterate was wasted memory and an extra pass. One map of {kw, cat} slices covers it. --- internal/appstream/update.go | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/internal/appstream/update.go b/internal/appstream/update.go index 2c90485a..765b78b5 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -75,14 +75,19 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st } slog.Info("appstream snapshot", "version", version) - accKW := make(map[string][]string) - accCat := make(map[string][]string) + type terms struct{ kw, cat []string } + acc := make(map[string]*terms) for _, repo := range componentRepos { var components int - err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, terms IndexTerms) error { + err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, t IndexTerms) error { components++ - accKW[name] = append(accKW[name], terms.Keywords...) 
- accCat[name] = append(accCat[name], terms.Categories...) + e, ok := acc[name] + if !ok { + e = &terms{} + acc[name] = e + } + e.kw = append(e.kw, t.Keywords...) + e.cat = append(e.cat, t.Categories...) return nil }) if err != nil { @@ -91,14 +96,6 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st slog.Info("appstream components parsed", "repo", repo, "components", components) } - names := make(map[string]struct{}) - for k := range accKW { - names[k] = struct{}{} - } - for k := range accCat { - names[k] = struct{}{} - } - tx, err := db.BeginTx(ctx, nil) if err != nil { return err @@ -116,9 +113,9 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st defer func() { _ = stmt.Close() }() var updated int64 - for name := range names { - kw := dedupeWords(accKW[name]) - cat := dedupeWords(accCat[name]) + for name, e := range acc { + kw := dedupeWords(e.kw) + cat := dedupeWords(e.cat) if kw == "" && cat == "" { continue } @@ -137,7 +134,7 @@ func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase st return err } - slog.Info("appstream fields applied", "distinct_names", len(names), "package_rows", updated) + slog.Info("appstream fields applied", "distinct_names", len(acc), "package_rows", updated) if _, err := db.ExecContext(ctx, `INSERT INTO package_fts(package_fts) VALUES('rebuild')`); err != nil { return fmt.Errorf("rebuild fts: %w", err) From d11bb338a2ae120389555ac5336d89457e7dccfe Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:17:56 +0200 Subject: [PATCH 13/18] appstream: drop dead client-timeout plumbing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The http.Client parameter on Update was always passed nil from main; the fallback client timeouts (15m / 2m) were pure dead code since the ctx deadline (10m in runCommand) always wins. 
Match the convention of the other update commands: no client param, no hand-rolled timeouts. Also unexport latestRelease — it's not called from outside the package. --- internal/appstream/update.go | 21 +++++---------------- main.go | 2 +- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/internal/appstream/update.go b/internal/appstream/update.go index 765b78b5..0b4ead17 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -11,7 +11,6 @@ import ( "log/slog" "net/http" "strings" - "time" ) // archlinuxPackageJSON is the official package metadata used to resolve the @@ -19,23 +18,15 @@ import ( // into Update as sourcesBase (from config.APPSTREAM_SOURCES_BASE / CLI). const archlinuxPackageJSON = "https://archlinux.org/packages/extra/any/archlinux-appstream-data/json/" -const ( - httpClientTimeoutRelease = 2 * time.Minute - httpClientTimeoutUpdate = 15 * time.Minute -) - var componentRepos = []string{"core", "extra", "multilib"} type pkgJSON struct { Pkgver string `json:"pkgver"` } -// LatestRelease returns the snapshot directory name (e.g. "20260326") matching +// latestRelease returns the snapshot directory name (e.g. "20260326") matching // the current extra/any archlinux-appstream-data package in the official repos. -func LatestRelease(ctx context.Context, client *http.Client) (string, error) { - if client == nil { - client = &http.Client{Timeout: httpClientTimeoutRelease} - } +func latestRelease(ctx context.Context, client *http.Client) (string, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, archlinuxPackageJSON, nil) if err != nil { return "", err @@ -63,13 +54,11 @@ func LatestRelease(ctx context.Context, client *http.Client) (string, error) { // Update downloads AppStream component XML for core, extra, and multilib from // sourcesBase (see config.go), merges keywords and categories // by package name, writes both columns, and rebuilds the FTS index. 
-func Update(ctx context.Context, db *sql.DB, client *http.Client, sourcesBase string) error { - if client == nil { - client = &http.Client{Timeout: httpClientTimeoutUpdate} - } +func Update(ctx context.Context, db *sql.DB, sourcesBase string) error { + client := &http.Client{} sourcesBase = strings.TrimSuffix(sourcesBase, "/") + "/" - version, err := LatestRelease(ctx, client) + version, err := latestRelease(ctx, client) if err != nil { return err } diff --git a/main.go b/main.go index c577024e..7cdfb543 100644 --- a/main.go +++ b/main.go @@ -87,7 +87,7 @@ func runCommand(cmd string, cfg config.Config) int { return 1 } case "update-appstream": - if err := appstream.Update(ctx, db, nil, cfg.AppStreamSourcesBase); err != nil { + if err := appstream.Update(ctx, db, cfg.AppStreamSourcesBase); err != nil { slog.Error("update-appstream failed", "error", err) return 1 } From fc3449d343b863a0233d503dcf0d38b86e0fc3dc Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:18:11 +0200 Subject: [PATCH 14/18] architecture: document update-appstream subcommand --- ARCHITECTURE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 411a36d8..395a281d 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -39,11 +39,12 @@ Key indexes: FTS5 virtual table for package search (name, base, description, gro ## Data Updates -Six CLI subcommands fetch data from external sources, invoked by external systemd timers: +Seven CLI subcommands fetch data from external sources, invoked by external systemd timers: | Command | Source | Notes | |---------|--------|-------| | `update-packages` | Arch mirror `.files` DBs | 6 repos concurrent, ETag change detection, FTS rebuild after | +| `update-appstream` | sources.archlinux.org `archlinux-appstream-data` | core/extra/multilib Components-x86_64.xml.gz, FTS rebuild after | | `update-news` | forum.archlinux.de Flarum API | Paginated, HTML sanitized | | `update-mirrors` | 
archlinux.org/mirrors/status/json/ | Filtered by active/HTTPS/completion | | `update-releases` | archlinux.org/releng/releases/json/ | ISO URLs, checksums, torrent info | From 5c4047f2b37b1c894ce2eb95e35b5026f26c1242 Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:20:02 +0200 Subject: [PATCH 15/18] appstream: drop no-op io.Reader conversion *gzip.Reader already satisfies io.Reader. --- internal/appstream/update.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/appstream/update.go b/internal/appstream/update.go index 0b4ead17..b338e133 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -7,7 +7,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "log/slog" "net/http" "strings" @@ -155,5 +154,5 @@ func fetchRepoComponents(ctx context.Context, client *http.Client, base, version } defer func() { _ = gz.Close() }() - return ParseComponentsXML(io.Reader(gz), fn) + return ParseComponentsXML(gz, fn) } From 3c5646d3b69e34178143451b36b5619047bf84d3 Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:21:29 +0200 Subject: [PATCH 16/18] appstream: simplify parser with Decoder.Skip for rejected lang blocks Replace the manual stack + 6 skip flags with the decoder's own Skip(): when a <keywords>/<keyword>/<categories>/<category> tag has a non-en/de xml:lang, skip its entire subtree in one call. The remaining state is five booleans tracking the enter/leave of accepted elements. No behavior change; all existing tests pass. --- internal/appstream/parse.go | 140 ++++++++++++++++-------------- 1 file changed, 62 insertions(+), 78 deletions(-) diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go index d65411cb..63be7344 100644 --- a/internal/appstream/parse.go +++ b/internal/appstream/parse.go @@ -4,8 +4,9 @@ // // Parsing model: encoding/xml streams tokens; we keep one open in // docParser.cur. We read , /, and /. 
-// flush runs at (and before the next ) to emit pkgname + terms. +// AppStream puts xml:lang on parent blocks; only neutral or en/de blocks are indexed — +// rejected blocks are skipped wholesale via xml.Decoder.Skip. flush runs at +// (and before the next ) to emit pkgname + terms. package appstream import ( @@ -15,12 +16,6 @@ import ( "strings" ) -// XML element names referenced more than once in the decoder. -const ( - elKeyword = "keyword" - elCategory = "category" -) - // IndexTerms holds text extracted from one for FTS (merged by pkgname in update.go). type IndexTerms struct { Keywords []string @@ -45,29 +40,24 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, terms IndexTerms) e return err } case xml.EndElement: - if err := p.endElement(t); err != nil { - return err - } + p.endElement(t) case xml.CharData: p.charData(t) } } } -// docParser holds decoder state between tokens. +// docParser holds decoder state between tokens. Rejected / +// blocks are skipped by the decoder so we never see their children — no skip flags needed. type docParser struct { - fn func(string, IndexTerms) error - dec *xml.Decoder - stack []string - inKeywords bool - keywordsBlockSkip bool - inKeyword bool - keywordSkip bool - inCategories bool - categoriesBlockSkip bool - inCategory bool - categorySkip bool - cur *component + fn func(string, IndexTerms) error + dec *xml.Decoder + cur *component + inPkgname bool + inKeywords bool + inKeyword bool + inCategories bool + inCategory bool } // flush emits cur via fn and clears it. EOF calls flush for the last component. @@ -84,92 +74,86 @@ func (p *docParser) flush() error { return p.fn(name, terms) } -// startElement pushes stack; on flushes the previous component then starts a new cur. 
func (p *docParser) startElement(t xml.StartElement) error { - local := t.Name.Local - p.stack = append(p.stack, local) - - switch local { + switch t.Name.Local { case "component": if err := p.flush(); err != nil { return err } p.cur = &component{} + case "pkgname": + if p.cur != nil { + p.inPkgname = true + } case "keywords": + if !keywordLangAccepted(t.Attr) { + return p.dec.Skip() + } p.inKeywords = true - p.keywordsBlockSkip = !keywordLangAccepted(t.Attr) - case elKeyword: - if p.inKeywords { - p.inKeyword = true - p.keywordSkip = !keywordLangAccepted(t.Attr) + case "keyword": + if !p.inKeywords { + return nil + } + if !keywordLangAccepted(t.Attr) { + return p.dec.Skip() } + p.inKeyword = true case "categories": + if !keywordLangAccepted(t.Attr) { + return p.dec.Skip() + } p.inCategories = true - p.categoriesBlockSkip = !keywordLangAccepted(t.Attr) - case elCategory: - if p.inCategories { - p.inCategory = true - p.categorySkip = !keywordLangAccepted(t.Attr) + case "category": + if !p.inCategories { + return nil + } + if !keywordLangAccepted(t.Attr) { + return p.dec.Skip() } + p.inCategory = true } return nil } -// endElement pops stack; on flushes the finished component. -func (p *docParser) endElement(t xml.EndElement) error { - local := t.Name.Local - if len(p.stack) == 0 { - return nil - } - p.stack = p.stack[:len(p.stack)-1] - - switch local { +func (p *docParser) endElement(t xml.EndElement) { + switch t.Name.Local { case "component": - return p.flush() + _ = p.flush() + case "pkgname": + p.inPkgname = false case "keywords": p.inKeywords = false + case "keyword": p.inKeyword = false - p.keywordsBlockSkip = false - case elKeyword: - p.inKeyword = false - p.keywordSkip = false case "categories": p.inCategories = false + case "category": p.inCategory = false - p.categoriesBlockSkip = false - case elCategory: - p.inCategory = false - p.categorySkip = false } - return nil } -// charData collects pkgname, , and text. 
func (p *docParser) charData(t xml.CharData) { if p.cur == nil { return } - text := strings.TrimSpace(string(t)) - if text == "" { + var dst *[]string + switch { + case p.inPkgname: + text := strings.TrimSpace(string(t)) + if text != "" { + p.cur.pkgname += text + } + return + case p.inKeyword: + dst = &p.cur.keywords + case p.inCategory: + dst = &p.cur.categories + default: return } - - parent := "" - if len(p.stack) > 0 { - parent = p.stack[len(p.stack)-1] - } - - switch parent { - case "pkgname": - p.cur.pkgname += text - case elKeyword: - if p.inKeyword && !p.keywordsBlockSkip && !p.keywordSkip { - p.cur.keywords = append(p.cur.keywords, text) - } - case elCategory: - if p.inCategory && !p.categoriesBlockSkip && !p.categorySkip { - p.cur.categories = append(p.cur.categories, text) - } + text := strings.TrimSpace(string(t)) + if text != "" { + *dst = append(*dst, text) } } From 10170267a8eaf23da333a1955bafb007b569f9cb Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:28:51 +0200 Subject: [PATCH 17/18] architecture: note update-appstream does not cover testing repos Upstream archlinux-appstream-data publishes core/extra/multilib only, so core-testing/extra-testing/multilib-testing packages never get keywords or categories columns populated. Document the asymmetry so search-ranking surprises don't lead down a wrong debugging path. 
--- ARCHITECTURE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 395a281d..09b7de2d 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -44,7 +44,7 @@ Seven CLI subcommands fetch data from external sources, invoked by external syst | Command | Source | Notes | |---------|--------|-------| | `update-packages` | Arch mirror `.files` DBs | 6 repos concurrent, ETag change detection, FTS rebuild after | -| `update-appstream` | sources.archlinux.org `archlinux-appstream-data` | core/extra/multilib Components-x86_64.xml.gz, FTS rebuild after | +| `update-appstream` | sources.archlinux.org `archlinux-appstream-data` | core/extra/multilib only (not testing — upstream doesn't publish it), FTS rebuild after | | `update-news` | forum.archlinux.de Flarum API | Paginated, HTML sanitized | | `update-mirrors` | archlinux.org/mirrors/status/json/ | Filtered by active/HTTPS/completion | | `update-releases` | archlinux.org/releng/releases/json/ | ISO URLs, checksums, torrent info | From 4dbe5d926445e567b7625291a0291dc626c1f2f4 Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Mon, 13 Apr 2026 12:30:17 +0200 Subject: [PATCH 18/18] appstream: add end-to-end tests for DB write + FTS rebuild MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the DB-facing portion of Update into applyTerms so tests can drive it directly against in-memory SQLite without mocking the HTTP fetches. Covers: - Keywords + categories land on matching package rows; non-mentioned packages stay empty. - FTS matches on both the new keyword and category columns after the rebuild — catches column-order drift between the schema and the query. - A second run clears stale data from rows no longer in the accumulator. - Duplicates and stopwords are stripped by dedupeWords. - All-stopword input does not update the row at all. 
--- internal/appstream/update.go | 50 +++++--- internal/appstream/update_test.go | 186 ++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 18 deletions(-) create mode 100644 internal/appstream/update_test.go diff --git a/internal/appstream/update.go b/internal/appstream/update.go index b338e133..8086b412 100644 --- a/internal/appstream/update.go +++ b/internal/appstream/update.go @@ -19,6 +19,12 @@ const archlinuxPackageJSON = "https://archlinux.org/packages/extra/any/archlinux var componentRepos = []string{"core", "extra", "multilib"} +// pkgTerms holds merged keyword/category text for one pkgname across all repos. +type pkgTerms struct { + keywords []string + categories []string +} + type pkgJSON struct { Pkgver string `json:"pkgver"` } @@ -63,19 +69,18 @@ func Update(ctx context.Context, db *sql.DB, sourcesBase string) error { } slog.Info("appstream snapshot", "version", version) - type terms struct{ kw, cat []string } - acc := make(map[string]*terms) + acc := make(map[string]*pkgTerms) for _, repo := range componentRepos { var components int err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, t IndexTerms) error { components++ e, ok := acc[name] if !ok { - e = &terms{} + e = &pkgTerms{} acc[name] = e } - e.kw = append(e.kw, t.Keywords...) - e.cat = append(e.cat, t.Categories...) + e.keywords = append(e.keywords, t.Keywords...) + e.categories = append(e.categories, t.Categories...) 
return nil }) if err != nil { @@ -84,51 +89,60 @@ func Update(ctx context.Context, db *sql.DB, sourcesBase string) error { slog.Info("appstream components parsed", "repo", repo, "components", components) } - tx, err := db.BeginTx(ctx, nil) + updated, err := applyTerms(ctx, db, acc) if err != nil { return err } + slog.Info("appstream fields applied", "distinct_names", len(acc), "package_rows", updated) + return nil +} + +// applyTerms clears AppStream columns on every package row, writes the dedup'd +// terms for each pkgname in a single transaction, and rebuilds the FTS index. +// Returns the number of package rows updated. +func applyTerms(ctx context.Context, db *sql.DB, acc map[string]*pkgTerms) (int64, error) { + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return 0, err + } defer func() { _ = tx.Rollback() }() if _, err := tx.ExecContext(ctx, `UPDATE package SET keywords = '', categories = ''`); err != nil { - return fmt.Errorf("clear appstream columns: %w", err) + return 0, fmt.Errorf("clear appstream columns: %w", err) } stmt, err := tx.PrepareContext(ctx, `UPDATE package SET keywords = ?, categories = ? 
WHERE name = ?`) if err != nil { - return fmt.Errorf("prepare appstream update: %w", err) + return 0, fmt.Errorf("prepare appstream update: %w", err) } defer func() { _ = stmt.Close() }() var updated int64 for name, e := range acc { - kw := dedupeWords(e.kw) - cat := dedupeWords(e.cat) + kw := dedupeWords(e.keywords) + cat := dedupeWords(e.categories) if kw == "" && cat == "" { continue } res, err := stmt.ExecContext(ctx, kw, cat, name) if err != nil { - return fmt.Errorf("update appstream fields for %q: %w", name, err) + return 0, fmt.Errorf("update appstream fields for %q: %w", name, err) } n, err := res.RowsAffected() if err != nil { - return err + return 0, err } updated += n } if err := tx.Commit(); err != nil { - return err + return 0, err } - slog.Info("appstream fields applied", "distinct_names", len(acc), "package_rows", updated) - if _, err := db.ExecContext(ctx, `INSERT INTO package_fts(package_fts) VALUES('rebuild')`); err != nil { - return fmt.Errorf("rebuild fts: %w", err) + return updated, fmt.Errorf("rebuild fts: %w", err) } - - return nil + return updated, nil } func fetchRepoComponents(ctx context.Context, client *http.Client, base, version, repo string, fn func(string, IndexTerms) error) error { diff --git a/internal/appstream/update_test.go b/internal/appstream/update_test.go new file mode 100644 index 00000000..61f8c4cc --- /dev/null +++ b/internal/appstream/update_test.go @@ -0,0 +1,186 @@ +package appstream + +import ( + "context" + "database/sql" + "testing" + + "archded/internal/database" +) + +func setupPackageDB(t *testing.T) *sql.DB { + t.Helper() + db, err := database.New(":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + for _, stmt := range []string{ + `INSERT INTO repository (id, name, architecture, testing) VALUES (1, 'extra', 'x86_64', 0)`, + `INSERT INTO package (id, repository_id, name, base, version, description) VALUES + (1, 1, 'firefox', 'firefox', '120.0-1', 'Standalone web browser'), 
+ (2, 1, 'konsole', 'konsole', '23.08-1', 'KDE terminal emulator'), + (3, 1, 'linux', 'linux', '6.7-1', 'The Linux kernel')`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords, categories) + SELECT id, name, base, description, groups, provides, keywords, categories FROM package`, + } { + if _, err := db.Exec(stmt); err != nil { + t.Fatalf("setup: %v", err) + } + } + return db +} + +func TestApplyTerms_WritesAndRebuildsFTS(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + acc := map[string]*pkgTerms{ + "firefox": { + keywords: []string{"internet browser", "www"}, + categories: []string{"Network", "WebBrowser"}, + }, + "konsole": { + keywords: []string{"shell"}, + categories: []string{"System", "TerminalEmulator"}, + }, + // "linux" not in accumulator — its columns should stay empty. + } + + updated, err := applyTerms(ctx, db, acc) + if err != nil { + t.Fatal(err) + } + if updated != 2 { + t.Errorf("updated rows = %d, want 2", updated) + } + + rows := map[string]struct{ kw, cat string }{} + r, err := db.Query(`SELECT name, keywords, categories FROM package`) + if err != nil { + t.Fatal(err) + } + defer func() { _ = r.Close() }() + for r.Next() { + var name, kw, cat string + if err := r.Scan(&name, &kw, &cat); err != nil { + t.Fatal(err) + } + rows[name] = struct{ kw, cat string }{kw, cat} + } + + if rows["firefox"].kw != "internet browser www" { + t.Errorf("firefox keywords = %q", rows["firefox"].kw) + } + if rows["firefox"].cat != "Network WebBrowser" { + t.Errorf("firefox categories = %q", rows["firefox"].cat) + } + if rows["konsole"].cat != "System TerminalEmulator" { + t.Errorf("konsole categories = %q", rows["konsole"].cat) + } + if rows["linux"].kw != "" || rows["linux"].cat != "" { + t.Errorf("linux should have empty appstream columns, got kw=%q cat=%q", + rows["linux"].kw, rows["linux"].cat) + } + + // FTS must match on the new keyword/category content. 
+ var name string + if err := db.QueryRow( + `SELECT name FROM package_fts WHERE package_fts MATCH 'WebBrowser'`).Scan(&name); err != nil { + t.Fatalf("expected firefox via category match: %v", err) + } + if name != "firefox" { + t.Errorf("category match name = %q, want firefox", name) + } + + if err := db.QueryRow( + `SELECT name FROM package_fts WHERE package_fts MATCH 'TerminalEmulator'`).Scan(&name); err != nil { + t.Fatalf("expected konsole via category match: %v", err) + } + if name != "konsole" { + t.Errorf("category match name = %q, want konsole", name) + } +} + +func TestApplyTerms_ClearsStalePriorData(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + // Populate firefox with prior-run AppStream data. + first := map[string]*pkgTerms{ + "firefox": {keywords: []string{"obsolete"}, categories: []string{"OldCategory"}}, + } + if _, err := applyTerms(ctx, db, first); err != nil { + t.Fatal(err) + } + + // Second run no longer mentions firefox (upstream dropped the component). + second := map[string]*pkgTerms{ + "konsole": {keywords: []string{"shell"}, categories: []string{"System"}}, + } + if _, err := applyTerms(ctx, db, second); err != nil { + t.Fatal(err) + } + + var kw, cat string + if err := db.QueryRow(`SELECT keywords, categories FROM package WHERE name = 'firefox'`). + Scan(&kw, &cat); err != nil { + t.Fatal(err) + } + if kw != "" || cat != "" { + t.Errorf("firefox should be cleared on second run, got kw=%q cat=%q", kw, cat) + } + + // And FTS should no longer match the stale term. + err := db.QueryRow( + `SELECT name FROM package_fts WHERE package_fts MATCH 'OldCategory'`).Scan(new(string)) + if err != sql.ErrNoRows { + t.Errorf("stale category still matches in FTS: err=%v", err) + } +} + +func TestApplyTerms_DedupesAndStripsStopwords(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + // Duplicate tokens across multiple "components" + a stopword mixed in. 
+ acc := map[string]*pkgTerms{ + "firefox": { + keywords: []string{"internet and www", "www browser"}, + categories: []string{"Network", "Network"}, + }, + } + if _, err := applyTerms(ctx, db, acc); err != nil { + t.Fatal(err) + } + + var kw, cat string + if err := db.QueryRow(`SELECT keywords, categories FROM package WHERE name = 'firefox'`). + Scan(&kw, &cat); err != nil { + t.Fatal(err) + } + if kw != "internet www browser" { + t.Errorf("keywords = %q, want %q", kw, "internet www browser") + } + if cat != "Network" { + t.Errorf("categories = %q, want %q", cat, "Network") + } +} + +func TestApplyTerms_SkipsEmptyAfterDedupe(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + // All-stopword keywords → dedupeWords returns ""; no row should be updated. + acc := map[string]*pkgTerms{ + "firefox": {keywords: []string{"the and or"}, categories: nil}, + } + updated, err := applyTerms(ctx, db, acc) + if err != nil { + t.Fatal(err) + } + if updated != 0 { + t.Errorf("updated = %d, want 0 (all-stopword input)", updated) + } +}