diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5f5b6e08..f8a8158d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,11 @@ name: CI
 
 on:
   push:
+    branches:
+      - main
   pull_request:
+    branches:
+      - main
   workflow_dispatch:
 
 jobs:
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 6ab435aa..09b7de2d 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -39,11 +39,12 @@ Key indexes: FTS5 virtual table for package search (name, base, description, groups)
 
 ## Data Updates
 
-Six CLI subcommands fetch data from external sources, invoked by external systemd timers:
+Seven CLI subcommands fetch data from external sources, invoked by external systemd timers:
 
 | Command | Source | Notes |
 |---------|--------|-------|
 | `update-packages` | Arch mirror `.files` DBs | 6 repos concurrent, ETag change detection, FTS rebuild after |
+| `update-appstream` | sources.archlinux.org `archlinux-appstream-data` | core/extra/multilib only (not testing — upstream doesn't publish it), FTS rebuild after |
 | `update-news` | forum.archlinux.de Flarum API | Paginated, HTML sanitized |
 | `update-mirrors` | archlinux.org/mirrors/status/json/ | Filtered by active/HTTPS/completion |
 | `update-releases` | archlinux.org/releng/releases/json/ | ISO URLs, checksums, torrent info |
@@ -52,9 +53,9 @@ Six CLI subcommands fetch data from external systemd timers:
 
 ## Search
 
-FTS5 indexes: name, base, description, groups, provides (denormalized from package relations). Hyphenated queries are split into individual terms for tokenizer compatibility.
+FTS5 indexes: name, base, description, groups, provides, keywords, categories (AppStream `<keyword>` / `<category>` inside blocks whose `xml:lang` is absent or en/de). Hyphenated queries are split into individual terms for tokenizer compatibility.
 
-Ranking: exact name match first, then `bm25(10,5,1,1,3) - ln(1+popularity)` — name-weighted with log-scaled popularity boost.
+Ranking: exact name match first, then `bm25` with higher weights on name/description than on keywords (so long keyword blobs do not bury short pacman descriptions), minus `ln(1+popularity)`.
 
 ## Middleware Stack
 
diff --git a/internal/appstream/parse.go b/internal/appstream/parse.go
new file mode 100644
index 00000000..63be7344
--- /dev/null
+++ b/internal/appstream/parse.go
@@ -0,0 +1,205 @@
+// Package appstream parses Arch Linux AppStream component XML (from
+// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and
+// builds per-pkgname search text for SQLite FTS.
+//
+// Parsing model: encoding/xml streams tokens; we keep one open <component> in
+// docParser.cur. We read <pkgname>, <keywords>/<keyword>, and <categories>/<category>.
+// AppStream puts xml:lang on parent blocks; only neutral or en/de blocks are indexed —
+// rejected blocks are skipped wholesale via xml.Decoder.Skip. flush runs at
+// </component> (and before the next <component>) to emit pkgname + terms.
+package appstream
+
+import (
+	"encoding/xml"
+	"errors"
+	"io"
+	"strings"
+)
+
+// IndexTerms holds text extracted from one <component> for FTS (merged by pkgname in update.go).
+type IndexTerms struct {
+	Keywords   []string
+	Categories []string
+}
+
+// ParseComponentsXML streams the decoder and calls fn once per completed <component>
+// (same pkgname may appear many times). dedupeWords runs in the caller after merge.
+func ParseComponentsXML(r io.Reader, fn func(pkgname string, terms IndexTerms) error) error {
+	p := &docParser{fn: fn, dec: xml.NewDecoder(r)}
+	for {
+		tok, err := p.dec.Token()
+		if errors.Is(err, io.EOF) {
+			return p.flush()
+		}
+		if err != nil {
+			return err
+		}
+		switch t := tok.(type) {
+		case xml.StartElement:
+			if err := p.startElement(t); err != nil {
+				return err
+			}
+		case xml.EndElement:
+			p.endElement(t)
+		case xml.CharData:
+			p.charData(t)
+		}
+	}
+}
+
+// docParser holds decoder state between tokens. Rejected <keywords>/<categories>
+// blocks are skipped by the decoder so we never see their children — no skip flags needed.
+type docParser struct {
+	fn           func(string, IndexTerms) error
+	dec          *xml.Decoder
+	cur          *component
+	inPkgname    bool
+	inKeywords   bool
+	inKeyword    bool
+	inCategories bool
+	inCategory   bool
+}
+
+// flush emits cur via fn and clears it. EOF calls flush for the last component.
+func (p *docParser) flush() error {
+	if p.cur == nil {
+		return nil
+	}
+	name := strings.TrimSpace(p.cur.pkgname)
+	terms := IndexTerms{Keywords: p.cur.keywords, Categories: p.cur.categories}
+	p.cur = nil
+	if name == "" {
+		return nil
+	}
+	return p.fn(name, terms)
+}
+
+func (p *docParser) startElement(t xml.StartElement) error {
+	switch t.Name.Local {
+	case "component":
+		if err := p.flush(); err != nil {
+			return err
+		}
+		p.cur = &component{}
+	case "pkgname":
+		if p.cur != nil {
+			p.inPkgname = true
+		}
+	case "keywords":
+		if !keywordLangAccepted(t.Attr) {
+			return p.dec.Skip()
+		}
+		p.inKeywords = true
+	case "keyword":
+		if !p.inKeywords {
+			return nil
+		}
+		if !keywordLangAccepted(t.Attr) {
+			return p.dec.Skip()
+		}
+		p.inKeyword = true
+	case "categories":
+		if !keywordLangAccepted(t.Attr) {
+			return p.dec.Skip()
+		}
+		p.inCategories = true
+	case "category":
+		if !p.inCategories {
+			return nil
+		}
+		if !keywordLangAccepted(t.Attr) {
+			return p.dec.Skip()
+		}
+		p.inCategory = true
+	}
+	return nil
+}
+
+func (p *docParser) endElement(t xml.EndElement) {
+	switch t.Name.Local {
+	case "component":
+		_ = p.flush()
+	case "pkgname":
+		p.inPkgname = false
+	case "keywords":
+		p.inKeywords = false
+	case "keyword":
+		p.inKeyword = false
+	case "categories":
+		p.inCategories = false
+	case "category":
+		p.inCategory = false
+	}
+}
+
+func (p *docParser) charData(t xml.CharData) {
+	if p.cur == nil {
+		return
+	}
+	var dst *[]string
+	switch {
+	case p.inPkgname:
+		text := strings.TrimSpace(string(t))
+		if text != "" {
+			p.cur.pkgname += text
+		}
+		return
+	case p.inKeyword:
+		dst = &p.cur.keywords
+	case p.inCategory:
+		dst = &p.cur.categories
+	default:
+		return
+	}
+	text := strings.TrimSpace(string(t))
+	if text != "" {
+		*dst = append(*dst, text)
+	}
+}
+
+// keywordLangAccepted is used for <keywords>, <keyword>, <categories>, and <category>
+// start tags: true if there is no xml:lang, or it is en/de (including BCP47 prefixes like de-DE).
+func keywordLangAccepted(attrs []xml.Attr) bool {
+	for _, a := range attrs {
+		if a.Name.Local != "lang" || a.Value == "" {
+			continue
+		}
+		v := strings.ToLower(strings.TrimSpace(a.Value))
+		if i := strings.IndexByte(v, '-'); i > 0 {
+			v = v[:i]
+		}
+		return v == "en" || v == "de"
+	}
+	return true
+}
+
+// component is one AppStream <component> being accumulated until flush.
+type component struct {
+	pkgname    string
+	keywords   []string
+	categories []string
+}
+
+// dedupeWords joins fragments, removes English/German stop words, and deduplicates
+// tokens (case-insensitive) for FTS.
+func dedupeWords(parts []string) string {
+	seen := make(map[string]struct{})
+	var b strings.Builder
+	for _, part := range parts {
+		for _, w := range strings.Fields(part) {
+			key := strings.ToLower(w)
+			if _, ok := stopword[key]; ok {
+				continue
+			}
+			if _, ok := seen[key]; ok {
+				continue
+			}
+			seen[key] = struct{}{}
+			if b.Len() > 0 {
+				b.WriteByte(' ')
+			}
+			b.WriteString(w)
+		}
+	}
+	return b.String()
+}
diff --git a/internal/appstream/parse_test.go b/internal/appstream/parse_test.go
new file mode 100644
index 00000000..a2e6ac4e
--- /dev/null
+++ b/internal/appstream/parse_test.go
@@ -0,0 +1,139 @@
+package appstream
+
+import (
+	"encoding/xml"
+	"strings"
+	"testing"
+)
+
+func TestParseComponentsXML_KeywordsAndCategories(t *testing.T) {
+	const xml = `<components>
+  <component type="desktop-application">
+    <pkgname>firefox</pkgname>
+    <name>Firefox</name>
+    <summary>Web browser</summary>
+    <description>
+      <p>Free software web browser.</p>
+    </description>
+    <categories><category>Network</category><category>WebBrowser</category></categories>
+    <keywords><keyword>internet</keyword><keyword>www</keyword></keywords>
+  </component>
+  <component type="desktop-application">
+    <pkgname>firefox</pkgname>
+    <name>Firefox ESR</name>
+    <keywords><keyword>mozilla</keyword></keywords>
+  </component>
+</components>`
+	accKW := make(map[string][]string)
+	accCat := make(map[string][]string)
+	err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error {
+		accKW[name] = append(accKW[name], terms.Keywords...)
+		accCat[name] = append(accCat[name], terms.Categories...)
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	gotKW := dedupeWords(accKW["firefox"])
+	gotCat := dedupeWords(accCat["firefox"])
+	if gotKW == "" {
+		t.Fatal("expected merged keywords for firefox")
+	}
+	if !strings.Contains(gotKW, "internet") || !strings.Contains(gotKW, "www") || !strings.Contains(gotKW, "mozilla") {
+		t.Errorf("expected keyword terms, got %q", gotKW)
+	}
+	if strings.Contains(strings.ToLower(gotKW), "browser") {
+		t.Errorf("did not expect description text in keywords, got %q", gotKW)
+	}
+	if !strings.Contains(gotCat, "Network") || !strings.Contains(gotCat, "WebBrowser") {
+		t.Errorf("expected category terms, got %q", gotCat)
+	}
+}
+
+func TestParseComponentsXML_KeywordLangFilter(t *testing.T) {
+	// AppStream often sets xml:lang on <keywords>, not on each <keyword>.
+	const xml = `<components>
+  <component type="desktop-application">
+    <pkgname>demo</pkgname>
+    <keywords>
+      <keyword>neutral</keyword>
+    </keywords>
+    <keywords xml:lang="en">
+      <keyword>english</keyword>
+    </keywords>
+    <keywords xml:lang="de">
+      <keyword>deutsch</keyword>
+    </keywords>
+    <keywords xml:lang="de-DE">
+      <keyword>deutsch2</keyword>
+    </keywords>
+    <keywords xml:lang="fr">
+      <keyword>francais</keyword>
+    </keywords>
+  </component>
+</components>`
+	acc := make(map[string][]string)
+	err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error {
+		acc[name] = append(acc[name], terms.Keywords...)
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	got := dedupeWords(acc["demo"])
+	for _, need := range []string{"neutral", "english", "deutsch", "deutsch2"} {
+		if !strings.Contains(got, need) {
+			t.Errorf("missing %q in %q", need, got)
+		}
+	}
+	if strings.Contains(got, "francais") {
+		t.Errorf("did not want fr keyword, got %q", got)
+	}
+}
+
+func TestParseComponentsXML_CategoriesLangFilter(t *testing.T) {
+	const xml = `<components>
+  <component type="desktop-application">
+    <pkgname>demo</pkgname>
+    <categories><category>NeutralCat</category></categories>
+    <categories xml:lang="de"><category>DeutschCat</category></categories>
+    <categories xml:lang="fr"><category>FrCat</category></categories>
+  </component>
+</components>`
+	acc := make(map[string][]string)
+	err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error {
+		acc[name] = append(acc[name], terms.Categories...)
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	got := dedupeWords(acc["demo"])
+	if !strings.Contains(got, "NeutralCat") || !strings.Contains(got, "DeutschCat") {
+		t.Errorf("want neutral+de categories, got %q", got)
+	}
+	if strings.Contains(got, "FrCat") {
+		t.Errorf("did not want fr category, got %q", got)
+	}
+}
+
+func TestKeywordLangAccepted(t *testing.T) {
+	tests := []struct {
+		attrs []xml.Attr
+		want  bool
+	}{
+		{nil, true},
+		{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "en"}}, true},
+		{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "de"}}, true},
+		{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "de-AT"}}, true},
+		{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "fr"}}, false},
+		{[]xml.Attr{{Name: xml.Name{Local: "lang"}, Value: "pl"}}, false},
+	}
+	for _, tt := range tests {
+		if got := keywordLangAccepted(tt.attrs); got != tt.want {
+			t.Errorf("keywordLangAccepted(%v) = %v, want %v", tt.attrs, got, tt.want)
+		}
+	}
+}
diff --git a/internal/appstream/stopwords.go b/internal/appstream/stopwords.go
new file mode 100644
index 00000000..57e31bea
--- /dev/null
+++ b/internal/appstream/stopwords.go
@@ -0,0 +1,45 @@
+package appstream
+
+// stopword is a small English + German closed-class word set (articles,
+// conjunctions, common prepositions, auxiliaries, pronouns). It trims noise for
+// FTS without pulling in NLP dependencies. Extend deliberately: short words like
+// "go" or "c" are omitted because they double as names.
+var stopword map[string]struct{}
+
+func init() {
+	words := []string{
+		// English
+		"a", "about", "after", "again", "all", "am", "an", "and", "any", "are", "as", "at",
+		"be", "been", "before", "being", "between", "both", "but", "by",
+		"can", "could",
+		"did", "do", "does", "doing", "done", "during",
+		"each", "few", "for", "from", "further",
+		"had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
+		"i", "if", "in", "into", "is", "it", "its", "itself",
+		"just",
+		"me", "more", "most", "my", "myself",
+		"no", "nor", "not",
+		"of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own",
+		"same", "she", "should", "so", "some", "such",
+		"than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too",
+		"under", "until", "up",
+		"very",
+		"was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would",
+		"you", "your", "yours", "yourself", "yourselves",
+		// German
+		"als", "am", "an", "auch", "auf", "aus", "bei", "bin", "bis", "bist", "da", "das", "dass", "dein", "deine",
+		"dem", "den", "der", "des", "dich", "die", "dir", "doch", "du", "durch", "ein", "eine", "einem", "einen", "einer",
+		"eines", "er", "es", "euch", "euer", "eure", "für", "hab", "habe", "haben", "hast", "hat", "hatte", "hatten", "hattest",
+		"hattet", "hier", "ich", "ihm", "ihn", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "in", "ist", "ja", "jede",
+		"jedem", "jeden", "jeder", "jedes", "kann", "kannst", "können", "könnt", "machen", "man", "mein", "meine", "mich", "mir",
+		"mit", "muss", "musst", "nach", "nicht", "noch", "nun", "nur", "ob", "oder", "ohne", "seid", "sein", "seine", "seinem",
+		"seinen", "seiner", "seines", "sich", "sie", "sind", "so", "soll", "sollen", "sollst", "sollt", "sonst", "sowie", "um",
+		"und", "uns", "unser", "unsere", "unter", "vom", "von", "vor", "war", "waren", "warst", "wart", "was", "weg", "weil",
+		"weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "wer", "werde", "werden", "werdet", "wie",
+		"wieder", "will", "wir", "wird", "wirst", "wo", "wohin", "wollen", "wollt", "würde", "würden", "zu", "zum", "zur", "über",
+	}
+	stopword = make(map[string]struct{}, len(words))
+	for _, w := range words {
+		stopword[w] = struct{}{}
+	}
+}
diff --git a/internal/appstream/stopwords_test.go b/internal/appstream/stopwords_test.go
new file mode 100644
index 00000000..6bb51771
--- /dev/null
+++ b/internal/appstream/stopwords_test.go
@@ -0,0 +1,19 @@
+package appstream
+
+import "testing"
+
+func TestDedupeWords_Stopwords(t *testing.T) {
+	got := dedupeWords([]string{"The cat and the dog in a box"})
+	want := "cat dog box"
+	if got != want {
+		t.Fatalf("got %q want %q", got, want)
+	}
+}
+
+func TestDedupeWords_GermanStopwords(t *testing.T) {
+	got := dedupeWords([]string{"der schnelle braune Fuchs"})
+	want := "schnelle braune Fuchs"
+	if got != want {
+		t.Fatalf("got %q want %q", got, want)
+	}
+}
diff --git a/internal/appstream/update.go b/internal/appstream/update.go
new file mode 100644
index 00000000..8086b412
--- /dev/null
+++ b/internal/appstream/update.go
@@ -0,0 +1,172 @@
+package appstream
+
+import (
+	"compress/gzip"
+	"context"
+	"database/sql"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"strings"
+)
+
+// archlinuxPackageJSON is the official package metadata used to resolve the
+// appstream-data snapshot directory name (pkgver). The XML base URL is passed
+// into Update as sourcesBase (from config.APPSTREAM_SOURCES_BASE / CLI).
+const archlinuxPackageJSON = "https://archlinux.org/packages/extra/any/archlinux-appstream-data/json/" + +var componentRepos = []string{"core", "extra", "multilib"} + +// pkgTerms holds merged keyword/category text for one pkgname across all repos. +type pkgTerms struct { + keywords []string + categories []string +} + +type pkgJSON struct { + Pkgver string `json:"pkgver"` +} + +// latestRelease returns the snapshot directory name (e.g. "20260326") matching +// the current extra/any archlinux-appstream-data package in the official repos. +func latestRelease(ctx context.Context, client *http.Client) (string, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, archlinuxPackageJSON, nil) + if err != nil { + return "", err + } + req.Header.Set("User-Agent", "archded/1.0 (+https://www.archlinux.de)") + + resp, err := client.Do(req) + if err != nil { + return "", fmt.Errorf("fetch package json: %w", err) + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("fetch package json: status %d", resp.StatusCode) + } + var p pkgJSON + if err := json.NewDecoder(resp.Body).Decode(&p); err != nil { + return "", fmt.Errorf("decode package json: %w", err) + } + if p.Pkgver == "" { + return "", errors.New("empty pkgver in package json") + } + return p.Pkgver, nil +} + +// Update downloads AppStream component XML for core, extra, and multilib from +// sourcesBase (see config.go), merges keywords and categories +// by package name, writes both columns, and rebuilds the FTS index. 
+func Update(ctx context.Context, db *sql.DB, sourcesBase string) error { + client := &http.Client{} + sourcesBase = strings.TrimSuffix(sourcesBase, "/") + "/" + + version, err := latestRelease(ctx, client) + if err != nil { + return err + } + slog.Info("appstream snapshot", "version", version) + + acc := make(map[string]*pkgTerms) + for _, repo := range componentRepos { + var components int + err := fetchRepoComponents(ctx, client, sourcesBase, version, repo, func(name string, t IndexTerms) error { + components++ + e, ok := acc[name] + if !ok { + e = &pkgTerms{} + acc[name] = e + } + e.keywords = append(e.keywords, t.Keywords...) + e.categories = append(e.categories, t.Categories...) + return nil + }) + if err != nil { + return fmt.Errorf("repo %s: %w", repo, err) + } + slog.Info("appstream components parsed", "repo", repo, "components", components) + } + + updated, err := applyTerms(ctx, db, acc) + if err != nil { + return err + } + slog.Info("appstream fields applied", "distinct_names", len(acc), "package_rows", updated) + return nil +} + +// applyTerms clears AppStream columns on every package row, writes the dedup'd +// terms for each pkgname in a single transaction, and rebuilds the FTS index. +// Returns the number of package rows updated. +func applyTerms(ctx context.Context, db *sql.DB, acc map[string]*pkgTerms) (int64, error) { + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return 0, err + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.ExecContext(ctx, `UPDATE package SET keywords = '', categories = ''`); err != nil { + return 0, fmt.Errorf("clear appstream columns: %w", err) + } + + stmt, err := tx.PrepareContext(ctx, `UPDATE package SET keywords = ?, categories = ? 
WHERE name = ?`) + if err != nil { + return 0, fmt.Errorf("prepare appstream update: %w", err) + } + defer func() { _ = stmt.Close() }() + + var updated int64 + for name, e := range acc { + kw := dedupeWords(e.keywords) + cat := dedupeWords(e.categories) + if kw == "" && cat == "" { + continue + } + res, err := stmt.ExecContext(ctx, kw, cat, name) + if err != nil { + return 0, fmt.Errorf("update appstream fields for %q: %w", name, err) + } + n, err := res.RowsAffected() + if err != nil { + return 0, err + } + updated += n + } + + if err := tx.Commit(); err != nil { + return 0, err + } + + if _, err := db.ExecContext(ctx, `INSERT INTO package_fts(package_fts) VALUES('rebuild')`); err != nil { + return updated, fmt.Errorf("rebuild fts: %w", err) + } + return updated, nil +} + +func fetchRepoComponents(ctx context.Context, client *http.Client, base, version, repo string, fn func(string, IndexTerms) error) error { + u := fmt.Sprintf("%s%s/%s/Components-x86_64.xml.gz", base, version, repo) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return err + } + req.Header.Set("User-Agent", "archded/1.0 (+https://www.archlinux.de)") + + resp, err := client.Do(req) + if err != nil { + return err + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("GET %s: status %d", u, resp.StatusCode) + } + + gz, err := gzip.NewReader(resp.Body) + if err != nil { + return fmt.Errorf("gzip %s: %w", u, err) + } + defer func() { _ = gz.Close() }() + + return ParseComponentsXML(gz, fn) +} diff --git a/internal/appstream/update_test.go b/internal/appstream/update_test.go new file mode 100644 index 00000000..61f8c4cc --- /dev/null +++ b/internal/appstream/update_test.go @@ -0,0 +1,186 @@ +package appstream + +import ( + "context" + "database/sql" + "testing" + + "archded/internal/database" +) + +func setupPackageDB(t *testing.T) *sql.DB { + t.Helper() + db, err := database.New(":memory:") + if err != nil 
{ + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + for _, stmt := range []string{ + `INSERT INTO repository (id, name, architecture, testing) VALUES (1, 'extra', 'x86_64', 0)`, + `INSERT INTO package (id, repository_id, name, base, version, description) VALUES + (1, 1, 'firefox', 'firefox', '120.0-1', 'Standalone web browser'), + (2, 1, 'konsole', 'konsole', '23.08-1', 'KDE terminal emulator'), + (3, 1, 'linux', 'linux', '6.7-1', 'The Linux kernel')`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords, categories) + SELECT id, name, base, description, groups, provides, keywords, categories FROM package`, + } { + if _, err := db.Exec(stmt); err != nil { + t.Fatalf("setup: %v", err) + } + } + return db +} + +func TestApplyTerms_WritesAndRebuildsFTS(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + acc := map[string]*pkgTerms{ + "firefox": { + keywords: []string{"internet browser", "www"}, + categories: []string{"Network", "WebBrowser"}, + }, + "konsole": { + keywords: []string{"shell"}, + categories: []string{"System", "TerminalEmulator"}, + }, + // "linux" not in accumulator — its columns should stay empty. 
+ } + + updated, err := applyTerms(ctx, db, acc) + if err != nil { + t.Fatal(err) + } + if updated != 2 { + t.Errorf("updated rows = %d, want 2", updated) + } + + rows := map[string]struct{ kw, cat string }{} + r, err := db.Query(`SELECT name, keywords, categories FROM package`) + if err != nil { + t.Fatal(err) + } + defer func() { _ = r.Close() }() + for r.Next() { + var name, kw, cat string + if err := r.Scan(&name, &kw, &cat); err != nil { + t.Fatal(err) + } + rows[name] = struct{ kw, cat string }{kw, cat} + } + + if rows["firefox"].kw != "internet browser www" { + t.Errorf("firefox keywords = %q", rows["firefox"].kw) + } + if rows["firefox"].cat != "Network WebBrowser" { + t.Errorf("firefox categories = %q", rows["firefox"].cat) + } + if rows["konsole"].cat != "System TerminalEmulator" { + t.Errorf("konsole categories = %q", rows["konsole"].cat) + } + if rows["linux"].kw != "" || rows["linux"].cat != "" { + t.Errorf("linux should have empty appstream columns, got kw=%q cat=%q", + rows["linux"].kw, rows["linux"].cat) + } + + // FTS must match on the new keyword/category content. + var name string + if err := db.QueryRow( + `SELECT name FROM package_fts WHERE package_fts MATCH 'WebBrowser'`).Scan(&name); err != nil { + t.Fatalf("expected firefox via category match: %v", err) + } + if name != "firefox" { + t.Errorf("category match name = %q, want firefox", name) + } + + if err := db.QueryRow( + `SELECT name FROM package_fts WHERE package_fts MATCH 'TerminalEmulator'`).Scan(&name); err != nil { + t.Fatalf("expected konsole via category match: %v", err) + } + if name != "konsole" { + t.Errorf("category match name = %q, want konsole", name) + } +} + +func TestApplyTerms_ClearsStalePriorData(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + // Populate firefox with prior-run AppStream data. 
+ first := map[string]*pkgTerms{ + "firefox": {keywords: []string{"obsolete"}, categories: []string{"OldCategory"}}, + } + if _, err := applyTerms(ctx, db, first); err != nil { + t.Fatal(err) + } + + // Second run no longer mentions firefox (upstream dropped the component). + second := map[string]*pkgTerms{ + "konsole": {keywords: []string{"shell"}, categories: []string{"System"}}, + } + if _, err := applyTerms(ctx, db, second); err != nil { + t.Fatal(err) + } + + var kw, cat string + if err := db.QueryRow(`SELECT keywords, categories FROM package WHERE name = 'firefox'`). + Scan(&kw, &cat); err != nil { + t.Fatal(err) + } + if kw != "" || cat != "" { + t.Errorf("firefox should be cleared on second run, got kw=%q cat=%q", kw, cat) + } + + // And FTS should no longer match the stale term. + err := db.QueryRow( + `SELECT name FROM package_fts WHERE package_fts MATCH 'OldCategory'`).Scan(new(string)) + if err != sql.ErrNoRows { + t.Errorf("stale category still matches in FTS: err=%v", err) + } +} + +func TestApplyTerms_DedupesAndStripsStopwords(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + // Duplicate tokens across multiple "components" + a stopword mixed in. + acc := map[string]*pkgTerms{ + "firefox": { + keywords: []string{"internet and www", "www browser"}, + categories: []string{"Network", "Network"}, + }, + } + if _, err := applyTerms(ctx, db, acc); err != nil { + t.Fatal(err) + } + + var kw, cat string + if err := db.QueryRow(`SELECT keywords, categories FROM package WHERE name = 'firefox'`). + Scan(&kw, &cat); err != nil { + t.Fatal(err) + } + if kw != "internet www browser" { + t.Errorf("keywords = %q, want %q", kw, "internet www browser") + } + if cat != "Network" { + t.Errorf("categories = %q, want %q", cat, "Network") + } +} + +func TestApplyTerms_SkipsEmptyAfterDedupe(t *testing.T) { + db := setupPackageDB(t) + ctx := context.Background() + + // All-stopword keywords → dedupeWords returns ""; no row should be updated. 
+ acc := map[string]*pkgTerms{ + "firefox": {keywords: []string{"the and or"}, categories: nil}, + } + updated, err := applyTerms(ctx, db, acc) + if err != nil { + t.Fatal(err) + } + if updated != 0 { + t.Errorf("updated = %d, want 0 (all-stopword input)", updated) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 401cb44d..836ec993 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -6,18 +6,20 @@ import ( ) type Config struct { - Database string - Port string - PackagesMirror string - DefaultMirror string + Database string + Port string + PackagesMirror string + DefaultMirror string + AppStreamSourcesBase string } func Load() (Config, error) { cfg := Config{ - Database: getEnv("DATABASE", ""), - Port: getEnv("PORT", "8080"), - PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), - DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + Database: getEnv("DATABASE", ""), + Port: getEnv("PORT", "8080"), + PackagesMirror: getEnv("PACKAGES_MIRROR", "https://geo.mirror.pkgbuild.com/"), + DefaultMirror: getEnv("DEFAULT_MIRROR", "https://geo.mirror.pkgbuild.com/"), + AppStreamSourcesBase: getEnv("APPSTREAM_SOURCES_BASE", "https://sources.archlinux.org/other/packages/archlinux-appstream-data/"), } if cfg.Database == "" { diff --git a/internal/database/migrations/000004_package_keywords.down.sql b/internal/database/migrations/000004_package_keywords.down.sql new file mode 100644 index 00000000..4479aae3 --- /dev/null +++ b/internal/database/migrations/000004_package_keywords.down.sql @@ -0,0 +1,10 @@ +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); + +ALTER TABLE package DROP COLUMN keywords; diff --git a/internal/database/migrations/000004_package_keywords.up.sql 
b/internal/database/migrations/000004_package_keywords.up.sql new file mode 100644 index 00000000..d030841e --- /dev/null +++ b/internal/database/migrations/000004_package_keywords.up.sql @@ -0,0 +1,10 @@ +ALTER TABLE package ADD COLUMN keywords TEXT NOT NULL DEFAULT ''; + +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, keywords, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); diff --git a/internal/database/migrations/000005_package_categories.down.sql b/internal/database/migrations/000005_package_categories.down.sql new file mode 100644 index 00000000..fe6a1dc1 --- /dev/null +++ b/internal/database/migrations/000005_package_categories.down.sql @@ -0,0 +1,10 @@ +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, keywords, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); + +ALTER TABLE package DROP COLUMN categories; diff --git a/internal/database/migrations/000005_package_categories.up.sql b/internal/database/migrations/000005_package_categories.up.sql new file mode 100644 index 00000000..344d1bc9 --- /dev/null +++ b/internal/database/migrations/000005_package_categories.up.sql @@ -0,0 +1,10 @@ +ALTER TABLE package ADD COLUMN categories TEXT NOT NULL DEFAULT ''; + +DROP TABLE package_fts; + +CREATE VIRTUAL TABLE package_fts USING fts5( + name, base, description, groups, provides, keywords, categories, + content='package', content_rowid='id' +); + +INSERT INTO package_fts(package_fts) VALUES('rebuild'); diff --git a/internal/packagedetail/handler_test.go b/internal/packagedetail/handler_test.go index 82781896..d0effe26 100644 --- a/internal/packagedetail/handler_test.go +++ b/internal/packagedetail/handler_test.go @@ -44,8 +44,8 @@ func setupHandlerDB(t *testing.T) *sql.DB { usr/share/bash/bash_completion')`, // Populate FTS 
- `INSERT INTO package_fts (rowid, name, base, description, groups, provides) - SELECT id, name, base, description, groups, provides FROM package`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords, categories) + SELECT id, name, base, description, groups, provides, keywords, categories FROM package`, } { if _, err := db.Exec(stmt); err != nil { t.Fatalf("setup: %v", err) diff --git a/internal/packages/repository.go b/internal/packages/repository.go index 825ba699..c0c733be 100644 --- a/internal/packages/repository.go +++ b/internal/packages/repository.go @@ -4,11 +4,25 @@ import ( "context" "database/sql" "errors" + "fmt" "strings" fts "archded/internal/search" ) +// bm25Weights are per FTS5 column: name, base, description, groups, provides, keywords, categories. +// Long AppStream fields increase BM25 document length; description (short pacman text) must +// stay heavily weighted so queries like "browser" still rank packages that only match there. +const ( + bm25Name = 12 + bm25Base = 5 + bm25Description = 10 + bm25Groups = 1 + bm25Provides = 3 + bm25Keywords = 0.5 + bm25Categories = 0.5 +) + type PackageSummary struct { Repository string Architecture string @@ -93,8 +107,12 @@ func (r *Repository) Search(ctx context.Context, search, repo, arch string, limi } countQuery = `SELECT COUNT(*) ` + baseWhere + bm25 := fmt.Sprintf( + "bm25(package_fts, %d, %d, %d, %d, %d, %g, %g)", + bm25Name, bm25Base, bm25Description, bm25Groups, bm25Provides, bm25Keywords, bm25Categories, + ) dataQuery = `SELECT r.name, r.architecture, p.name, p.version, p.description, p.build_date, p.popularity_recent, r.testing - ` + baseWhere + ` ORDER BY (p.name = ?) DESC, bm25(package_fts, 10, 5, 1, 1, 3) - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? OFFSET ?` + ` + baseWhere + ` ORDER BY (p.name = ?) DESC, ` + bm25 + ` - ln(1 + p.popularity_recent), p.build_date DESC LIMIT ? 
OFFSET ?` dataArgs = append(dataArgs, search, limit, offset) } else { baseWhere := `FROM package p diff --git a/internal/packages/repository_test.go b/internal/packages/repository_test.go index a0041923..35cbb262 100644 --- a/internal/packages/repository_test.go +++ b/internal/packages/repository_test.go @@ -31,8 +31,8 @@ func setupTestDB(t *testing.T) *sql.DB { (4, 3, 'linux', 'linux', '6.7-rc1', 'The Linux kernel (testing)', 1700400000, 'Jan', 0.0)`, // Populate FTS - `INSERT INTO package_fts (rowid, name, base, description, groups, provides) - SELECT id, name, base, description, groups, provides FROM package`, + `INSERT INTO package_fts (rowid, name, base, description, groups, provides, keywords, categories) + SELECT id, name, base, description, groups, provides, keywords, categories FROM package`, } { if _, err := db.Exec(stmt); err != nil { t.Fatalf("setup: %s...: %v", stmt[:40], err) diff --git a/justfile b/justfile index 32fe5afe..ff75886a 100644 --- a/justfile +++ b/justfile @@ -98,12 +98,16 @@ update-pnpm: update: update-go update-pnpm # fetch all external data into the local database -update-data: update-packages update-news update-mirrors update-releases update-package-popularities update-mirror-popularities +update-data: update-packages update-appstream update-news update-mirrors update-releases update-package-popularities update-mirror-popularities # fetch package data from Arch Linux repositories update-packages: go run . update-packages +# fetch AppStream component metadata (keywords for package search) +update-appstream: + go run . update-appstream + # fetch news from archlinux.org update-news: go run . 
update-news diff --git a/main.go b/main.go index aed50dd9..7cdfb543 100644 --- a/main.go +++ b/main.go @@ -7,6 +7,7 @@ import ( "os" "time" + "archded/internal/appstream" "archded/internal/config" "archded/internal/database" "archded/internal/legacy" @@ -85,6 +86,11 @@ func runCommand(cmd string, cfg config.Config) int { slog.Error("update-mirror-popularities failed", "error", err) return 1 } + case "update-appstream": + if err := appstream.Update(ctx, db, cfg.AppStreamSourcesBase); err != nil { + slog.Error("update-appstream failed", "error", err) + return 1 + } default: slog.Error("unknown command", "command", cmd) //nolint:gosec // cmd is from os.Args return 1