Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ name: CI

on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:

jobs:
Expand Down
7 changes: 4 additions & 3 deletions ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ Key indexes: FTS5 virtual table for package search (name, base, description, gro

## Data Updates

Six CLI subcommands fetch data from external sources, invoked by external systemd timers:
Seven CLI subcommands fetch data from external sources, invoked by external systemd timers:

| Command | Source | Notes |
|---------|--------|-------|
| `update-packages` | Arch mirror `.files` DBs | 6 repos concurrent, ETag change detection, FTS rebuild after |
| `update-appstream` | sources.archlinux.org `archlinux-appstream-data` | core/extra/multilib only (not testing — upstream doesn't publish it), FTS rebuild after |
| `update-news` | forum.archlinux.de Flarum API | Paginated, HTML sanitized |
| `update-mirrors` | archlinux.org/mirrors/status/json/ | Filtered by active/HTTPS/completion |
| `update-releases` | archlinux.org/releng/releases/json/ | ISO URLs, checksums, torrent info |
Expand All @@ -52,9 +53,9 @@ Six CLI subcommands fetch data from external sources, invoked by external system

## Search

FTS5 indexes: name, base, description, groups, provides (denormalized from package relations). Hyphenated queries are split into individual terms for tokenizer compatibility.
FTS5 indexes: name, base, description, groups, provides, keywords, categories (AppStream `<keyword>` / `<category>` inside blocks whose `xml:lang` is absent or en/de). Hyphenated queries are split into individual terms for tokenizer compatibility.

Ranking: exact name match first, then `bm25(10,5,1,1,3) - ln(1+popularity)` — name-weighted with log-scaled popularity boost.
Ranking: exact name match first, then `bm25` with higher weights on name/description than on keywords (so long keyword blobs do not bury short pacman descriptions), minus `ln(1+popularity)`.

## Middleware Stack

Expand Down
205 changes: 205 additions & 0 deletions internal/appstream/parse.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// Package appstream parses Arch Linux AppStream component XML (from
// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and
// builds per-pkgname search text for SQLite FTS.
//
// Parsing model: encoding/xml streams tokens; we keep one open <component> in
// docParser.cur. We read <pkgname>, <keywords>/<keyword>, and <categories>/<category>.
// AppStream puts xml:lang on parent blocks; only neutral or en/de blocks are indexed —
// rejected blocks are skipped wholesale via xml.Decoder.Skip. flush runs at </component>
// (and before the next <component>) to emit pkgname + terms.
package appstream

import (
"encoding/xml"
"errors"
"io"
"strings"
)

// IndexTerms holds the text extracted from one <component> for FTS indexing
// (merged by pkgname in update.go). ParseComponentsXML passes one value per
// completed component to its callback.
type IndexTerms struct {
// Keywords holds the trimmed, non-empty character-data chunks read inside
// accepted <keyword> elements, in document order.
Keywords []string
// Categories holds the trimmed, non-empty character-data chunks read inside
// accepted <category> elements, in document order.
Categories []string
}

// ParseComponentsXML streams XML tokens from r and calls fn once per completed
// <component> that carries a non-empty pkgname (the same pkgname may appear
// many times). dedupeWords runs in the caller after merge.
//
// Parsing stops at the first error. Decoder errors and errors returned by fn
// are both returned unchanged; a nil return means the whole input was consumed
// and every callback succeeded.
func ParseComponentsXML(r io.Reader, fn func(pkgname string, terms IndexTerms) error) error {
	p := &docParser{fn: fn, dec: xml.NewDecoder(r)}
	for {
		tok, err := p.dec.Token()
		if errors.Is(err, io.EOF) {
			// Emit a component that was still open when the input ended.
			return p.flush()
		}
		if err != nil {
			return err
		}
		switch t := tok.(type) {
		case xml.StartElement:
			if err := p.startElement(t); err != nil {
				return err
			}
		case xml.EndElement:
			// Flush here rather than relying on endElement: endElement has no
			// error return, so a failing callback would otherwise be dropped
			// silently. After this flush cur is nil and any flush performed by
			// endElement is a no-op.
			if t.Name.Local == "component" {
				if err := p.flush(); err != nil {
					return err
				}
			}
			p.endElement(t)
		case xml.CharData:
			p.charData(t)
		}
	}
}

// docParser holds decoder state between tokens. Rejected <keywords>/<categories>
// blocks are skipped by the decoder so we never see their children — no skip flags needed.
type docParser struct {
// fn is the per-component callback; flush calls it with the trimmed pkgname
// and the collected terms.
fn func(string, IndexTerms) error
// dec is the token source; startElement also uses it to Skip rejected elements.
dec *xml.Decoder
// cur is the component being accumulated, or nil when no <component> is open.
cur *component
// inPkgname is true between <pkgname> and </pkgname> (set only while cur is non-nil).
inPkgname bool
// inKeywords is true inside a <keywords> block whose language was accepted.
inKeywords bool
// inKeyword is true inside an accepted <keyword> within an accepted <keywords> block.
inKeyword bool
// inCategories is true inside a <categories> block whose language was accepted.
inCategories bool
// inCategory is true inside an accepted <category> within an accepted <categories> block.
inCategory bool
}

// flush hands the component under construction to fn and forgets it. It is a
// no-op when no component is open, and a component whose pkgname is blank
// after trimming is dropped without calling fn. cur is cleared before fn
// runs, so a second flush for the same component does nothing. The EOF path
// uses flush to emit a trailing component.
func (p *docParser) flush() error {
	done := p.cur
	if done == nil {
		return nil
	}
	p.cur = nil

	pkg := strings.TrimSpace(done.pkgname)
	if pkg == "" {
		return nil
	}
	return p.fn(pkg, IndexTerms{
		Keywords:   done.keywords,
		Categories: done.categories,
	})
}

// startElement updates parser state for an opening tag.
//
//   - <component> flushes any component still open and starts a fresh one.
//   - <pkgname> starts pkgname capture, but only while a component is open.
//   - <keywords>/<categories> are language-gated containers; <keyword>/<category>
//     are language-gated leaves that only count inside an accepted container.
//     A gated element whose language is rejected is skipped wholesale through
//     the decoder, so neither its children nor its end tag reach the parser.
//
// It returns the error from flush or from the decoder's Skip, if any.
func (p *docParser) startElement(t xml.StartElement) error {
	tag := t.Name.Local

	if tag == "component" {
		if err := p.flush(); err != nil {
			return err
		}
		p.cur = &component{}
		return nil
	}
	if tag == "pkgname" {
		if p.cur != nil {
			p.inPkgname = true
		}
		return nil
	}

	// Everything below is language-gated; pick the flag the tag would raise.
	var flag *bool
	switch tag {
	case "keywords":
		flag = &p.inKeywords
	case "categories":
		flag = &p.inCategories
	case "keyword":
		if !p.inKeywords {
			return nil
		}
		flag = &p.inKeyword
	case "category":
		if !p.inCategories {
			return nil
		}
		flag = &p.inCategory
	default:
		return nil
	}

	if !keywordLangAccepted(t.Attr) {
		return p.dec.Skip()
	}
	*flag = true
	return nil
}

// endElement updates parser state for a closing tag: </component> flushes the
// open component, and the closing tag of any tracked element lowers the
// matching in* flag. Other tags are ignored.
func (p *docParser) endElement(t xml.EndElement) {
	var flag *bool
	switch t.Name.Local {
	case "component":
		// endElement has no error return, so the callback error reported by
		// flush is discarded at this point.
		_ = p.flush()
		return
	case "pkgname":
		flag = &p.inPkgname
	case "keywords":
		flag = &p.inKeywords
	case "keyword":
		flag = &p.inKeyword
	case "categories":
		flag = &p.inCategories
	case "category":
		flag = &p.inCategory
	default:
		return
	}
	*flag = false
}

// charData records character data for the open component. Text is trimmed and
// blank chunks are ignored. Inside <pkgname> the chunk is appended to the
// package name; inside an accepted <keyword> or <category> it becomes a new
// entry in the corresponding list. pkgname takes precedence over keyword,
// which takes precedence over category. Text anywhere else, or while no
// component is open, is discarded.
func (p *docParser) charData(t xml.CharData) {
	if p.cur == nil {
		return
	}
	chunk := strings.TrimSpace(string(t))
	if chunk == "" {
		return
	}
	switch {
	case p.inPkgname:
		p.cur.pkgname += chunk
	case p.inKeyword:
		p.cur.keywords = append(p.cur.keywords, chunk)
	case p.inCategory:
		p.cur.categories = append(p.cur.categories, chunk)
	}
}

// keywordLangAccepted reports whether a <keywords>, <keyword>, <categories>, or
// <category> start tag should be indexed. The decision is made on the first
// attribute whose local name is "lang" and whose value is non-empty (the
// attribute namespace is not checked):
//
//   - no such attribute: the element is language-neutral and accepted;
//   - otherwise the value is trimmed, lowercased and cut at the first '-', '_'
//     or '@', and the element is accepted only if what remains is "en" or "de".
//
// Cutting at '_' and '@' as well as '-' means POSIX-style locale names
// (en_GB, de_CH, de@euro) are treated like BCP 47 tags (en-GB, de-CH) instead
// of being rejected.
func keywordLangAccepted(attrs []xml.Attr) bool {
	for _, a := range attrs {
		if a.Name.Local != "lang" || a.Value == "" {
			continue
		}
		v := strings.ToLower(strings.TrimSpace(a.Value))
		// i > 0: a value that starts with a separator has no primary language
		// and is left as-is, so it falls through to rejection.
		if i := strings.IndexAny(v, "-_@"); i > 0 {
			v = v[:i]
		}
		return v == "en" || v == "de"
	}
	return true
}

// component is one AppStream <component> being accumulated until flush.
type component struct {
// pkgname is the concatenation of the trimmed character-data chunks seen
// inside <pkgname>; flush trims it again and drops the component if it is empty.
pkgname string
// keywords collects one entry per non-empty chunk read inside an accepted <keyword>.
keywords []string
// categories collects one entry per non-empty chunk read inside an accepted <category>.
categories []string
}

// dedupeWords splits every fragment on whitespace and returns the surviving
// words joined by single spaces, ready for FTS. A word is dropped when its
// lowercase form is in the stopword set or has already been emitted; the
// first occurrence wins and keeps its original casing.
func dedupeWords(parts []string) string {
	var kept []string
	emitted := map[string]struct{}{}
	for _, fragment := range parts {
		for _, word := range strings.Fields(fragment) {
			folded := strings.ToLower(word)
			_, isStop := stopword[folded]
			_, isDup := emitted[folded]
			if isStop || isDup {
				continue
			}
			emitted[folded] = struct{}{}
			kept = append(kept, word)
		}
	}
	return strings.Join(kept, " ")
}
139 changes: 139 additions & 0 deletions internal/appstream/parse_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package appstream

import (
"encoding/xml"
"strings"
"testing"
)

func TestParseComponentsXML_KeywordsAndCategories(t *testing.T) {
const xml = `<?xml version="1.0"?>
<components>
<component type="desktop-application">
<pkgname>firefox</pkgname>
<name>Firefox</name>
<summary>Web browser</summary>
<description><p>Free software web browser.</p></description>
<categories><category>Network</category><category>WebBrowser</category></categories>
<keywords><keyword>internet</keyword><keyword>www</keyword></keywords>
</component>
<component type="desktop-application">
<pkgname>firefox</pkgname>
<name>Firefox ESR</name>
<keywords><keyword>mozilla</keyword></keywords>
</component>
</components>`

accKW := make(map[string][]string)
accCat := make(map[string][]string)
err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error {
accKW[name] = append(accKW[name], terms.Keywords...)
accCat[name] = append(accCat[name], terms.Categories...)
return nil
})
if err != nil {
t.Fatal(err)
}
gotKW := dedupeWords(accKW["firefox"])
gotCat := dedupeWords(accCat["firefox"])
if gotKW == "" {
t.Fatal("expected merged keywords for firefox")
}
if !strings.Contains(gotKW, "internet") || !strings.Contains(gotKW, "www") || !strings.Contains(gotKW, "mozilla") {
t.Errorf("expected keyword terms, got %q", gotKW)
}
if strings.Contains(strings.ToLower(gotKW), "browser") {
t.Errorf("did not expect description text in keywords, got %q", gotKW)
}
if !strings.Contains(gotCat, "Network") || !strings.Contains(gotCat, "WebBrowser") {
t.Errorf("expected category terms, got %q", gotCat)
}
}

func TestParseComponentsXML_KeywordLangFilter(t *testing.T) {
// AppStream often sets xml:lang on <keywords>, not on each <keyword>.
const xml = `<?xml version="1.0"?>
<components>
<component>
<pkgname>demo</pkgname>
<keywords>
<keyword>neutral</keyword>
</keywords>
<keywords xml:lang="en">
<keyword>english</keyword>
</keywords>
<keywords xml:lang="de">
<keyword>deutsch</keyword>
</keywords>
<keywords xml:lang="de-DE">
<keyword>deutsch2</keyword>
</keywords>
<keywords xml:lang="fr">
<keyword>francais</keyword>
</keywords>
</component>
</components>`
acc := make(map[string][]string)
err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error {
acc[name] = append(acc[name], terms.Keywords...)
return nil
})
if err != nil {
t.Fatal(err)
}
got := dedupeWords(acc["demo"])
for _, need := range []string{"neutral", "english", "deutsch", "deutsch2"} {
if !strings.Contains(got, need) {
t.Errorf("missing %q in %q", need, got)
}
}
if strings.Contains(got, "francais") {
t.Errorf("did not want fr keyword, got %q", got)
}
}

func TestParseComponentsXML_CategoriesLangFilter(t *testing.T) {
const xml = `<?xml version="1.0"?>
<components>
<component>
<pkgname>demo</pkgname>
<categories><category>NeutralCat</category></categories>
<categories xml:lang="de"><category>DeutschCat</category></categories>
<categories xml:lang="fr"><category>FrCat</category></categories>
</component>
</components>`
acc := make(map[string][]string)
err := ParseComponentsXML(strings.NewReader(xml), func(name string, terms IndexTerms) error {
acc[name] = append(acc[name], terms.Categories...)
return nil
})
if err != nil {
t.Fatal(err)
}
got := dedupeWords(acc["demo"])
if !strings.Contains(got, "NeutralCat") || !strings.Contains(got, "DeutschCat") {
t.Errorf("want neutral+de categories, got %q", got)
}
if strings.Contains(got, "FrCat") {
t.Errorf("did not want fr category, got %q", got)
}
}

// TestKeywordLangAccepted is a table test for keywordLangAccepted covering the
// no-attribute case, accepted languages (en, de, de-AT) and rejected ones
// (fr, pl).
func TestKeywordLangAccepted(t *testing.T) {
	// lang builds an attribute list holding a single lang attribute.
	lang := func(value string) []xml.Attr {
		return []xml.Attr{{Name: xml.Name{Local: "lang"}, Value: value}}
	}
	cases := []struct {
		attrs []xml.Attr
		want  bool
	}{
		{attrs: nil, want: true},
		{attrs: lang("en"), want: true},
		{attrs: lang("de"), want: true},
		{attrs: lang("de-AT"), want: true},
		{attrs: lang("fr"), want: false},
		{attrs: lang("pl"), want: false},
	}
	for _, c := range cases {
		got := keywordLangAccepted(c.attrs)
		if got == c.want {
			continue
		}
		t.Errorf("keywordLangAccepted(%v) = %v, want %v", c.attrs, got, c.want)
	}
}
Loading
Loading