11// Package appstream parses Arch Linux AppStream component XML (from
22// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and
33// builds per-pkgname search text for SQLite FTS.
4+ //
5+ // Parsing model: encoding/xml streams tokens (start/end/CharData); we keep one
6+ // open component in docParser.cur. stack + muteLeaf track the path so
7+ // text is attributed to the right element; name/summary with xml:lang outside
8+ // en/de are skipped. flush runs at </component> (and before the next <component>)
9+ // to emit (pkgname, parts); the caller merges duplicate pkgnames.
410package appstream
511
612import (
@@ -15,11 +21,9 @@ const (
1521 elKeyword = "keyword"
1622)
1723
18- // ParseComponentsXML reads a Components-*.xml stream and calls fn for each
19- // <component>, as soon as the element is complete. Multiple components with the
20- // same <pkgname> produce multiple invocations; the caller merges by name. This
21- // matches the streaming style of pacmandb.Parse: only one component is held in
22- // memory at a time.
24+ // ParseComponentsXML decodes r token by token and calls fn once per completed
25+ // <component>, so the same pkgname may be reported many times. fn receives the raw
26+ // text fragments in parts; the caller merges them per pkgname and then runs dedupeWords.
2327func ParseComponentsXML (r io.Reader , fn func (pkgname string , parts []string ) error ) error {
2428 d := xml .NewDecoder (r )
2529 d .Strict = false
@@ -47,6 +51,9 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err
4751 }
4852}
4953
54+ // docParser holds decoder state between tokens.
55+ // stack/muteLeaf are parallel: element names and whether that leaf skips CharData (non-en/de name/summary).
56+ // inKeywords/inKeyword/inCats/inDesc gate text from nested sections. cur is the open <component> or nil.
5057type docParser struct {
5158 fn func (string , []string ) error
5259 dec * xml.Decoder
@@ -59,6 +66,7 @@ type docParser struct {
5966 cur * component
6067}
6168
69+ // flush emits cur via fn and clears it; it is a no-op when cur is nil. It is also called at EOF so the last component is emitted.
6270func (p * docParser ) flush () error {
6371 if p .cur == nil {
6472 return nil
@@ -72,6 +80,7 @@ func (p *docParser) flush() error {
7280 return p .fn (name , parts )
7381}
7482
83+ // startElement pushes stack/muteLeaf; on <component> flushes the previous component then starts a new cur.
7584func (p * docParser ) startElement (t xml.StartElement ) error {
7685 local := t .Name .Local
7786 p .stack = append (p .stack , local )
@@ -109,6 +118,7 @@ func (p *docParser) startElement(t xml.StartElement) error {
109118 return nil
110119}
111120
121+ // endElement pops stack/muteLeaf; on </component> flushes the finished component.
112122func (p * docParser ) endElement (t xml.EndElement ) error {
113123 local := t .Name .Local
114124 if len (p .stack ) == 0 {
@@ -137,6 +147,7 @@ func (p *docParser) endElement(t xml.EndElement) error {
137147 return nil
138148}
139149
150+ // charData routes text to pkgname or parts by the name of the enclosing element (top of stack); text outside a <component> (cur == nil) is ignored.
140151func (p * docParser ) charData (t xml.CharData ) {
141152 if p .cur == nil {
142153 return
@@ -175,11 +186,13 @@ func (p *docParser) charData(t xml.CharData) {
175186 }
176187}
177188
189+ // component is the accumulator for one AppStream <component>; docParser.cur points at it until flush emits it.
178190type component struct {
179191 pkgname string // package name; text routed here by charData
180192 parts []string // text fragments collected by charData; presumably what flush passes to fn as parts
181193}
182194
195+ // dedupeWords joins fragments and drops duplicate tokens (case-insensitive) for FTS.
183196func dedupeWords (parts []string ) string {
184197 seen := make (map [string ]struct {})
185198 var b strings.Builder
0 commit comments