Skip to content

Commit 6dd96e0

Browse files
committed
dev: document how the code works
1 parent 7599324 commit 6dd96e0

1 file changed

Lines changed: 18 additions & 5 deletions

File tree

internal/appstream/parse.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
// Package appstream parses Arch Linux AppStream component XML (from
22
// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and
33
// builds per-pkgname search text for SQLite FTS.
4+
//
5+
// Parsing model: encoding/xml streams tokens (start/end/CharData); we keep one
6+
// open component in docParser.cur. stack + muteLeaf track the path so
7+
// text is attributed to the right element; name/summary whose xml:lang is not
8+
// en/de are skipped. flush runs at </component> (and before the next <component>)
9+
// to emit (pkgname, parts); the caller merges duplicate pkgnames.
410
package appstream
511

612
import (
@@ -15,11 +21,9 @@ const (
1521
elKeyword = "keyword"
1622
)
1723

18-
// ParseComponentsXML reads a Components-*.xml stream and calls fn for each
19-
// <component>, as soon as the element is complete. Multiple components with the
20-
// same <pkgname> produce multiple invocations; the caller merges by name. This
21-
// matches the streaming style of pacmandb.Parse: only one component is held in
22-
// memory at a time.
24+
// ParseComponentsXML decodes r token by token and calls fn once per completed
25+
// <component> (same pkgname may appear many times). fn receives raw text
26+
// fragments in parts; dedupeWords runs in the caller after merge.
2327
func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) error) error {
2428
d := xml.NewDecoder(r)
2529
d.Strict = false
@@ -47,6 +51,9 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err
4751
}
4852
}
4953

54+
// docParser holds decoder state between tokens.
55+
// stack and muteLeaf are kept in parallel, one entry per open element: its name and whether its CharData is skipped (name/summary whose xml:lang is not en/de).
56+
// inKeywords/inKeyword/inCats/inDesc gate text from nested sections. cur is the open <component> or nil.
5057
type docParser struct {
5158
fn func(string, []string) error
5259
dec *xml.Decoder
@@ -59,6 +66,7 @@ type docParser struct {
5966
cur *component
6067
}
6168

69+
// flush emits cur via fn and clears it. The parser also calls flush at EOF for the last component.
6270
func (p *docParser) flush() error {
6371
if p.cur == nil {
6472
return nil
@@ -72,6 +80,7 @@ func (p *docParser) flush() error {
7280
return p.fn(name, parts)
7381
}
7482

83+
// startElement pushes stack/muteLeaf; on <component> flushes the previous component then starts a new cur.
7584
func (p *docParser) startElement(t xml.StartElement) error {
7685
local := t.Name.Local
7786
p.stack = append(p.stack, local)
@@ -109,6 +118,7 @@ func (p *docParser) startElement(t xml.StartElement) error {
109118
return nil
110119
}
111120

121+
// endElement pops stack/muteLeaf; on </component> flushes the finished component.
112122
func (p *docParser) endElement(t xml.EndElement) error {
113123
local := t.Name.Local
114124
if len(p.stack) == 0 {
@@ -137,6 +147,7 @@ func (p *docParser) endElement(t xml.EndElement) error {
137147
return nil
138148
}
139149

150+
// charData routes text to pkgname or parts based on the enclosing element's name (top of stack).
140151
func (p *docParser) charData(t xml.CharData) {
141152
if p.cur == nil {
142153
return
@@ -175,11 +186,13 @@ func (p *docParser) charData(t xml.CharData) {
175186
}
176187
}
177188

189+
// component is one AppStream <component> being accumulated until flush.
178190
type component struct {
179191
pkgname string
180192
parts []string
181193
}
182194

195+
// dedupeWords joins fragments and drops duplicate tokens (case-insensitive) for FTS.
183196
func dedupeWords(parts []string) string {
184197
seen := make(map[string]struct{})
185198
var b strings.Builder

0 commit comments

Comments
 (0)