22// https://sources.archlinux.org/other/packages/archlinux-appstream-data/) and
33// builds per-pkgname search text for SQLite FTS.
44//
5- // Parsing model: encoding/xml streams tokens (start/end/CharData); we keep one
6- // open component in docParser.cur. stack + muteLeaf track the path so
7- // text is attributed to the right element; name/summary with xml:lang outside
8- // en/de are skipped. flush runs at </component> (and before the next <component>)
9- // to emit (pkgname, parts); the caller merges duplicate pkgnames.
5+ // Parsing model: encoding/xml streams tokens; we keep one open <component> in
6+ // docParser.cur. Only <pkgname> and <keywords>/<keyword> text are read; other
7+ // elements (name, summary, description, categories) are ignored for indexing.
8+ // AppStream puts xml:lang on <keywords> blocks (not each <keyword>). Only blocks
9+ // with no lang or en/de are indexed; per-<keyword> xml:lang is also respected when present.
10+ // flush runs at </component> (and before the next <component>) to emit (pkgname, parts).
1011package appstream
1112
1213import (
@@ -22,8 +23,8 @@ const (
2223)
2324
2425// ParseComponentsXML streams the decoder and calls fn once per completed
25- // <component> (same pkgname may appear many times). fn receives raw text
26- // fragments in parts ; dedupeWords runs in the caller after merge.
26+ // <component> (same pkgname may appear many times). parts contains only
27+ // <keyword> text ; dedupeWords runs in the caller after merge.
2728func ParseComponentsXML (r io.Reader , fn func (pkgname string , parts []string ) error ) error {
2829 d := xml .NewDecoder (r )
2930 d .Strict = false
@@ -52,18 +53,18 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, parts []string) err
5253}
5354
5455// docParser holds decoder state between tokens.
55- // stack/muteLeaf are parallel: element names and whether that leaf skips CharData (non-en/de name/summary).
56- // inKeywords/inKeyword/inCats/inDesc gate text from nested sections. cur is the open <component> or nil.
56+ // keywordsBlockSkip drops a whole <keywords xml:lang="…"> block when not en/de/neutral.
57+ // keywordSkip does the same for a single <keyword> when it carries xml:lang.
58+ // cur is the open <component> or nil.
5759type docParser struct {
58- fn func (string , []string ) error
59- dec * xml.Decoder
60- stack []string
61- muteLeaf []bool
62- inKeywords bool
63- inKeyword bool
64- inCats bool
65- inDesc int
66- cur * component
60+ fn func (string , []string ) error
61+ dec * xml.Decoder
62+ stack []string
63+ inKeywords bool
64+ keywordsBlockSkip bool
65+ inKeyword bool
66+ keywordSkip bool
67+ cur * component
6768}
6869
6970// flush emits cur via fn and clears it. EOF calls flush for the last component.
@@ -80,23 +81,10 @@ func (p *docParser) flush() error {
8081 return p .fn (name , parts )
8182}
8283
83- // startElement pushes stack/muteLeaf ; on <component> flushes the previous component then starts a new cur.
84+ // startElement pushes stack; on <component> flushes the previous component then starts a new cur.
8485func (p * docParser ) startElement (t xml.StartElement ) error {
8586 local := t .Name .Local
8687 p .stack = append (p .stack , local )
87- muted := false
88- if local == "name" || local == "summary" {
89- for _ , a := range t .Attr {
90- if a .Name .Local != "lang" || a .Value == "" {
91- continue
92- }
93- if a .Value != "en" && a .Value != "de" {
94- muted = true
95- break
96- }
97- }
98- }
99- p .muteLeaf = append (p .muteLeaf , muted )
10088
10189 switch local {
10290 case "component" :
@@ -106,56 +94,43 @@ func (p *docParser) startElement(t xml.StartElement) error {
10694 p .cur = & component {}
10795 case "keywords" :
10896 p .inKeywords = true
97+ p .keywordsBlockSkip = ! keywordLangAccepted (t .Attr )
10998 case elKeyword :
11099 if p .inKeywords {
111100 p .inKeyword = true
101+ p .keywordSkip = ! keywordLangAccepted (t .Attr )
112102 }
113- case "categories" :
114- p .inCats = true
115- case "description" :
116- p .inDesc ++
117103 }
118104 return nil
119105}
120106
121- // endElement pops stack/muteLeaf ; on </component> flushes the finished component.
107+ // endElement pops stack; on </component> flushes the finished component.
122108func (p * docParser ) endElement (t xml.EndElement ) error {
123109 local := t .Name .Local
124110 if len (p .stack ) == 0 {
125111 return nil
126112 }
127113 p .stack = p .stack [:len (p .stack )- 1 ]
128- if len (p .muteLeaf ) > 0 {
129- p .muteLeaf = p .muteLeaf [:len (p .muteLeaf )- 1 ]
130- }
131114
132115 switch local {
133116 case "component" :
134117 return p .flush ()
135118 case "keywords" :
136119 p .inKeywords = false
137120 p .inKeyword = false
121+ p .keywordsBlockSkip = false
138122 case elKeyword :
139123 p .inKeyword = false
140- case "categories" :
141- p .inCats = false
142- case "description" :
143- if p .inDesc > 0 {
144- p .inDesc --
145- }
124+ p .keywordSkip = false
146125 }
147126 return nil
148127}
149128
150- // charData routes text to pkgname or parts by parent element name (stack tip) .
129+ // charData collects pkgname and <keyword> text only .
151130func (p * docParser ) charData (t xml.CharData ) {
152131 if p .cur == nil {
153132 return
154133 }
155- muted := len (p .muteLeaf ) > 0 && p .muteLeaf [len (p .muteLeaf )- 1 ]
156- if muted {
157- return
158- }
159134 text := strings .TrimSpace (string (t ))
160135 if text == "" {
161136 return
@@ -169,21 +144,27 @@ func (p *docParser) charData(t xml.CharData) {
169144 switch parent {
170145 case "pkgname" :
171146 p .cur .pkgname += text
172- case "name" , "summary" :
173- p .cur .parts = append (p .cur .parts , text )
174- case "category" :
175- if p .inCats {
176- p .cur .parts = append (p .cur .parts , text )
177- }
178147 case elKeyword :
179- if p .inKeyword {
148+ if p .inKeyword && ! p . keywordsBlockSkip && ! p . keywordSkip {
180149 p .cur .parts = append (p .cur .parts , text )
181150 }
182- case "p" :
183- if p .inDesc > 0 {
184- p .cur .parts = append (p .cur .parts , text )
151+ }
152+ }
153+
// keywordLangAccepted reports whether the text under a <keywords> or <keyword>
// start tag should be indexed, judged by the tag's lang attribute.
//
// It returns true when the tag carries no lang attribute (or only blank ones),
// i.e. the element is language-neutral, and when the primary language of the
// first non-blank lang attribute is "en" or "de". The value is trimmed and
// lower-cased, and everything from the first '-' or '_' onward is dropped, so
// both BCP 47 tags ("de-DE") and POSIX-style locale names ("en_GB") are
// recognised. Any other language yields false.
func keywordLangAccepted(attrs []xml.Attr) bool {
	for _, a := range attrs {
		if a.Name.Local != "lang" {
			continue
		}
		v := strings.ToLower(strings.TrimSpace(a.Value))
		if v == "" {
			// A blank value carries no language information; treat it like a
			// missing attribute instead of as a foreign language.
			continue
		}
		// Keep only the primary language subtag; region/script suffixes may be
		// separated by '-' (BCP 47) or '_' (locale names).
		if i := strings.IndexAny(v, "-_"); i >= 0 {
			v = v[:i]
		}
		return v == "en" || v == "de"
	}
	return true
}
188169
189170// component is one AppStream <component> being accumulated until flush.
0 commit comments