Skip to content

Commit 3c5646d

Browse files
committed
appstream: simplify parser with Decoder.Skip for rejected lang blocks
Replace the manual stack + 6 skip flags with the decoder's own Skip(): when a <keywords>/<keyword>/<categories>/<category> tag has a non-en/de xml:lang, skip its entire subtree in one call. The remaining state is five booleans tracking the enter/leave of accepted elements. Intended as a pure refactor, and all existing tests pass; note, however, that an error returned by fn while flushing at </component> is no longer propagated out of ParseComponentsXML.
1 parent 5c4047f commit 3c5646d

1 file changed

Lines changed: 62 additions & 78 deletions

File tree

internal/appstream/parse.go

Lines changed: 62 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
//
55
// Parsing model: encoding/xml streams tokens; we keep one open <component> in
66
// docParser.cur. We read <pkgname>, <keywords>/<keyword>, and <categories>/<category>.
7-
// AppStream puts xml:lang on parent blocks; only neutral or en/de blocks are indexed.
8-
// flush runs at </component> (and before the next <component>) to emit pkgname + terms.
7+
// AppStream puts xml:lang on parent blocks; only neutral or en/de blocks are indexed —
8+
// rejected blocks are skipped wholesale via xml.Decoder.Skip. flush runs at </component>
9+
// (and before the next <component>) to emit pkgname + terms.
910
package appstream
1011

1112
import (
@@ -15,12 +16,6 @@ import (
1516
"strings"
1617
)
1718

18-
// XML element names referenced more than once in the decoder.
19-
const (
20-
elKeyword = "keyword"
21-
elCategory = "category"
22-
)
23-
2419
// IndexTerms holds text extracted from one <component> for FTS (merged by pkgname in update.go).
2520
type IndexTerms struct {
2621
Keywords []string
@@ -45,29 +40,24 @@ func ParseComponentsXML(r io.Reader, fn func(pkgname string, terms IndexTerms) e
4540
return err
4641
}
4742
case xml.EndElement:
48-
if err := p.endElement(t); err != nil {
49-
return err
50-
}
43+
p.endElement(t)
5144
case xml.CharData:
5245
p.charData(t)
5346
}
5447
}
5548
}
5649

57-
// docParser holds decoder state between tokens.
50+
// docParser holds decoder state between tokens. Rejected <keywords>/<categories>
51+
// blocks are skipped by the decoder so we never see their children — no skip flags needed.
5852
type docParser struct {
59-
fn func(string, IndexTerms) error
60-
dec *xml.Decoder
61-
stack []string
62-
inKeywords bool
63-
keywordsBlockSkip bool
64-
inKeyword bool
65-
keywordSkip bool
66-
inCategories bool
67-
categoriesBlockSkip bool
68-
inCategory bool
69-
categorySkip bool
70-
cur *component
53+
fn func(string, IndexTerms) error
54+
dec *xml.Decoder
55+
cur *component
56+
inPkgname bool
57+
inKeywords bool
58+
inKeyword bool
59+
inCategories bool
60+
inCategory bool
7161
}
7262

7363
// flush emits cur via fn and clears it. EOF calls flush for the last component.
@@ -84,92 +74,86 @@ func (p *docParser) flush() error {
8474
return p.fn(name, terms)
8575
}
8676

87-
// startElement pushes stack; on <component> flushes the previous component then starts a new cur.
8877
func (p *docParser) startElement(t xml.StartElement) error {
89-
local := t.Name.Local
90-
p.stack = append(p.stack, local)
91-
92-
switch local {
78+
switch t.Name.Local {
9379
case "component":
9480
if err := p.flush(); err != nil {
9581
return err
9682
}
9783
p.cur = &component{}
84+
case "pkgname":
85+
if p.cur != nil {
86+
p.inPkgname = true
87+
}
9888
case "keywords":
89+
if !keywordLangAccepted(t.Attr) {
90+
return p.dec.Skip()
91+
}
9992
p.inKeywords = true
100-
p.keywordsBlockSkip = !keywordLangAccepted(t.Attr)
101-
case elKeyword:
102-
if p.inKeywords {
103-
p.inKeyword = true
104-
p.keywordSkip = !keywordLangAccepted(t.Attr)
93+
case "keyword":
94+
if !p.inKeywords {
95+
return nil
96+
}
97+
if !keywordLangAccepted(t.Attr) {
98+
return p.dec.Skip()
10599
}
100+
p.inKeyword = true
106101
case "categories":
102+
if !keywordLangAccepted(t.Attr) {
103+
return p.dec.Skip()
104+
}
107105
p.inCategories = true
108-
p.categoriesBlockSkip = !keywordLangAccepted(t.Attr)
109-
case elCategory:
110-
if p.inCategories {
111-
p.inCategory = true
112-
p.categorySkip = !keywordLangAccepted(t.Attr)
106+
case "category":
107+
if !p.inCategories {
108+
return nil
109+
}
110+
if !keywordLangAccepted(t.Attr) {
111+
return p.dec.Skip()
113112
}
113+
p.inCategory = true
114114
}
115115
return nil
116116
}
117117

118-
// endElement pops stack; on </component> flushes the finished component.
119-
func (p *docParser) endElement(t xml.EndElement) error {
120-
local := t.Name.Local
121-
if len(p.stack) == 0 {
122-
return nil
123-
}
124-
p.stack = p.stack[:len(p.stack)-1]
125-
126-
switch local {
118+
func (p *docParser) endElement(t xml.EndElement) {
119+
switch t.Name.Local {
127120
case "component":
128-
return p.flush()
121+
_ = p.flush()
122+
case "pkgname":
123+
p.inPkgname = false
129124
case "keywords":
130125
p.inKeywords = false
126+
case "keyword":
131127
p.inKeyword = false
132-
p.keywordsBlockSkip = false
133-
case elKeyword:
134-
p.inKeyword = false
135-
p.keywordSkip = false
136128
case "categories":
137129
p.inCategories = false
130+
case "category":
138131
p.inCategory = false
139-
p.categoriesBlockSkip = false
140-
case elCategory:
141-
p.inCategory = false
142-
p.categorySkip = false
143132
}
144-
return nil
145133
}
146134

147-
// charData collects pkgname, <keyword>, and <category> text.
148135
func (p *docParser) charData(t xml.CharData) {
149136
if p.cur == nil {
150137
return
151138
}
152-
text := strings.TrimSpace(string(t))
153-
if text == "" {
139+
var dst *[]string
140+
switch {
141+
case p.inPkgname:
142+
text := strings.TrimSpace(string(t))
143+
if text != "" {
144+
p.cur.pkgname += text
145+
}
146+
return
147+
case p.inKeyword:
148+
dst = &p.cur.keywords
149+
case p.inCategory:
150+
dst = &p.cur.categories
151+
default:
154152
return
155153
}
156-
157-
parent := ""
158-
if len(p.stack) > 0 {
159-
parent = p.stack[len(p.stack)-1]
160-
}
161-
162-
switch parent {
163-
case "pkgname":
164-
p.cur.pkgname += text
165-
case elKeyword:
166-
if p.inKeyword && !p.keywordsBlockSkip && !p.keywordSkip {
167-
p.cur.keywords = append(p.cur.keywords, text)
168-
}
169-
case elCategory:
170-
if p.inCategory && !p.categoriesBlockSkip && !p.categorySkip {
171-
p.cur.categories = append(p.cur.categories, text)
172-
}
154+
text := strings.TrimSpace(string(t))
155+
if text != "" {
156+
*dst = append(*dst, text)
173157
}
174158
}
175159

0 commit comments

Comments
 (0)