Skip to content

Commit 03a3bdc

Browse files
committed
fix: deduplicate extraction logic, remove dead code, fix doc comments
- customParser now delegates to extractWithSelector (eliminates duplicated goquery parse+find+html loop)
- image extraction moved out of evaluation loop — runs once on final result instead of every iteration
- extract "ai-evaluator" to aiEvaluatorUser constant
- fix incorrect doc comment on callAPI
- remove unused getAuth test helper
- remove redundant cancel() call and restating comments
1 parent 68ddf4a commit 03a3bdc

3 files changed

Lines changed: 13 additions & 24 deletions

File tree

extractor/evaluator.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ type OpenAIEvaluator struct {
4545
client *openai.Client
4646
}
4747

48-
// getClient returns the OpenAI client, creating it once on first use
4948
func (e *OpenAIEvaluator) getClient() *openai.Client {
5049
e.clientOnce.Do(func() {
5150
if e.clientConfig != nil {
@@ -71,7 +70,6 @@ func (e *OpenAIEvaluator) Evaluate(ctx context.Context, reqURL, extractedText, h
7170
if !errors.Is(err, errInvalidJSON) {
7271
return nil, err
7372
}
74-
cancel() // release the first context before creating a new one
7573

7674
// retry once on invalid JSON with a fresh timeout
7775
log.Printf("[WARN] invalid JSON from OpenAI for %s, retrying once", reqURL)
@@ -87,7 +85,7 @@ func (e *OpenAIEvaluator) Evaluate(ctx context.Context, reqURL, extractedText, h
8785
}
8886

8987
// callAPI makes a single API call and parses the response JSON.
90-
// Returns nil EvalResult (without error) if the response is not valid JSON.
88+
// returns errInvalidJSON if the response is not valid JSON.
9189
func (e *OpenAIEvaluator) callAPI(ctx context.Context, client *openai.Client, userPrompt string) (*EvalResult, error) {
9290
resp, err := client.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
9391
Model: e.Model,

extractor/readability.go

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,6 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
192192
return rb, nil
193193
}
194194

195-
// maxGPTIter returns MaxGPTIter or the default if not set
196195
func (f *UReadability) maxGPTIter() int {
197196
if f.MaxGPTIter > 0 {
198197
return f.MaxGPTIter
@@ -239,29 +238,31 @@ func (f *UReadability) evaluateAndImprove(ctx context.Context, reqURL, htmlBody
239238
continue
240239
}
241240

242-
// rebuild the response with new content
241+
// rebuild the response with new content (defer link normalisation and image extraction to after the loop)
243242
improved := *best
244243
improved.Content = f.getText(rawHTML, best.Title)
245244
improved.Rich = rawHTML
246245
improved.Excerpt = f.getSnippet(improved.Content)
247246

248-
// normalize links and extract images from the new content
247+
best = &improved
248+
bestSelector = eval.Selector
249+
}
250+
251+
// post-process the final result: normalise links and extract images once
252+
if bestSelector != "" {
249253
finalURL, err := url.Parse(best.URL)
250254
if err != nil {
251255
log.Printf("[WARN] failed to parse URL %q in evaluateAndImprove: %v", best.URL, err)
252256
return best
253257
}
254-
improved.Rich, improved.AllLinks = f.normalizeLinks(improved.Rich, finalURL)
255-
darticle, err := goquery.NewDocumentFromReader(strings.NewReader(improved.Rich))
258+
best.Rich, best.AllLinks = f.normalizeLinks(best.Rich, finalURL)
259+
darticle, err := goquery.NewDocumentFromReader(strings.NewReader(best.Rich))
256260
if err == nil {
257261
if im, allImages, ok := f.extractPics(darticle.Find("img"), reqURL); ok {
258-
improved.Image = im
259-
improved.AllImages = allImages
262+
best.Image = im
263+
best.AllImages = allImages
260264
}
261265
}
262-
263-
best = &improved
264-
bestSelector = eval.Selector
265266
}
266267

267268
// save rule if we found a better selector. merge with any existing rule (force mode may
@@ -317,16 +318,10 @@ func (f *UReadability) getContent(_ context.Context, body, reqURL string, rule *
317318
// custom rules parser
318319
customParser := func(body, reqURL string, rule datastore.Rule) (content, rich string, err error) {
319320
log.Printf("[DEBUG] custom extractor for %s", reqURL)
320-
dbody, err := goquery.NewDocumentFromReader(strings.NewReader(body))
321+
res, err := f.extractWithSelector(body, rule.Content)
321322
if err != nil {
322323
return "", "", err
323324
}
324-
var res string
325-
dbody.Find(rule.Content).Each(func(_ int, s *goquery.Selection) {
326-
if html, err := s.Html(); err == nil {
327-
res += html
328-
}
329-
})
330325
if res == "" {
331326
return "", "", fmt.Errorf("nothing extracted from %s, rule=%v", reqURL, rule)
332327
}

rest/server_test.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -657,10 +657,6 @@ func TestServer_ContentParsedWrong(t *testing.T) {
657657
})
658658
}
659659

660-
func getAuth(t *testing.T, url string) (response string, statusCode int) {
661-
return doAuth(t, "GET", url)
662-
}
663-
664660
func postAuth(t *testing.T, url string) (response string, statusCode int) {
665661
return doAuth(t, "POST", url)
666662
}

0 commit comments

Comments
 (0)