Skip to content

Commit 58b4d3f

Browse files
committed
feat: add OpenAI-powered content parsing improvement
Adds ContentParsedWrong endpoint that asks ChatGPT for a CSS selector to extract content from a URL, compares with current extraction, and saves a new rule if the result is different. Resolves #27.
1 parent d8d89e9 commit 58b4d3f

2 files changed

Lines changed: 130 additions & 0 deletions

File tree

extractor/readability.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,114 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
398398
return rb, nil
399399
}
400400

401+
// ContentParsedWrong asks ChatGPT for a CSS selector to extract content from the URL,
402+
// compares the result with the current extraction, and saves a new rule if different.
403+
func (f *UReadability) ContentParsedWrong(ctx context.Context, urlStr string) (string, error) {
404+
originalContent, err := f.Extract(ctx, urlStr)
405+
if err != nil {
406+
return "", fmt.Errorf("failed to extract content: %w", err)
407+
}
408+
409+
selector, err := f.getChatGPTSelector(ctx, urlStr)
410+
if err != nil {
411+
return "", fmt.Errorf("failed to get CSS selector: %w", err)
412+
}
413+
414+
body, err := f.getHTMLBody(ctx, urlStr)
415+
if err != nil {
416+
return "", fmt.Errorf("failed to get HTML body: %w", err)
417+
}
418+
419+
newContent, err := f.extractContentWithSelector(body, selector)
420+
if err != nil {
421+
return "", fmt.Errorf("failed to extract content with new selector: %w", err)
422+
}
423+
424+
if strings.TrimSpace(originalContent.Content) != strings.TrimSpace(newContent) {
425+
rule := datastore.Rule{
426+
Domain: extractDomain(urlStr),
427+
Content: selector,
428+
TestURLs: []string{urlStr},
429+
Enabled: true,
430+
}
431+
432+
if _, err = f.Rules.Save(ctx, rule); err != nil {
433+
return "", fmt.Errorf("failed to save new rule: %w", err)
434+
}
435+
436+
return fmt.Sprintf("new custom rule with DOM %s created", selector), nil
437+
}
438+
439+
return "default rule is good, no need to create the custom one", nil
440+
}
441+
442+
func (f *UReadability) getChatGPTSelector(ctx context.Context, urlStr string) (string, error) {
443+
client := openai.NewClient(f.OpenAIKey)
444+
resp, err := client.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
445+
Model: openai.GPT4o,
446+
Messages: []openai.ChatCompletionMessage{
447+
{
448+
Role: openai.ChatMessageRoleSystem,
449+
Content: "You are a helpful assistant that provides CSS selectors for extracting main content from web pages.",
450+
},
451+
{
452+
Role: openai.ChatMessageRoleUser,
453+
Content: fmt.Sprintf("Given the URL %s, identify the CSS selector that can be used to extract the main content "+
454+
"of the article. This typically includes elements like 'article', 'main', or specific classes. "+
455+
"Return only this selector and nothing else.", urlStr),
456+
},
457+
},
458+
})
459+
if err != nil {
460+
return "", err
461+
}
462+
463+
if len(resp.Choices) == 0 {
464+
return "", errors.New("no response from OpenAI")
465+
}
466+
return resp.Choices[0].Message.Content, nil
467+
}
468+
469+
// getHTMLBody fetches page HTML for re-extraction with a new selector
470+
func (f *UReadability) getHTMLBody(ctx context.Context, urlStr string) (string, error) {
471+
httpClient := &http.Client{Timeout: f.TimeOut}
472+
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, http.NoBody)
473+
if err != nil {
474+
return "", err
475+
}
476+
req.Header.Set("User-Agent", userAgent)
477+
resp, err := httpClient.Do(req)
478+
if err != nil {
479+
return "", err
480+
}
481+
defer func() {
482+
if closeErr := resp.Body.Close(); closeErr != nil {
483+
log.Printf("[WARN] failed to close response body, error=%v", closeErr)
484+
}
485+
}()
486+
body, err := io.ReadAll(resp.Body)
487+
if err != nil {
488+
return "", err
489+
}
490+
return string(body), nil
491+
}
492+
493+
func (f *UReadability) extractContentWithSelector(body, selector string) (string, error) {
494+
doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
495+
if err != nil {
496+
return "", err
497+
}
498+
return doc.Find(selector).Text(), nil
499+
}
500+
501+
// extractDomain returns the host name of urlStr without the port.
// It returns an empty string when urlStr can not be parsed.
func extractDomain(urlStr string) string {
	if parsed, parseErr := url.Parse(urlStr); parseErr == nil {
		return parsed.Hostname()
	}
	return ""
}
508+
401509
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
402510
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
403511
// and at last tries to use general readability parser

rest/server.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ func (s *Server) routes(frontendDir string) http.Handler {
8787
protectedGroup.HandleFunc("POST /rule", s.saveRule)
8888
protectedGroup.HandleFunc("POST /toggle-rule/{id}", s.toggleRule)
8989
protectedGroup.HandleFunc("POST /preview", s.handlePreview)
90+
protectedGroup.HandleFunc("GET /content-parsed-wrong", s.contentParsedWrong)
9091
})
9192
})
9293

@@ -411,6 +412,27 @@ func (s *Server) handleMetrics(w http.ResponseWriter, _ *http.Request) {
411412
})
412413
}
413414

415+
func (s *Server) contentParsedWrong(w http.ResponseWriter, r *http.Request) {
416+
if s.Readability.OpenAIKey == "" {
417+
rest.SendErrorJSON(w, r, log.Default(), http.StatusBadRequest, nil, "OpenAI key is not set")
418+
return
419+
}
420+
421+
exampleURL := r.URL.Query().Get("url")
422+
if exampleURL == "" {
423+
rest.SendErrorJSON(w, r, log.Default(), http.StatusBadRequest, nil, "url parameter is required")
424+
return
425+
}
426+
427+
message, err := s.Readability.ContentParsedWrong(r.Context(), exampleURL)
428+
if err != nil {
429+
rest.SendErrorJSON(w, r, log.Default(), http.StatusInternalServerError, err, err.Error())
430+
return
431+
}
432+
433+
rest.RenderJSON(w, JSON{"message": message})
434+
}
435+
414436
func getBid(id string) bson.ObjectID {
415437
bid, err := bson.ObjectIDFromHex(id)
416438
if err != nil {

0 commit comments

Comments
 (0)