@@ -398,6 +398,114 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
398398 return rb , nil
399399}
400400
401+ // ContentParsedWrong asks ChatGPT for a CSS selector to extract content from the URL,
402+ // compares the result with the current extraction, and saves a new rule if different.
403+ func (f * UReadability ) ContentParsedWrong (ctx context.Context , urlStr string ) (string , error ) {
404+ originalContent , err := f .Extract (ctx , urlStr )
405+ if err != nil {
406+ return "" , fmt .Errorf ("failed to extract content: %w" , err )
407+ }
408+
409+ selector , err := f .getChatGPTSelector (ctx , urlStr )
410+ if err != nil {
411+ return "" , fmt .Errorf ("failed to get CSS selector: %w" , err )
412+ }
413+
414+ body , err := f .getHTMLBody (ctx , urlStr )
415+ if err != nil {
416+ return "" , fmt .Errorf ("failed to get HTML body: %w" , err )
417+ }
418+
419+ newContent , err := f .extractContentWithSelector (body , selector )
420+ if err != nil {
421+ return "" , fmt .Errorf ("failed to extract content with new selector: %w" , err )
422+ }
423+
424+ if strings .TrimSpace (originalContent .Content ) != strings .TrimSpace (newContent ) {
425+ rule := datastore.Rule {
426+ Domain : extractDomain (urlStr ),
427+ Content : selector ,
428+ TestURLs : []string {urlStr },
429+ Enabled : true ,
430+ }
431+
432+ if _ , err = f .Rules .Save (ctx , rule ); err != nil {
433+ return "" , fmt .Errorf ("failed to save new rule: %w" , err )
434+ }
435+
436+ return fmt .Sprintf ("new custom rule with DOM %s created" , selector ), nil
437+ }
438+
439+ return "default rule is good, no need to create the custom one" , nil
440+ }
441+
442+ func (f * UReadability ) getChatGPTSelector (ctx context.Context , urlStr string ) (string , error ) {
443+ client := openai .NewClient (f .OpenAIKey )
444+ resp , err := client .CreateChatCompletion (ctx , openai.ChatCompletionRequest {
445+ Model : openai .GPT4o ,
446+ Messages : []openai.ChatCompletionMessage {
447+ {
448+ Role : openai .ChatMessageRoleSystem ,
449+ Content : "You are a helpful assistant that provides CSS selectors for extracting main content from web pages." ,
450+ },
451+ {
452+ Role : openai .ChatMessageRoleUser ,
453+ Content : fmt .Sprintf ("Given the URL %s, identify the CSS selector that can be used to extract the main content " +
454+ "of the article. This typically includes elements like 'article', 'main', or specific classes. " +
455+ "Return only this selector and nothing else." , urlStr ),
456+ },
457+ },
458+ })
459+ if err != nil {
460+ return "" , err
461+ }
462+
463+ if len (resp .Choices ) == 0 {
464+ return "" , errors .New ("no response from OpenAI" )
465+ }
466+ return resp .Choices [0 ].Message .Content , nil
467+ }
468+
469+ // getHTMLBody fetches page HTML for re-extraction with a new selector
470+ func (f * UReadability ) getHTMLBody (ctx context.Context , urlStr string ) (string , error ) {
471+ httpClient := & http.Client {Timeout : f .TimeOut }
472+ req , err := http .NewRequestWithContext (ctx , "GET" , urlStr , http .NoBody )
473+ if err != nil {
474+ return "" , err
475+ }
476+ req .Header .Set ("User-Agent" , userAgent )
477+ resp , err := httpClient .Do (req )
478+ if err != nil {
479+ return "" , err
480+ }
481+ defer func () {
482+ if closeErr := resp .Body .Close (); closeErr != nil {
483+ log .Printf ("[WARN] failed to close response body, error=%v" , closeErr )
484+ }
485+ }()
486+ body , err := io .ReadAll (resp .Body )
487+ if err != nil {
488+ return "" , err
489+ }
490+ return string (body ), nil
491+ }
492+
493+ func (f * UReadability ) extractContentWithSelector (body , selector string ) (string , error ) {
494+ doc , err := goquery .NewDocumentFromReader (strings .NewReader (body ))
495+ if err != nil {
496+ return "" , err
497+ }
498+ return doc .Find (selector ).Text (), nil
499+ }
500+
// extractDomain returns the host part of urlStr without the port.
// An empty string is returned when urlStr cannot be parsed as a URL.
func extractDomain(urlStr string) string {
	if parsed, parseErr := url.Parse(urlStr); parseErr == nil {
		return parsed.Hostname()
	}
	return ""
}
508+
401509// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
402510// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
403511// and at last tries to use general readability parser
0 commit comments