@@ -324,3 +324,91 @@ async def test_fetch_with_proxy(self):
324324
325325 # Verify AsyncClient was called with proxy
326326 mock_client_class .assert_called_once_with (proxy = "http://proxy.example.com:8080" )
327+
328+
329+
class TestExtractContentFallback:
    """Exercise the fallback chain of ``extract_content_from_html``.

    Every test replaces readabilipy's ``simple_json_from_html_string`` with a
    mock, so only the decision logic around its return value is exercised.
    """

    # Dotted path of the readabilipy entry point that each test mocks out.
    _READABILITY_TARGET = "readabilipy.simple_json.simple_json_from_html_string"

    def test_readability_sufficient_content_no_fallback(self):
        """Enough Readability output means the extractor is invoked only once."""
        paragraphs = "<p>word </p>" * 200
        html = f"<html><body>{paragraphs}</body></html>"
        readability_content = f"<div>{paragraphs}</div>"

        with patch(
            self._READABILITY_TARGET,
            return_value={"content": readability_content},
        ) as readability_mock:
            result = extract_content_from_html(html)

        # A single call shows that no fallback mode was attempted.
        assert readability_mock.call_count == 1
        assert "word" in result

    def test_readability_strips_content_falls_back_to_no_readability(self):
        """Near-empty Readability output triggers the non-Readability mode."""
        # Large page standing in for SSR content that Readability discards.
        paragraphs = "<p>content </p>" * 500
        html = f"<html><body>{paragraphs}</body></html>"

        def fake_simple_json(h, use_readability=True):
            # Readability mode yields a placeholder; plain mode yields the page.
            if not use_readability:
                return {"content": f"<div>{paragraphs}</div>"}
            return {"content": "<div>Loading...</div>"}

        with patch(self._READABILITY_TARGET, side_effect=fake_simple_json):
            result = extract_content_from_html(html)

        assert "content" in result
        assert len(result.strip()) > 100

    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
        """Empty output from both readabilipy modes leaves the raw-HTML path."""
        paragraphs = "<p>important data </p>" * 300
        html = f"<html><body>{paragraphs}</body></html>"

        # The same empty payload is returned regardless of the requested mode.
        with patch(
            self._READABILITY_TARGET, return_value={"content": ""}
        ) as readability_mock:
            result = extract_content_from_html(html)

        # The text can only have come from converting the raw HTML itself.
        assert "important data" in result
        assert readability_mock.call_count == 2  # one call per readabilipy mode

    def test_completely_empty_html_returns_error(self):
        """An empty document with empty extractor output yields an error marker."""
        with patch(self._READABILITY_TARGET, return_value={"content": ""}):
            result = extract_content_from_html("")

        assert "<error>" in result

    def test_readability_none_content_triggers_fallback(self):
        """A ``None`` content value on the first call still leads to a fallback."""
        paragraphs = "<p>real content </p>" * 200
        html = f"<html><body>{paragraphs}</body></html>"

        calls_seen = 0

        def fake_simple_json(h, use_readability=True):
            # Distinguish by call order: only the very first call returns None.
            nonlocal calls_seen
            calls_seen += 1
            if calls_seen == 1:
                return {"content": None}
            return {"content": f"<div>{paragraphs}</div>"}

        with patch(self._READABILITY_TARGET, side_effect=fake_simple_json):
            result = extract_content_from_html(html)

        assert "real content" in result

    def test_threshold_is_one_percent_of_html(self):
        """Readability output far below 1% of the input length triggers fallback."""
        # 9000 padding characters plus markup give a page slightly over 9000
        # characters long, so 1% of its length is roughly 90 characters.
        padding = "x" * 9000
        html = (
            '<html><body><div style=" visibility:hidden" >'
            f"{padding} </div><p>tiny</p></body></html>"
        )

        def fake_simple_json(h, use_readability=True):
            # Plain mode keeps the hidden padding; Readability mode keeps only
            # the short visible paragraph.
            if not use_readability:
                return {"content": f"<div>{padding} </div><p>tiny</p>"}
            return {"content": "<p>tiny</p>"}

        with patch(self._READABILITY_TARGET, side_effect=fake_simple_json):
            result = extract_content_from_html(html)

        # "tiny" alone is well under the ~1% mark, so a result longer than 50
        # characters indicates the fallback content was used.
        assert len(result.strip()) > 50
0 commit comments