Skip to content

Commit a820fc5

Browse files
authored
fix: skip empty ByteStream in HTMLToDocument to avoid noisy lxml logs (#11670)
1 parent 8aef2a4 commit a820fc5

3 files changed

Lines changed: 27 additions & 0 deletions

File tree

haystack/components/converters/html.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ def run(
111 111
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
112 112
continue
113 113

114+
if not bytestream.data:
115+
logger.warning("Skipping {source} because it is empty.", source=source)
116+
continue
117+
114 118
try:
115 119
text = extract(bytestream.data.decode("utf-8"), **merged_extraction_kwargs)
116 120
except Exception as conversion_e:
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
fixes:
3+
- |
4+
``HTMLToDocument`` now skips ``ByteStream`` objects with empty content instead of
5+
passing them to trafilatura. This prevents noisy lxml parse error logs (such as
6+
"Document is empty") when a fetcher emits an empty stream.

test/components/converters/test_html_to_document.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,23 @@ def test_run_error_handling(self, caplog):
156 156
assert "Could not read non_existing_file.html" in caplog.text
157 157
assert results["documents"] == []
158 158

159+
def test_run_empty_bytestream(self, caplog):
160+
"""
161+
Test that an empty ByteStream is skipped without invoking extraction,
162+
so no noisy lxml parse errors are emitted.
163+
"""
164+
empty_stream = ByteStream(data=b"")
165+
empty_stream.mime_type = "text/html"
166+
converter = HTMLToDocument()
167+
168+
with patch("haystack.components.converters.html.extract") as mock_extract:
169+
with caplog.at_level(logging.WARNING):
170+
results = converter.run(sources=[empty_stream])
171+
172+
assert results["documents"] == []
173+
mock_extract.assert_not_called()
174+
assert "because it is empty" in caplog.text
175+
159 176
def test_mixed_sources_run(self, test_files_path):
160 177
"""
161 178
Test if the component runs correctly if the input is a mix of paths and ByteStreams.

0 commit comments

Comments
 (0)