Skip to content

Commit 7db517a

Browse files
authored
fix: Reset private state correctly in sitemap parsers (#1938)
## Description Fixes two attribute-name typos in `src/crawlee/_utils/sitemap.py` where the underscore prefix was missing on assignment, so each statement silently created a new public attribute instead of resetting the intended private state: - `_XMLSaxSitemapHandler.endElement` set `self.current_tag = None` instead of `self._current_tag = None`. The handler therefore never left the "inside a tracked tag" state, so stray text between elements kept being appended to the buffer and a duplicate close tag could re-process stale buffer contents. - `_TxtSitemapParser.flush` set `self.buffer = ''` instead of `self._buffer = ''`. Reusing the parser after `flush()` concatenated the leftover URL with the next chunk, yielding corrupted URLs like `https://b.com/https://c.com/`. Both fixes add the missing underscore. Regression tests cover the state reset in the XML handler and the buffer corruption in the TXT parser.
1 parent fe9464c commit 7db517a

2 files changed

Lines changed: 37 additions & 3 deletions

File tree

src/crawlee/_utils/sitemap.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def endElement(self, name: str) -> None:
122122
elif name == 'changefreq' and text in VALID_CHANGE_FREQS:
123123
self._current_url['changefreq'] = text
124124

125-
self.current_tag = None
125+
self._current_tag = None
126126

127127
if name == 'url' and 'loc' in self._current_url:
128128
self.items.append({'type': 'url', **self._current_url})
@@ -156,7 +156,7 @@ async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
156156
url = self._buffer.strip()
157157
if url:
158158
yield {'type': 'url', 'loc': url}
159-
self.buffer = ''
159+
self._buffer = ''
160160

161161
def close(self) -> None:
162162
"""Clean up resources."""

tests/unit/_utils/test_sitemap.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,15 @@
66

77
from yarl import URL
88

9-
from crawlee._utils.sitemap import ParseSitemapOptions, Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
9+
from crawlee._utils.sitemap import (
10+
ParseSitemapOptions,
11+
Sitemap,
12+
SitemapUrl,
13+
_TxtSitemapParser,
14+
_XMLSaxSitemapHandler,
15+
discover_valid_sitemaps,
16+
parse_sitemap,
17+
)
1018
from crawlee.http_clients._base import HttpClient, HttpResponse
1119

1220
BASIC_SITEMAP = """
@@ -347,6 +355,32 @@ async def test_discover_sitemaps_multiple_domains() -> None:
347355
}
348356

349357

358+
def test_xml_handler_resets_current_tag_on_end_element() -> None:
359+
"""Closing a tracked tag resets the handler's current tag so stray text between elements is ignored."""
360+
handler = _XMLSaxSitemapHandler()
361+
handler.startElement('urlset', MagicMock())
362+
handler.startElement('url', MagicMock())
363+
handler.startElement('loc', MagicMock())
364+
handler.characters('https://example.com/')
365+
handler.endElement('loc')
366+
367+
assert handler._current_tag is None
368+
369+
# Stray text between elements must not be buffered.
370+
handler.characters(' stray text ')
371+
assert handler._buffer == 'https://example.com/'
372+
373+
374+
async def test_txt_parser_flush_clears_buffer() -> None:
375+
"""Feeding more data after flush() must not concatenate the previously flushed URL."""
376+
parser = _TxtSitemapParser()
377+
items = [item async for item in parser.process_chunk('https://a.com/\nhttps://b.com/')]
378+
items += [item async for item in parser.flush()]
379+
items += [item async for item in parser.process_chunk('https://c.com/\n')]
380+
381+
assert [item['loc'] for item in items] == ['https://a.com/', 'https://b.com/', 'https://c.com/']
382+
383+
350384
async def test_discover_sitemap_url_without_host_skipped() -> None:
351385
"""URLs without a host are skipped."""
352386
http_client = _make_mock_client({})

0 commit comments

Comments
 (0)