Skip to content

Commit 8ab3f95

Browse files
authored
fix: Do not raise KeyError in parse_sitemap when partial options are provided (#1940)
## Description

- `ParseSitemapOptions` is declared as `TypedDict(total=False)`, so every key is optional, but `parse_sitemap` read `emit_nested_sitemaps`, `max_depth`, and `sitemap_retries` with direct subscripting. Any caller passing a partial options dict (e.g. just `timeout`) via `Sitemap.load()`, `Sitemap.parse()`, or `parse_sitemap()` hit a `KeyError`.
- Read all options uniformly with `.get()` and the existing defaults, collapsing the duplicated `if/else` branches.
- Add a regression test covering a partial options dict.
1 parent b7c62a2 commit 8ab3f95

2 files changed

Lines changed: 15 additions & 13 deletions

File tree

src/crawlee/_utils/sitemap.py

Lines changed: 5 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -437,17 +437,11 @@ async def parse_sitemap(
437437
up to the specified maximum depth.
438438
"""
439439
# Set default options
440-
default_timeout = timedelta(seconds=30)
441-
if options:
442-
emit_nested_sitemaps = options['emit_nested_sitemaps']
443-
max_depth = options['max_depth']
444-
sitemap_retries = options['sitemap_retries']
445-
timeout = options.get('timeout', default_timeout)
446-
else:
447-
emit_nested_sitemaps = False
448-
max_depth = float('inf')
449-
sitemap_retries = 3
450-
timeout = default_timeout
440+
options = options or {}
441+
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
442+
max_depth = options.get('max_depth', float('inf'))
443+
sitemap_retries = options.get('sitemap_retries', 3)
444+
timeout = options.get('timeout', timedelta(seconds=30))
451445

452446
# Setup working state
453447
sources = list(initial_sources)

tests/unit/_utils/test_sitemap.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,12 @@
11
import base64
22
import gzip
3-
from datetime import datetime
3+
from datetime import datetime, timedelta
44
from typing import Any
55
from unittest.mock import AsyncMock, MagicMock
66

77
from yarl import URL
88

9-
from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
9+
from crawlee._utils.sitemap import ParseSitemapOptions, Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
1010
from crawlee.http_clients._base import HttpClient, HttpResponse
1111

1212
BASIC_SITEMAP = """
@@ -267,6 +267,14 @@ async def test_sitemap_from_string() -> None:
267267
assert set(sitemap.urls) == BASIC_RESULTS
268268

269269

270+
async def test_parse_sitemap_with_partial_options() -> None:
271+
"""Test that missing keys in partial `ParseSitemapOptions` fall back to defaults."""
272+
options = ParseSitemapOptions(timeout=timedelta(seconds=10))
273+
items = [item async for item in parse_sitemap([{'type': 'raw', 'content': BASIC_SITEMAP}], options=options)]
274+
275+
assert {item.loc for item in items} == BASIC_RESULTS
276+
277+
270278
async def test_discover_sitemap_from_robots_txt() -> None:
271279
"""Sitemap URL found in robots.txt is yielded."""
272280
robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'

0 commit comments

Comments
 (0)