|
| 1 | +""" |
| 2 | +Tests for #1880: markdown_generator deserialization validation in CrawlerRunConfig |
| 3 | +
|
| 4 | +Ensures that: |
| 5 | +1. Correct {"type": ..., "params": {...}} format deserializes properly |
| 6 | +2. Wrong key names ("options") raise a clear ValueError, not a cryptic AttributeError |
| 7 | +3. Nested content_filter deserializes correctly |
| 8 | +""" |
| 9 | +import pytest |
| 10 | + |
| 11 | + |
class TestMarkdownGeneratorDeserialization:
    """Exercise ``markdown_generator`` handling in ``CrawlerRunConfig``.

    Covers three groups of behaviour:

    * well-formed ``{"type": ..., "params": {...}}`` payloads passed to
      ``CrawlerRunConfig.load()`` deserialize into real objects;
    * malformed payloads passed to ``CrawlerRunConfig.load()`` raise
      ``ValueError``;
    * direct construction with ``None`` or with a ready-made instance
      keeps the value that was passed in.

    The crawl4ai imports are kept local to each test, exactly as the tests
    were originally written.
    """

    def test_params_key_deserializes_correctly(self):
        """A payload using the "params" key should produce a real generator."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {},
            }
        }
        config = CrawlerRunConfig.load(data)
        assert isinstance(config.markdown_generator, DefaultMarkdownGenerator)

    def test_params_with_content_filter(self):
        """A nested BM25ContentFilter should deserialize with its parameters."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.content_filter_strategy import BM25ContentFilter

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "BM25ContentFilter",
                        "params": {
                            "user_query": "example",
                            "bm25_threshold": 0.9,
                        },
                    }
                },
            }
        }
        config = CrawlerRunConfig.load(data)
        content_filter = config.markdown_generator.content_filter
        assert isinstance(content_filter, BM25ContentFilter)
        assert content_filter.user_query == "example"
        assert content_filter.bm25_threshold == 0.9

    def test_params_with_pruning_filter(self):
        """A nested PruningContentFilter should deserialize as well."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.content_filter_strategy import PruningContentFilter

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {},
                    }
                },
            }
        }
        config = CrawlerRunConfig.load(data)
        assert isinstance(config.markdown_generator.content_filter, PruningContentFilter)

    def test_options_key_raises_clear_error(self):
        """Using "options" instead of "params" should raise ValueError with a hint."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "options": {"content_filter": {}},
            }
        }
        with pytest.raises(ValueError, match="params.*required"):
            CrawlerRunConfig.load(data)

    def test_arbitrary_key_raises_clear_error(self):
        """Any key other than "params" next to "type" should raise ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "settings": {},
            }
        }
        with pytest.raises(ValueError, match="markdown_generator must be an instance"):
            CrawlerRunConfig.load(data)

    def test_plain_dict_raises_clear_error(self):
        """A dict lacking the type/params structure should raise ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {"foo": "bar"}
        }
        with pytest.raises(ValueError, match="got dict"):
            CrawlerRunConfig.load(data)

    def test_error_message_mentions_params_key(self):
        """The error text should name "params" and point at the offending key."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "options": {},
            }
        }
        with pytest.raises(ValueError) as exc_info:
            CrawlerRunConfig.load(data)
        msg = str(exc_info.value)
        assert "params" in msg
        # Either the bad key is echoed back or it is called out generically.
        assert "options" in msg or "not recognized" in msg

    def test_none_markdown_generator_uses_default(self):
        """An explicit ``None`` should pass validation and be stored as ``None``.

        The config does not substitute a generator instance here; the
        attribute stays ``None``.
        """
        from crawl4ai.async_configs import CrawlerRunConfig

        config = CrawlerRunConfig(markdown_generator=None)
        # Original author's note: None is allowed because the crawler falls
        # back to default behavior (not verified by this test).
        assert config.markdown_generator is None

    def test_valid_instance_passes_validation(self):
        """Passing a real generator instance should be kept as-is."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
        from crawl4ai.content_filter_strategy import BM25ContentFilter

        gen = DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(user_query="test")
        )
        config = CrawlerRunConfig(markdown_generator=gen)
        assert config.markdown_generator is gen
        assert config.markdown_generator.content_filter.user_query == "test"
| 145 | + |
| 146 | + |
class TestExistingValidationStillWorks:
    """Regression guard for extraction_strategy/chunking_strategy validation.

    Both tests hand a raw dict to the ``CrawlerRunConfig`` constructor and
    expect a ``ValueError`` whose message names the offending parameter.
    """

    def test_extraction_strategy_dict_raises(self):
        """A dict passed as ``extraction_strategy`` should raise ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        bad_value = {"type": "bad"}
        with pytest.raises(ValueError, match="extraction_strategy"):
            CrawlerRunConfig(extraction_strategy=bad_value)

    def test_chunking_strategy_dict_raises(self):
        """A dict passed as ``chunking_strategy`` should raise ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        bad_value = {"type": "bad"}
        with pytest.raises(ValueError, match="chunking_strategy"):
            CrawlerRunConfig(chunking_strategy=bad_value)
0 commit comments