Merge pull request #1885 from unclecode/develop

unclecode · web-flow · commit 1debe5f5fcc1 · 2026-03-30T09:59:58.000+07:00
docs: update version references to 0.8.6
diff --git a/README.md b/README.md
@@ -37,15 +37,15 @@ Limited slots._
 
 Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
 
-[✨ Check out latest update v0.8.5](#-recent-updates)
+[✨ Check out latest update v0.8.6](#-recent-updates)
 
-✨ **New in v0.8.5**: Anti-Bot Detection, Shadow DOM & 60+ Bug Fixes! Automatic 3-tier anti-bot detection with proxy escalation, Shadow DOM flattening, deep crawl cancellation, config defaults API, consent popup removal, and critical security patches. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.8.5.md)
+✨ **New in v0.8.6**: Security hotfix — replaced `litellm` with `unclecode-litellm` due to a PyPI supply chain compromise. If you're on v0.8.5 or earlier, please upgrade immediately.
 
-✨ Recent v0.8.0: Crash Recovery & Prefetch Mode! Deep crawl crash recovery with `resume_state` and `on_state_change` callbacks for long-running crawls. New `prefetch=True` mode for 5-10x faster URL discovery. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.8.0.md)
+✨ Recent v0.8.5: Anti-Bot Detection, Shadow DOM & 60+ Bug Fixes! Automatic 3-tier anti-bot detection with proxy escalation, Shadow DOM flattening, deep crawl cancellation, config defaults API, consent popup removal, and critical security patches. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.8.5.md)
 
-✨ Previous v0.7.8: Stability & Bug Fix Release! 11 bug fixes addressing Docker API issues, LLM extraction improvements, URL handling fixes, and dependency updates. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.8.md)
+✨ Previous v0.8.0: Crash Recovery & Prefetch Mode! Deep crawl crash recovery with `resume_state` and `on_state_change` callbacks for long-running crawls. New `prefetch=True` mode for 5-10x faster URL discovery. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.8.0.md)
 
-✨ Previous v0.7.7: Complete Self-Hosting Platform with Real-time Monitoring! Enterprise-grade monitoring dashboard, comprehensive REST API, WebSocket streaming, and smart browser pool management. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.7.md)
+✨ Previous v0.7.8: Stability & Bug Fix Release! 11 bug fixes addressing Docker API issues, LLM extraction improvements, URL handling fixes, and dependency updates. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.8.md)
 
 <details>
   <summary>🤓 <strong>My Personal Story</strong></summary>
@@ -565,6 +565,17 @@ async def test_news_crawl():
 ## ✨ Recent Updates
 
 <details open>
+<summary><strong>Version 0.8.6 — Security Hotfix: litellm Supply Chain Fix</strong></summary>
+
+Replaced `litellm` dependency with `unclecode-litellm` due to a PyPI supply chain compromise affecting the original package. If you're on v0.8.5 or earlier, upgrade immediately.
+
+```bash
+pip install -U crawl4ai
+```
+
+</details>
+
+<details>
 <summary><strong>Version 0.8.5 Release Highlights - Anti-Bot Detection, Shadow DOM & 60+ Bug Fixes</strong></summary>
 
 Our biggest release since v0.8.0. Anti-bot detection with proxy escalation, Shadow DOM flattening, deep crawl cancellation, and over 60 bug fixes.
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
@@ -1660,6 +1660,19 @@ def __init__(
             raise ValueError(
                 "chunking_strategy must be an instance of ChunkingStrategy"
             )
+        if self.markdown_generator is not None and not isinstance(
+            self.markdown_generator, MarkdownGenerationStrategy
+        ):
+            hint = ""
+            if isinstance(self.markdown_generator, dict):
+                hint = (
+                    ' The JSON format must be {"type": "<ClassName>", "params": {...}}.'
+                    ' Note: "params" is required — "options" or other keys are not recognized.'
+                )
+            raise ValueError(
+                "markdown_generator must be an instance of MarkdownGenerationStrategy, "
+                f"got {type(self.markdown_generator).__name__}.{hint}"
+            )
 
         # Set default chunking strategy if None
         if self.chunking_strategy is None:
diff --git a/deploy/docker/README.md b/deploy/docker/README.md
@@ -59,13 +59,13 @@ Pull and run images directly from Docker Hub without building locally.
 
 #### 1. Pull the Image
 
-Our latest stable release is `0.8.5`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
+Our latest stable release is `0.8.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
 
 ```bash
-# Pull the latest stable version (0.8.5)
-docker pull unclecode/crawl4ai:0.8.5
+# Pull the latest stable version (0.8.6)
+docker pull unclecode/crawl4ai:0.8.6
 
-# Or use the latest tag (points to 0.8.0)
+# Or use the latest tag
 docker pull unclecode/crawl4ai:latest
 ```
 
@@ -100,7 +100,7 @@ EOL
       -p 11235:11235 \
       --name crawl4ai \
       --shm-size=1g \
-      unclecode/crawl4ai:0.8.5
+      unclecode/crawl4ai:0.8.6
     ```
 
 *   **With LLM support:**
@@ -111,7 +111,7 @@ EOL
       --name crawl4ai \
       --env-file .llm.env \
       --shm-size=1g \
-      unclecode/crawl4ai:0.8.5
+      unclecode/crawl4ai:0.8.6
     ```
 
 > The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
@@ -184,7 +184,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
     ```bash
     # Pulls and runs the release candidate from Docker Hub
     # Automatically selects the correct architecture
-    IMAGE=unclecode/crawl4ai:0.8.5 docker compose up -d
+    IMAGE=unclecode/crawl4ai:0.8.6 docker compose up -d
     ```
 
 *   **Build and Run Locally:**
diff --git a/tests/test_markdown_generator_validation_1880.py b/tests/test_markdown_generator_validation_1880.py
@@ -0,0 +1,158 @@
+"""
+Tests for #1880: markdown_generator deserialization validation in CrawlerRunConfig
+
+Ensures that:
+1. Correct {"type": ..., "params": {...}} format deserializes properly
+2. Wrong key names ("options") raise a clear ValueError, not a cryptic AttributeError
+3. Nested content_filter deserializes correctly
+"""
+import pytest
+
+
+class TestMarkdownGeneratorDeserialization:
+    """Test markdown_generator validation via CrawlerRunConfig.load() and the constructor."""
+
+    def test_params_key_deserializes_correctly(self):
+        """{"type": ..., "params": {...}} should produce a real object."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+
+        data = {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "params": {},
+            }
+        }
+        config = CrawlerRunConfig.load(data)
+        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+        assert isinstance(config.markdown_generator, DefaultMarkdownGenerator)
+
+    def test_params_with_content_filter(self):
+        """Nested BM25ContentFilter should deserialize inside markdown_generator."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+        from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+        data = {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "params": {
+                    "content_filter": {
+                        "type": "BM25ContentFilter",
+                        "params": {
+                            "user_query": "example",
+                            "bm25_threshold": 0.9,
+                        },
+                    }
+                },
+            }
+        }
+        config = CrawlerRunConfig.load(data)
+        assert isinstance(config.markdown_generator.content_filter, BM25ContentFilter)
+        assert config.markdown_generator.content_filter.user_query == "example"
+        assert config.markdown_generator.content_filter.bm25_threshold == 0.9
+
+    def test_params_with_pruning_filter(self):
+        """PruningContentFilter should also work."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+        from crawl4ai.content_filter_strategy import PruningContentFilter
+
+        data = {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "params": {
+                    "content_filter": {
+                        "type": "PruningContentFilter",
+                        "params": {},
+                    }
+                },
+            }
+        }
+        config = CrawlerRunConfig.load(data)
+        assert isinstance(config.markdown_generator.content_filter, PruningContentFilter)
+
+    def test_options_key_raises_clear_error(self):
+        """Using "options" instead of "params" should raise ValueError with hint."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+
+        data = {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "options": {"content_filter": {}},
+            }
+        }
+        with pytest.raises(ValueError, match="params.*required"):
+            CrawlerRunConfig.load(data)
+
+    def test_arbitrary_key_raises_clear_error(self):
+        """Any non-"params" key should raise ValueError."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+
+        data = {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "settings": {},
+            }
+        }
+        with pytest.raises(ValueError, match="markdown_generator must be an instance"):
+            CrawlerRunConfig.load(data)
+
+    def test_plain_dict_raises_clear_error(self):
+        """A dict without type/params structure should raise ValueError."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+
+        data = {
+            "markdown_generator": {"foo": "bar"}
+        }
+        with pytest.raises(ValueError, match="got dict"):
+            CrawlerRunConfig.load(data)
+
+    def test_error_message_mentions_params_key(self):
+        """Error message should specifically mention that 'params' is required."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+
+        data = {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "options": {},
+            }
+        }
+        with pytest.raises(ValueError) as exc_info:
+            CrawlerRunConfig.load(data)
+        msg = str(exc_info.value)
+        assert "params" in msg
+        assert "options" in msg or "not recognized" in msg
+
+    def test_none_markdown_generator_uses_default(self):
+        """Explicit None should pass validation and be kept as None on the config."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+        config = CrawlerRunConfig(markdown_generator=None)
+        # None is allowed — the crawler falls back to default behavior
+        assert config.markdown_generator is None
+
+    def test_valid_instance_passes_validation(self):
+        """Passing an actual instance should work fine."""
+        from crawl4ai.async_configs import CrawlerRunConfig
+        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+        from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+        gen = DefaultMarkdownGenerator(
+            content_filter=BM25ContentFilter(user_query="test")
+        )
+        config = CrawlerRunConfig(markdown_generator=gen)
+        assert config.markdown_generator is gen
+        assert config.markdown_generator.content_filter.user_query == "test"
+
+
+class TestExistingValidationStillWorks:
+    """Ensure existing extraction_strategy/chunking_strategy validation is unchanged."""
+
+    def test_extraction_strategy_dict_raises(self):
+        from crawl4ai.async_configs import CrawlerRunConfig
+        with pytest.raises(ValueError, match="extraction_strategy"):
+            CrawlerRunConfig(extraction_strategy={"type": "bad"})
+
+    def test_chunking_strategy_dict_raises(self):
+        from crawl4ai.async_configs import CrawlerRunConfig
+        with pytest.raises(ValueError, match="chunking_strategy"):
+            CrawlerRunConfig(chunking_strategy={"type": "bad"})