Skip to content

Commit 5a8fb57

Browse files
authored
Merge pull request #1648 from christopher-w-murphy/fix/content-relevance-filter
[Fix]: Docker server does not decode ContentRelevanceFilter
2 parents df4d87e + 6893094 commit 5a8fb57

File tree

7 files changed

+169
-53
lines changed

7 files changed

+169
-53
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
10341034

10351035
| Company | About | Sponsorship Tier |
10361036
|------|------|----------------------------|
1037-
| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a> | Scrapeless is the best full-stack web scraping toolkit offering Scraping API, Scraping Browser, Web Unlocker, Captcha Solver, and Proxies, designed to handle all your data collection needs. | 🥈 Silver |
1037+
| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a> | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
10381038
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
10391039
| <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
10401040
| <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |

crawl4ai/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@
7272
BestFirstCrawlingStrategy,
7373
DFSDeepCrawlStrategy,
7474
DeepCrawlDecorator,
75+
ContentRelevanceFilter,
76+
ContentTypeScorer,
7577
)
7678
# NEW: Import AsyncUrlSeeder
7779
from .async_url_seeder import AsyncUrlSeeder

crawl4ai/async_configs.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1+
import importlib
12
import os
2-
from typing import Union
33
import warnings
44
import requests
55
from .config import (
@@ -27,23 +27,22 @@
2727
from .cache_context import CacheMode
2828
from .proxy_strategy import ProxyRotationStrategy
2929

30-
from typing import Union, List, Callable
3130
import inspect
32-
from typing import Any, Dict, Optional
31+
from typing import Any, Callable, Dict, List, Optional, Union
3332
from enum import Enum
3433

3534
# Type alias for URL matching
3635
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
3736

37+
3838
class MatchMode(Enum):
3939
OR = "or"
4040
AND = "and"
4141

4242
# from .proxy_strategy import ProxyConfig
4343

4444

45-
46-
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
45+
def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
4746
"""
4847
Recursively convert an object to a serializable dictionary using {type, params} structure
4948
for complex objects.
@@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
110109
# if value is not None:
111110
# current_values[attr_name] = to_serializable_dict(value)
112111

113-
114-
115112
return {
116113
"type": obj.__class__.__name__,
117114
"params": current_values
@@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any:
137134
if data["type"] == "dict" and "value" in data:
138135
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
139136

140-
# Import from crawl4ai for class instances
141-
import crawl4ai
142-
143-
if hasattr(crawl4ai, data["type"]):
144-
cls = getattr(crawl4ai, data["type"])
137+
cls = None
138+
# If you are receiving an error while trying to convert a dict to an object:
139+
# Either add a module to the `module_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
140+
module_paths = ["crawl4ai"]
141+
for module_path in module_paths:
142+
try:
143+
mod = importlib.import_module(module_path)
144+
if hasattr(mod, data["type"]):
145+
cls = getattr(mod, data["type"])
146+
break
147+
except (ImportError, AttributeError):
148+
continue
145149

150+
if cls is not None:
146151
# Handle Enum
147152
if issubclass(cls, Enum):
148153
return cls(data["params"])

crawl4ai/deep_crawling/filters.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -509,18 +509,22 @@ def apply(self, url: str) -> bool:
509509
class ContentRelevanceFilter(URLFilter):
510510
"""BM25-based relevance filter using head section content"""
511511

512-
__slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
512+
__slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")
513513

514514
def __init__(
515515
self,
516-
query: str,
516+
query: Union[str, List[str]],
517517
threshold: float,
518518
k1: float = 1.2,
519519
b: float = 0.75,
520520
avgdl: int = 1000,
521521
):
522522
super().__init__(name="BM25RelevanceFilter")
523-
self.query_terms = self._tokenize(query)
523+
if isinstance(query, list):
524+
self.query = " ".join(query)
525+
else:
526+
self.query = query
527+
self.query_terms = self._tokenize(self.query)
524528
self.threshold = threshold
525529
self.k1 = k1 # TF saturation parameter
526530
self.b = b # Length normalization parameter

crawl4ai/docker_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ async def stream_results() -> AsyncGenerator[CrawlResult, None]:
180180
yield CrawlResult(**result)
181181
return stream_results()
182182

183-
response = await self._request("POST", "/crawl", json=data)
183+
response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout)
184184
result_data = response.json()
185185
if not result_data.get("success", False):
186186
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import json
2+
import asyncio
3+
from urllib.parse import quote, urlencode
4+
from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler
5+
6+
# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers:
7+
# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration
8+
9+
async def main():
    """Crawl the Scrapeless homepage through a Scrapeless cloud browser over CDP.

    Builds a custom browser fingerprint, sends it together with the session
    options as query parameters of the Scrapeless WebSocket endpoint, connects
    ``AsyncWebCrawler`` to that endpoint in CDP mode, and prints the status
    code, title and description of the crawled page.
    """
    # customize browser fingerprint
    fingerprint = {
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36",
        "platform": "Windows",
        "screen": {
            "width": 1280, "height": 1024
        },
        "localization": {
            "languages": ["zh-HK", "en-US", "en"], "timezone": "Asia/Hong_Kong",
        }
    }

    # The fingerprint travels as a single query value: serialize it to JSON and
    # percent-encode it here; urlencode() below escapes it once more when the
    # query string is assembled.
    fingerprint_json = json.dumps(fingerprint)
    encoded_fingerprint = quote(fingerprint_json)

    scrapeless_params = {
        "token": "your token",
        "sessionTTL": 1000,
        "sessionName": "Demo",
        "fingerprint": encoded_fingerprint,
        # Sets the target country/region for the proxy, sending requests via an IP address from that region.
        # You can specify a country code (e.g., US for the United States, GB for the United Kingdom,
        # ANY for any country). See country codes for all supported options.
        # "proxyCountry": "ANY",
        # create profile on scrapeless
        # "profileId": "your profileId",
        # For more usage details, please refer to https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started
    }
    query_string = urlencode(scrapeless_params)
    scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}"

    async with AsyncWebCrawler(
        config=BrowserConfig(
            headless=False,
            browser_mode="cdp",
            cdp_url=scrapeless_connection_url,
        )
    ) as crawler:
        result = await crawler.arun(
            url="https://www.scrapeless.com/en",
            config=CrawlerRunConfig(
                wait_for="css:.content",
                scan_full_page=True,
            ),
        )

        # Guard against absent metadata or missing keys so the demo prints
        # "None" for a field instead of raising KeyError/TypeError.
        metadata = result.metadata or {}

        print("-" * 20)
        print(f'Status Code: {result.status_code}')
        print("-" * 20)
        print(f'Title: {metadata.get("title")}')
        print(f'Description: {metadata.get("description")}')
        print("-" * 20)
58+
59+
if __name__ == "__main__":
60+
asyncio.run(main())
61+

tests/docker/test_filter_deep_crawl.py

Lines changed: 80 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,31 @@
11
"""
22
Test the complete fix for both the filter serialization and JSON serialization issues.
33
"""
4+
import os
5+
import traceback
6+
from typing import Any
47

58
import asyncio
69
import httpx
710

811
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
9-
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
12+
from crawl4ai.deep_crawling import (
13+
BFSDeepCrawlStrategy,
14+
ContentRelevanceFilter,
15+
FilterChain,
16+
URLFilter,
17+
URLPatternFilter,
18+
)
1019

11-
BASE_URL = "http://localhost:11234/" # Adjust port as needed
20+
# Port of the Crawl4AI Docker server under test; override it with the
# CRAWL4AI_DOCKER_PORT environment variable.
CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234")
try:
    BASE_PORT = int(CRAWL4AI_DOCKER_PORT)
except (TypeError, ValueError):
    # int() raises ValueError for a non-numeric string (e.g. "abc" or "");
    # catching only TypeError would crash at import time instead of falling
    # back to the default port.
    BASE_PORT = 11234
BASE_URL = f"http://localhost:{BASE_PORT}/"  # Adjust port as needed
1226

13-
async def test_with_docker_client():
27+
28+
async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool:
1429
"""Test using the Docker client (same as 1419.py)."""
1530
from crawl4ai.docker_client import Crawl4aiDockerClient
1631

@@ -24,19 +39,10 @@ async def test_with_docker_client():
2439
verbose=True,
2540
) as client:
2641

27-
# Create filter chain - testing the serialization fix
28-
filter_chain = [
29-
URLPatternFilter(
30-
# patterns=["*about*", "*privacy*", "*terms*"],
31-
patterns=["*advanced*"],
32-
reverse=True
33-
),
34-
]
35-
3642
crawler_config = CrawlerRunConfig(
3743
deep_crawl_strategy=BFSDeepCrawlStrategy(
3844
max_depth=2, # Keep it shallow for testing
39-
# max_pages=5, # Limit pages for testing
45+
max_pages=max_pages, # Limit pages for testing
4046
filter_chain=FilterChain(filter_chain)
4147
),
4248
cache_mode=CacheMode.BYPASS,
@@ -47,6 +53,7 @@ async def test_with_docker_client():
4753
["https://docs.crawl4ai.com"], # Simple test page
4854
browser_config=BrowserConfig(headless=True),
4955
crawler_config=crawler_config,
56+
hooks_timeout=timeout,
5057
)
5158

5259
if results:
@@ -74,12 +81,11 @@ async def test_with_docker_client():
7481

7582
except Exception as e:
7683
print(f"❌ Docker client test failed: {e}")
77-
import traceback
7884
traceback.print_exc()
7985
return False
8086

8187

82-
async def test_with_rest_api():
88+
async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool:
8389
"""Test using REST API directly."""
8490
print("\n" + "=" * 60)
8591
print("Testing with REST API")
@@ -90,19 +96,11 @@ async def test_with_rest_api():
9096
"type": "BFSDeepCrawlStrategy",
9197
"params": {
9298
"max_depth": 2,
93-
# "max_pages": 5,
99+
"max_pages": max_pages,
94100
"filter_chain": {
95101
"type": "FilterChain",
96102
"params": {
97-
"filters": [
98-
{
99-
"type": "URLPatternFilter",
100-
"params": {
101-
"patterns": ["*advanced*"],
102-
"reverse": True
103-
}
104-
}
105-
]
103+
"filters": filters
106104
}
107105
}
108106
}
@@ -126,7 +124,7 @@ async def test_with_rest_api():
126124
response = await client.post(
127125
f"{BASE_URL}crawl",
128126
json=crawl_payload,
129-
timeout=30
127+
timeout=timeout,
130128
)
131129

132130
if response.status_code == 200:
@@ -150,7 +148,6 @@ async def test_with_rest_api():
150148

151149
except Exception as e:
152150
print(f"❌ REST API test failed: {e}")
153-
import traceback
154151
traceback.print_exc()
155152
return False
156153

@@ -165,12 +162,62 @@ async def main():
165162
results = []
166163

167164
# Test 1: Docker client
168-
docker_passed = await test_with_docker_client()
169-
results.append(("Docker Client", docker_passed))
165+
max_pages_ = [20, 5]
166+
timeouts = [30, 60]
167+
filter_chain_test_cases = [
168+
[
169+
URLPatternFilter(
170+
# patterns=["*about*", "*privacy*", "*terms*"],
171+
patterns=["*advanced*"],
172+
reverse=True
173+
),
174+
],
175+
[
176+
ContentRelevanceFilter(
177+
query="about faq",
178+
threshold=0.2,
179+
),
180+
],
181+
]
182+
for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)):
183+
docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout)
184+
results.append((f"Docker Client w/ filter chain {idx}", docker_passed))
170185

171186
# Test 2: REST API
172-
rest_passed = await test_with_rest_api()
173-
results.append(("REST API", rest_passed))
187+
max_pages_ = [20, 5, 5]
188+
timeouts = [30, 60, 60]
189+
filters_test_cases = [
190+
[
191+
{
192+
"type": "URLPatternFilter",
193+
"params": {
194+
"patterns": ["*advanced*"],
195+
"reverse": True
196+
}
197+
}
198+
],
199+
[
200+
{
201+
"type": "ContentRelevanceFilter",
202+
"params": {
203+
"query": "about faq",
204+
"threshold": 0.2,
205+
}
206+
}
207+
],
208+
[
209+
{
210+
"type": "ContentRelevanceFilter",
211+
"params": {
212+
"query": ["about", "faq"],
213+
"threshold": 0.2,
214+
}
215+
}
216+
],
217+
]
218+
for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)):
219+
rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout)
220+
results.append((f"REST API w/ filters {idx}", rest_passed))
174221

175222
# Summary
176223
print("\n" + "=" * 60)
@@ -186,10 +233,7 @@ async def main():
186233

187234
print("=" * 60)
188235
if all_passed:
189-
print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
190-
print("\nThe fixes:")
191-
print("1. Filter serialization: Fixed by not serializing private __slots__")
192-
print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
236+
print("🎉 ALL TESTS PASSED!")
193237
else:
194238
print("⚠️ Some tests failed. Please check the server logs for details.")
195239

@@ -198,4 +242,4 @@ async def main():
198242

199243
if __name__ == "__main__":
200244
import sys
201-
sys.exit(asyncio.run(main()))
245+
sys.exit(asyncio.run(main()))

0 commit comments

Comments
 (0)