Skip to content

Commit 07ccf13

Browse files
committed
Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268
1 parent b36c6da commit 07ccf13

File tree

2 files changed

+124
-0
lines changed

2 files changed

+124
-0
lines changed

crawl4ai/async_crawler_strategy.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,12 @@ async def get_delayed_content(delay: float = 5.0) -> str:
10231023
final_messages = await self.adapter.retrieve_console_messages(page)
10241024
captured_console.extend(final_messages)
10251025

1026+
###
1027+
# This ensures we capture the current page URL at the time we return the response,
1028+
# which correctly reflects any JavaScript navigation that occurred.
1029+
###
1030+
redirected_url = page.url # Use current page URL to capture JS redirects
1031+
10261032
# Return complete response
10271033
return AsyncCrawlResponse(
10281034
html=html,
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""Test delayed redirect WITH wait_for - does link resolution use correct URL?"""
2+
import asyncio
3+
import threading
4+
from http.server import HTTPServer, SimpleHTTPRequestHandler
5+
6+
class RedirectTestHandler(SimpleHTTPRequestHandler):
    """Serve the two fixed HTML pages used by the delayed-redirect test.

    Routes handled by ``do_GET``:

    * ``/page-a`` -- a page whose inline script sets ``window.location.href``
      to ``/redirect-target/`` after a 200 ms ``setTimeout``.
    * any path starting with ``/redirect-target`` -- a page containing a
      ``<nav id="target-nav">`` with two relative links (``subpage-1`` and
      ``subpage-2``).
    * anything else -- an empty 404 response.
    """

    # Markup for the starting page; the script triggers the JS navigation.
    _PAGE_A_HTML = """
<!DOCTYPE html>
<html>
<head><title>Page A</title></head>
<body>
    <h1>Page A - Will redirect after 200ms</h1>
    <script>
        setTimeout(function() {
            window.location.href = '/redirect-target/';
        }, 200);
    </script>
</body>
</html>
"""

    # Markup for the navigation target; the links are relative on purpose so
    # that their resolution depends on the base URL chosen by the crawler.
    _REDIRECT_TARGET_HTML = """
<!DOCTYPE html>
<html>
<head><title>Redirect Target</title></head>
<body>
    <h1>Redirect Target</h1>
    <nav id="target-nav">
        <a href="subpage-1">Subpage 1</a>
        <a href="subpage-2">Subpage 2</a>
    </nav>
</body>
</html>
"""

    def log_message(self, format, *args):
        """Discard request log lines so the test output stays readable."""
        # Parameter names mirror the base-class signature.
        pass

    def do_GET(self):
        """Dispatch a GET request to one of the fixed pages or reply 404."""
        if self.path == "/page-a":
            self._send_html(self._PAGE_A_HTML)
        elif self.path.startswith("/redirect-target"):
            self._send_html(self._REDIRECT_TARGET_HTML)
        else:
            self.send_response(404)
            self.end_headers()

    def _send_html(self, markup):
        """Write a 200 response carrying *markup* as ``text/html``."""
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(markup.encode())
51+
52+
async def main():
    """Crawl a page that JS-redirects after a delay and print what was observed.

    Starts a local HTTP server backed by ``RedirectTestHandler`` on
    ``localhost:8769``, crawls ``/page-a`` with ``wait_for="css:#target-nav"``
    (an element that only exists on the redirect target), and prints:

    * the ``redirected_url`` reported by the crawl result, flagged as correct
      when it contains ``/redirect-target``;
    * every collected link whose href contains ``subpage``, flagged as a bug
      when it contains ``/page-a/`` and as correct when it contains
      ``/redirect-target/``.

    Results are only printed; nothing is asserted or returned. ``crawl4ai``
    must be importable in the running environment (installed package or
    ``PYTHONPATH``).
    """

    class ReuseAddrHTTPServer(HTTPServer):
        # Lets the fixed port be rebound immediately on repeated runs.
        allow_reuse_address = True

    host, port = "localhost", 8769
    start_url = f"http://{host}:{port}/page-a"

    server = ReuseAddrHTTPServer((host, port), RedirectTestHandler)
    thread = threading.Thread(target=server.serve_forever)
    thread.daemon = True
    thread.start()

    try:
        # Imported here so the local server is already up, and so a missing
        # dependency still goes through the cleanup in ``finally``.
        from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

        print("=" * 60)
        print("TEST: Delayed JS redirect WITH wait_for='css:#target-nav'")
        print("This waits for the redirect to complete")
        print("=" * 60)

        browser_config = BrowserConfig(headless=True, verbose=False)
        crawl_config = CrawlerRunConfig(
            cache_mode="bypass",
            wait_for="css:#target-nav",  # Wait for element on redirect target
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=start_url, config=crawl_config)

            print(f"Original URL: {start_url}")
            print(f"Redirected URL returned: {result.redirected_url}")
            print(f"HTML contains 'Redirect Target': {'Redirect Target' in result.html}")
            print()

            if "/redirect-target" in (result.redirected_url or ""):
                print("✓ redirected_url is CORRECT")
            else:
                print("✗ BUG #1: redirected_url is WRONG - still shows original URL!")

            # Check links
            all_links = []
            if isinstance(result.links, dict):
                all_links = result.links.get("internal", []) + result.links.get("external", [])

            print(f"\nLinks found ({len(all_links)} total):")
            bug_found = False
            for link in all_links:
                # Links may be plain dicts or objects exposing ``href``.
                href = link.get("href", "") if isinstance(link, dict) else getattr(link, 'href', "")
                if "subpage" in href:
                    print(f"  {href}")
                    if "/page-a/" in href:
                        print("    ^^^ BUG #2: Link resolved with WRONG base URL!")
                        bug_found = True
                    elif "/redirect-target/" in href:
                        print("    ^^^ CORRECT")

            if not bug_found and all_links:
                print("\n✓ Link resolution is CORRECT")

    finally:
        # Stop the serve loop, release the listening socket, then wait for
        # the worker thread so nothing is left running after the test.
        server.shutdown()
        server.server_close()
        thread.join()


if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)