|
| 1 | +"""Test delayed redirect WITH wait_for - does link resolution use correct URL?""" |
| 2 | +import asyncio |
| 3 | +import threading |
| 4 | +from http.server import HTTPServer, SimpleHTTPRequestHandler |
| 5 | + |
class RedirectTestHandler(SimpleHTTPRequestHandler):
    """Serve the two pages used by the delayed-redirect scenario.

    * ``/page-a`` returns a page whose inline script navigates to
      ``/redirect-target/`` after 200 ms.
    * Any path starting with ``/redirect-target`` returns a page containing a
      ``#target-nav`` element with two *relative* links (``subpage-1`` and
      ``subpage-2``).
    * Every other path gets a bare 404 with no body.
    """

    # Page that triggers the client-side (JavaScript) redirect.
    _PAGE_A_HTML = """
        <!DOCTYPE html>
        <html>
        <head><title>Page A</title></head>
        <body>
            <h1>Page A - Will redirect after 200ms</h1>
            <script>
                setTimeout(function() {
                    window.location.href = '/redirect-target/';
                }, 200);
            </script>
        </body>
        </html>
        """

    # Landing page; its relative hrefs only resolve correctly against the
    # post-redirect URL, which is what the scenario checks.
    _REDIRECT_TARGET_HTML = """
        <!DOCTYPE html>
        <html>
        <head><title>Redirect Target</title></head>
        <body>
            <h1>Redirect Target</h1>
            <nav id="target-nav">
                <a href="subpage-1">Subpage 1</a>
                <a href="subpage-2">Subpage 2</a>
            </nav>
        </body>
        </html>
        """

    def log_message(self, format, *args):
        """Discard per-request log lines so the script's own output stays readable.

        The parameter is named ``format`` to match the signature of the
        method being overridden.
        """
        pass

    def _send_html(self, markup):
        """Send *markup* as a complete ``200 text/html`` response.

        The body is encoded once as UTF-8 so the declared charset and the
        Content-Length header both describe exactly the bytes written.
        """
        payload = markup.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def do_GET(self):
        """Route a GET request to one of the two test pages, or answer 404."""
        if self.path == "/page-a":
            self._send_html(self._PAGE_A_HTML)
        elif self.path.startswith("/redirect-target"):
            self._send_html(self._REDIRECT_TARGET_HTML)
        else:
            self.send_response(404)
            self.end_headers()
| 51 | + |
async def main():
    """Run the delayed-JS-redirect scenario with ``wait_for`` and print findings.

    Starts a local HTTP server on ``localhost:8769`` backed by
    ``RedirectTestHandler``, crawls ``/page-a`` with
    ``wait_for="css:#target-nav"`` (an element that only exists on the
    redirect target), and reports two things:

    1. whether ``result.redirected_url`` points at ``/redirect-target``;
    2. whether the relative ``subpage-*`` links were resolved against the
       redirect target rather than the original URL.

    The crawl4ai development checkout prepended to ``sys.path`` can be
    overridden with the ``CRAWL4AI_DEV_PATH`` environment variable; the path
    is only added when it exists, so an installed crawl4ai is used otherwise.
    """
    import os
    import sys

    class ReuseAddrHTTPServer(HTTPServer):
        # Explicitly allow re-binding the port on quick successive runs.
        allow_reuse_address = True

    host, port = "localhost", 8769
    start_url = f"http://{host}:{port}/page-a"

    # The context manager closes the listening socket on exit, which a bare
    # shutdown() does not do.
    with ReuseAddrHTTPServer((host, port), RedirectTestHandler) as server:
        thread = threading.Thread(target=server.serve_forever, daemon=True)
        thread.start()

        try:
            dev_checkout = os.environ.get(
                "CRAWL4AI_DEV_PATH", "/Users/nasrin/vscode/c4ai-uc/develop"
            )
            if os.path.isdir(dev_checkout):
                sys.path.insert(0, dev_checkout)
            from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

            print("=" * 60)
            print("TEST: Delayed JS redirect WITH wait_for='css:#target-nav'")
            print("This waits for the redirect to complete")
            print("=" * 60)

            browser_config = BrowserConfig(headless=True, verbose=False)
            crawl_config = CrawlerRunConfig(
                cache_mode="bypass",
                wait_for="css:#target-nav",  # Wait for element on redirect target
            )

            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(url=start_url, config=crawl_config)

            html = result.html or ""
            print(f"Original URL: {start_url}")
            print(f"Redirected URL returned: {result.redirected_url}")
            print(f"HTML contains 'Redirect Target': {'Redirect Target' in html}")
            print()

            if "/redirect-target" in (result.redirected_url or ""):
                print("✓ redirected_url is CORRECT")
            else:
                print("✗ BUG #1: redirected_url is WRONG - still shows original URL!")

            # Check links
            all_links = []
            if isinstance(result.links, dict):
                all_links = result.links.get("internal", []) + result.links.get("external", [])

            subpage_hrefs = []
            for link in all_links:
                if isinstance(link, dict):
                    href = link.get("href")
                else:
                    href = getattr(link, "href", None)
                href = href or ""
                if "subpage" in href:
                    subpage_hrefs.append(href)

            print(f"\nLinks found ({len(all_links)} total):")
            bug_found = False
            for href in subpage_hrefs:
                print(f"  {href}")
                # A relative "subpage-N" resolved against ".../page-a" (no
                # trailing slash) becomes "/subpage-N", not "/page-a/subpage-N",
                # so anything outside /redirect-target/ counts as a wrong base.
                if "/redirect-target/" in href:
                    print("    ^^^ CORRECT")
                else:
                    print("    ^^^ BUG #2: Link resolved with WRONG base URL!")
                    bug_found = True

            if not subpage_hrefs:
                # Without any subpage link there is no evidence either way.
                print("\n✗ No subpage links found - link resolution could not be verified")
            elif not bug_found:
                print("\n✓ Link resolution is CORRECT")

        finally:
            server.shutdown()
            # Let the serving thread finish before the socket is closed.
            thread.join(timeout=5)
| 116 | + |
if __name__ == "__main__":
    # Run the redirect scenario only when this file is executed as a script.
    asyncio.run(main())
0 commit comments