03_error_handling.py
#!/usr/bin/env python3
"""Error handling examples.

Run: python 03_error_handling.py
"""
import easyscrape as es
from easyscrape.exceptions import (
    EasyScrapeError,
    NetworkError,
    HTTPError,
    InvalidURLError,
    RequestTimeout,
    RateLimitHit,
    RetryExhausted,
)
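
# NOTE (assumption): the handlers below rely on every imported exception
# subclassing EasyScrapeError, the library's base error, so a single
# `except EasyScrapeError` acts as the catch-all. RateLimitHit is imported
# for completeness; none of the examples below deliberately trigger it.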


def catch_all_errors():
    """Catch any EasyScrape error."""
    try:
        result = es.scrape("https://httpbin.org/status/500")
    except EasyScrapeError as e:
        print(f" Caught: {type(e).__name__}: {e}")


def specific_errors():
    """Handle specific error types."""
    urls = [
        ("https://httpbin.org/status/404", "Not Found"),
        ("https://httpbin.org/status/500", "Server Error"),
        ("https://httpbin.org/delay/10", "Timeout"),
    ]
    config = es.Config(timeout=2.0, max_retries=1)
    for url, description in urls:
        print(f"\n Testing: {description}")
        try:
            result = es.scrape(url, config=config)
            print(f" Success: {result.status_code}")
        except RequestTimeout:
            print(" Caught: RequestTimeout")
        except HTTPError as e:
            print(f" Caught: HTTPError (status={e.status_code})")
        except NetworkError as e:
            print(f" Caught: NetworkError ({e})")
        except RetryExhausted as e:
            print(f" Caught: RetryExhausted (attempts={e.attempts})")


def invalid_urls():
    """Handle invalid URLs."""
    bad_urls = [
        "not-a-url",
        "javascript:alert(1)",
        "http://localhost/admin",
        "http://169.254.169.254/metadata",
    ]
    for url in bad_urls:
        try:
            es.scrape(url)
        except InvalidURLError as e:
            print(f" Blocked: {url[:30]}... ({e.reason})")


def graceful_degradation():
    """Continue scraping despite errors."""
    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/status/404",
        "https://httpbin.org/html",
        "https://httpbin.org/status/500",
        "https://httpbin.org/html",
    ]
    config = es.Config(max_retries=1)
    results = []
    errors = []
    for url in urls:
        try:
            result = es.scrape(url, config=config)
            if result.ok:
                results.append(result)
            else:
                errors.append((url, f"HTTP {result.status_code}"))
        except EasyScrapeError as e:
            errors.append((url, str(e)))
    print(f" Successful: {len(results)}")
    print(f" Failed: {len(errors)}")
    for url, error in errors:
        print(f" - {url}: {error[:50]}")


def main():
    print("=" * 60)
    print(" Error Handling Examples")
    print("=" * 60)

    print("\n1. Catch All Errors")
    print("-" * 40)
    catch_all_errors()

    print("\n2. Specific Error Types")
    print("-" * 40)
    specific_errors()

    print("\n3. Invalid URLs (Security)")
    print("-" * 40)
    invalid_urls()

    print("\n4. Graceful Degradation")
    print("-" * 40)
    graceful_degradation()


if __name__ == "__main__":
    main()