scrapy-proxy-headers/test_proxy_headers.py at main · proxymesh/scrapy-proxy-headers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
"""
Test harness for scrapy-proxy-headers.

Tests that the extension can send custom headers to a proxy and receive
response headers from the proxy's CONNECT response.

Environment Variables:
    PROXY_URL       - Proxy URL (also checks HTTPS_PROXY). Required.
    TEST_URL        - URL to request (default: https://api.ipify.org?format=json)
    PROXY_HEADER    - Response header to check for (default: X-ProxyMesh-IP)
    SEND_PROXY_HEADER - Optional header name to send to the proxy
                        (only sent when SEND_PROXY_VALUE is also set)
    SEND_PROXY_VALUE  - Optional value for that header
                        (only sent when SEND_PROXY_HEADER is also set)

Usage:
    PROXY_URL=http://your-proxy:port python test_proxy_headers.py
    PROXY_URL=http://your-proxy:port python test_proxy_headers.py -v
"""

import os
import sys
import argparse

# Scrapy imports
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class ProxyHeaderTestSpider(scrapy.Spider):
    """Spider that tests proxy header functionality.

    Issues a single request for ``test_url`` through ``proxy_url`` and records
    whether the response carries the ``proxy_header`` header. The outcome is
    stored on the instance (``test_passed``, ``header_value``,
    ``error_message``) rather than yielded as items.
    """

    name = "proxy_header_test"

    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
        },
        # Allow non-2xx so parse() runs: we only assert proxy tunnel headers (e.g.
        # X-ProxyMesh-IP), and the test URL may return 403/429 from CI egress.
        "HTTPERROR_ALLOW_ALL": True,
        "LOG_LEVEL": "WARNING",
        "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
    }

    def __init__(self, proxy_url, test_url, proxy_header,
                 send_header=None, send_value=None, verbose=False, *args, **kwargs):
        """Store the test configuration and reset the result fields.

        Args:
            proxy_url: Proxy URL placed in the request's ``meta["proxy"]``.
            test_url: URL to request through the proxy.
            proxy_header: Name of the response header whose presence is checked.
            send_header: Optional header name to send to the proxy.
            send_value: Optional value for ``send_header``; the header is only
                sent when both are truthy.
            verbose: Stored on the instance; not read by the spider itself.
            *args, **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.proxy_url = proxy_url
        self.test_url = test_url
        self.proxy_header = proxy_header
        self.send_header = send_header
        self.send_value = send_value
        self.verbose = verbose
        # Result fields, filled in by parse() or handle_error().
        self.test_passed = False
        self.header_value = None
        self.error_message = None

    def start_requests(self):
        """Yield the single test request, routed through the configured proxy."""
        meta = {"proxy": self.proxy_url}

        # Add custom proxy headers if specified
        if self.send_header and self.send_value:
            meta["proxy_headers"] = {self.send_header: self.send_value}

        yield scrapy.Request(
            url=self.test_url,
            meta=meta,
            callback=self.parse,
            errback=self.handle_error
        )

    def parse(self, response):
        """Record whether the expected proxy header is present in ``response``."""
        # Check for the expected proxy header
        header_value = response.headers.get(self.proxy_header.encode())

        if header_value:
            self.test_passed = True
            if isinstance(header_value, bytes):
                # Header bytes are not guaranteed to be valid UTF-8; replace
                # undecodable bytes instead of raising after the pass has
                # already been recorded.
                header_value = header_value.decode("utf-8", errors="replace")
            self.header_value = header_value
        else:
            self.test_passed = False
            self.error_message = f"Header '{self.proxy_header}' not found in response"

    def handle_error(self, failure):
        """Record a failed request; ``failure.value`` is the raised exception."""
        self.test_passed = False
        # Exceptions raised without arguments stringify to "", which would
        # produce a blank failure report; fall back to the repr (type name).
        self.error_message = str(failure.value) or repr(failure.value)


def _redact_proxy_url(proxy_url: str) -> str:
    """Return *proxy_url* with the password in its userinfo replaced by ``***``.

    Only URLs of the form ``scheme://user:password@host...`` are rewritten.
    URLs without a password, and strings that cannot be parsed, are returned
    unchanged.
    """
    # Imported here so the helper carries its own dependency.
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(proxy_url)
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets); nothing to parse.
        return proxy_url

    userinfo, at, hostinfo = parts.netloc.rpartition("@")
    user, colon, _password = userinfo.partition(":")
    if not at or not colon:
        return proxy_url
    return urlunsplit(parts._replace(netloc=f"{user}:***@{hostinfo}"))


def run_test(proxy_url: str, test_url: str, proxy_header: str,
             send_header: str = None, send_value: str = None,
             verbose: bool = False) -> bool:
    """
    Run the proxy header test.

    Prints the test configuration, runs ProxyHeaderTestSpider in a
    CrawlerProcess, then prints a ``[PASS]``/``[FAIL]`` line based on the
    result fields the spider recorded.

    Args:
        proxy_url: Proxy to route the request through. Any password embedded
            in the URL is masked in the printed configuration.
        test_url: URL requested through the proxy.
        proxy_header: Response header whose presence decides pass/fail.
        send_header: Optional header name to send to the proxy.
        send_value: Optional value for ``send_header``.
        verbose: When true, include the received header value in the output.

    Returns True if test passed, False otherwise.
    """
    # Print test configuration
    print("Testing scrapy-proxy-headers")
    print("=" * 28)
    # Proxy URLs commonly embed credentials; never echo the password to logs.
    print(f"Proxy URL: {_redact_proxy_url(proxy_url)}")
    print(f"Test URL: {test_url}")
    print(f"Checking for header: {proxy_header}")

    if send_header and send_value:
        print(f"Sending header: {send_header}: {send_value}")

    print()

    # Create and run the spider
    process = CrawlerProcess(settings={
        "LOG_ENABLED": False,
    })

    # Store spider instance to check results
    spider_instance = None

    def store_spider(spider):
        nonlocal spider_instance
        spider_instance = spider

    crawler = process.create_crawler(ProxyHeaderTestSpider)
    crawler.signals.connect(store_spider, signal=scrapy.signals.spider_opened)

    process.crawl(
        crawler,
        proxy_url=proxy_url,
        test_url=test_url,
        proxy_header=proxy_header,
        send_header=send_header,
        send_value=send_value,
        verbose=verbose
    )

    process.start()

    # Check results
    if spider_instance is None:
        print("[FAIL] Spider did not start")
        return False

    if spider_instance.test_passed:
        if verbose and spider_instance.header_value:
            print(f"[PASS] Received header {proxy_header}: {spider_instance.header_value}")
        else:
            print(f"[PASS] Received header {proxy_header}")
        return True

    # Neither callback may have run (so no message was recorded); avoid
    # printing a bare "None" in that case.
    reason = spider_instance.error_message or "No response or error was recorded for the request"
    print(f"[FAIL] {reason}")
    return False


def main():
    """Command-line entry point.

    Parses the ``-v/--verbose`` flag, reads the test configuration from the
    environment, runs the test and exits with status 0 on success or 1 on
    failure (including a missing proxy URL).
    """
    cli = argparse.ArgumentParser(
        description="Test proxy header functionality with scrapy-proxy-headers"
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Show header values in output",
    )
    options = cli.parse_args()

    env = os.environ

    # PROXY_URL takes precedence; HTTPS_PROXY is the fallback.
    proxy_url = env.get("PROXY_URL") or env.get("HTTPS_PROXY")
    if not proxy_url:
        print("Error: PROXY_URL or HTTPS_PROXY environment variable required")
        sys.exit(1)

    # Remaining settings are optional and come straight from the environment.
    passed = run_test(
        proxy_url=proxy_url,
        test_url=env.get("TEST_URL", "https://api.ipify.org?format=json"),
        proxy_header=env.get("PROXY_HEADER", "X-ProxyMesh-IP"),
        send_header=env.get("SEND_PROXY_HEADER"),
        send_value=env.get("SEND_PROXY_VALUE"),
        verbose=options.verbose,
    )

    # Exit status: 0 when the test passed, 1 otherwise.
    sys.exit(int(not passed))


# Run the harness only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()