Merge pull request #8 from proxymesh/chore/add-usage-skill

proxymesh · web-flow · commit c42711759d21 · 2026-04-20T10:28:13.000-07:00
Add Scrapy 2.15 compatibility and usage skill
diff --git a/.agents/skills/scrapy-proxy-headers/SKILL.md b/.agents/skills/scrapy-proxy-headers/SKILL.md
@@ -0,0 +1,130 @@
+---
+name: scrapy-proxy-headers
+description: >-
+  Send and receive custom headers during HTTPS CONNECT tunneling in Scrapy.
+  Use when adding proxy headers to Scrapy spiders, configuring download handlers
+  for proxy header support, or reading proxy response headers like X-ProxyMesh-IP.
+---
+
+# scrapy-proxy-headers
+
+Send custom headers to proxies and receive proxy response headers in Scrapy.
+
+## Installation
+
+```bash
+pip install scrapy-proxy-headers
+```
+
+## Configuration
+
+Add the download handler in `settings.py`:
+
+```python
+DOWNLOAD_HANDLERS = {
+    "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
+}
+```
+
+Or in a spider's `custom_settings`:
+
+```python
+class MySpider(scrapy.Spider):
+    custom_settings = {
+        "DOWNLOAD_HANDLERS": {
+            "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
+        }
+    }
+```
+
+## Sending Proxy Headers
+
+Set `request.meta["proxy_headers"]` to a dict of headers to send to the proxy during the CONNECT request:
+
+```python
+import scrapy
+
+class MySpider(scrapy.Spider):
+    name = "example"
+    
+    def start_requests(self):
+        yield scrapy.Request(
+            url="https://api.ipify.org?format=json",
+            meta={
+                "proxy": "http://your-proxy:port",
+                "proxy_headers": {"X-ProxyMesh-Country": "US"}
+            }
+        )
+    
+    def parse(self, response):
+        proxy_ip = response.headers.get("X-ProxyMesh-IP")
+        self.logger.info(f"Proxy IP: {proxy_ip}")
+```
+
+## Receiving Proxy Response Headers
+
+Headers from the proxy's CONNECT response are merged into `response.headers`:
+
+```python
+def parse(self, response):
+    proxy_ip = response.headers.get(b"X-ProxyMesh-IP")
+    if proxy_ip:
+        print(f"Request made through IP: {proxy_ip.decode()}")
+```
+
+Note: Scrapy returns header values as `bytes`; call `.decode()` to get a `str`.
+
+## Complete Example
+
+```python
+import scrapy
+
+class ProxyHeadersSpider(scrapy.Spider):
+    name = "proxy_headers_demo"
+    
+    custom_settings = {
+        "DOWNLOAD_HANDLERS": {
+            "https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
+        }
+    }
+    
+    def start_requests(self):
+        yield scrapy.Request(
+            url="https://api.ipify.org?format=json",
+            meta={
+                "proxy": "http://us.proxymesh.com:31280",
+                "proxy_headers": {"X-ProxyMesh-Country": "US"}
+            },
+            callback=self.parse_ip
+        )
+    
+    def parse_ip(self, response):
+        data = response.json()
+        proxy_ip = response.headers.get(b"X-ProxyMesh-IP")
+        
+        yield {
+            "public_ip": data["ip"],
+            "proxy_ip": proxy_ip.decode() if proxy_ip else None
+        }
+```
+
+## Proxy Headers
+
+Custom headers sent during CONNECT are proxy-specific. Check your proxy provider's docs.
+
+Example with [ProxyMesh](https://proxymesh.com):
+
+| Header | Direction | Purpose |
+|--------|-----------|---------|
+| `X-ProxyMesh-Country` | Send | Route through specific country |
+| `X-ProxyMesh-IP` | Send/Receive | Send to request a sticky IP; received with the IP used for the request |
+
+## Testing
+
+```bash
+PROXY_URL=http://your-proxy:port python test_proxy_headers.py -v
+```
+
+## Documentation
+
+Full docs at [scrapy-proxy-headers.readthedocs.io](https://scrapy-proxy-headers.readthedocs.io/).
diff --git a/docs/index.rst b/docs/index.rst
@@ -195,7 +195,7 @@ The agent:
 **Methods:**
 
 * ``_get_agent(request, timeout)`` - Returns an agent configured with proxy headers from ``request.meta["proxy_headers"]``
-* ``_cb_bodydone(result, request, url)`` - Callback that merges proxy response headers into the final response
+* ``_cb_bodydone(result, *args)`` - Callback that merges proxy response headers into the final response (compatible with Scrapy 2.14 and 2.15+)
 
 TunnelingHeadersAgent
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,13 +4,16 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrapy-proxy-headers"
-version = "0.1.6"
+version = "0.2.0"
 authors = [
   { name="ProxyMesh", email="support@proxymesh.com" },
 ]
 description = "Add custom proxy headers to HTTPS requests in Scrapy"
 readme = "README.md"
 requires-python = ">=3.8"
+dependencies = [
+    "scrapy>=2.14.1",
+]
 classifiers = [
     "Programming Language :: Python :: 3",
     "Operating System :: OS Independent",
diff --git a/scrapy_proxy_headers/agent.py b/scrapy_proxy_headers/agent.py
@@ -103,6 +103,7 @@ class ScrapyProxyHeadersAgent(ScrapyAgent):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._agent = None
+        self.proxy_response_headers = None
     
     def _get_agent(self, request, timeout: float):
         self._agent = super()._get_agent(request, timeout)
@@ -116,13 +117,13 @@ def _get_agent(self, request, timeout: float):
         
         return self._agent
 
-    def _cb_bodydone(self, result, request, url: str):
-        r = super()._cb_bodydone(result, request, url)
+    def _cb_bodydone(self, result, *args):
+        # Scrapy 2.15+ changed signature from (result, request, url) to (result, url)
+        r = super()._cb_bodydone(result, *args)
         if isinstance(r, Response):
             if self._agent and hasattr(self._agent, '_endpoint'):
                 proxy_response_headers = getattr(self._agent._endpoint, '_proxy_response_headers', None)
                 if proxy_response_headers:
                     r.headers.update(proxy_response_headers)
-                    # save this for download handler
-                    r._proxy_response_headers = proxy_response_headers
+                    self.proxy_response_headers = proxy_response_headers
         return r
diff --git a/scrapy_proxy_headers/download_handler.py b/scrapy_proxy_headers/download_handler.py
@@ -1,13 +1,18 @@
 from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
 from scrapy_proxy_headers.agent import ScrapyProxyHeadersAgent
 
+
 class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._proxy_headers_by_proxy = {}
     
-    def download_request(self, request, spider):
+    def download_request(self, request, spider=None):
         """Return a deferred for the HTTP download"""
+        # Support both old Scrapy (spider param) and new Scrapy (self._crawler.spider)
+        if spider is None:
+            spider = self._crawler.spider
+        
         agent = ScrapyProxyHeadersAgent(
             contextFactory=self._contextFactory,
             pool=self._pool,
@@ -20,13 +25,12 @@ def download_request(self, request, spider):
         proxy = request.meta.get("proxy")
 
         if proxy:
-            # we need to do all this because the proxy tunnels can get re-used
-            # when that happens, the proxy headers are not available in subsequent responses
-            # so we need to save the proxy headers by the proxy, from the first tunnel response
-            # so we can add them to subsequent responses
+            # Proxy tunnels can get re-used; when that happens, proxy headers
+            # are not available in subsequent responses. Save proxy headers by
+            # proxy URL from the first tunnel response to add to later responses.
             def callback(response):
-                if hasattr(response, '_proxy_response_headers'):
-                    self._proxy_headers_by_proxy[proxy] = response._proxy_response_headers
+                if agent.proxy_response_headers:
+                    self._proxy_headers_by_proxy[proxy] = agent.proxy_response_headers
 
                 if proxy in self._proxy_headers_by_proxy:
                     response.headers.update(self._proxy_headers_by_proxy[proxy])