Skip to content

Commit c427117

Browse files
authored
Merge pull request #8 from proxymesh/chore/add-usage-skill
Add Scrapy 2.15 compatibility and usage skill
2 parents e45e111 + b527c53 commit c427117

5 files changed

Lines changed: 151 additions & 13 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
---
2+
name: scrapy-proxy-headers
3+
description: >-
4+
Send and receive custom headers during HTTPS CONNECT tunneling in Scrapy.
5+
Use when adding proxy headers to Scrapy spiders, configuring download handlers
6+
for proxy header support, or reading proxy response headers like X-ProxyMesh-IP.
7+
---
8+
9+
# scrapy-proxy-headers
10+
11+
Send custom headers to proxies and receive proxy response headers in Scrapy.
12+
13+
## Installation
14+
15+
```bash
16+
pip install scrapy-proxy-headers
17+
```
18+
19+
## Configuration
20+
21+
Add the download handler in `settings.py`:
22+
23+
```python
24+
DOWNLOAD_HANDLERS = {
25+
"https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
26+
}
27+
```
28+
29+
Or in a spider's `custom_settings`:
30+
31+
```python
32+
class MySpider(scrapy.Spider):
33+
custom_settings = {
34+
"DOWNLOAD_HANDLERS": {
35+
"https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
36+
}
37+
}
38+
```
39+
40+
## Sending Proxy Headers
41+
42+
Use `request.meta["proxy_headers"]`:
43+
44+
```python
45+
import scrapy
46+
47+
class MySpider(scrapy.Spider):
48+
name = "example"
49+
50+
def start_requests(self):
51+
yield scrapy.Request(
52+
url="https://api.ipify.org?format=json",
53+
meta={
54+
"proxy": "http://your-proxy:port",
55+
"proxy_headers": {"X-ProxyMesh-Country": "US"}
56+
}
57+
)
58+
59+
def parse(self, response):
60+
proxy_ip = response.headers.get("X-ProxyMesh-IP")
61+
self.logger.info(f"Proxy IP: {proxy_ip}")
62+
```
63+
64+
## Receiving Proxy Response Headers
65+
66+
Headers from the proxy's CONNECT response are merged into `response.headers`:
67+
68+
```python
69+
def parse(self, response):
70+
proxy_ip = response.headers.get(b"X-ProxyMesh-IP")
71+
if proxy_ip:
72+
print(f"Request made through IP: {proxy_ip.decode()}")
73+
```
74+
75+
Note: Header values are `bytes` in Scrapy; call `.decode()` to convert them to `str`.
76+
77+
## Complete Example
78+
79+
```python
80+
import scrapy
81+
82+
class ProxyHeadersSpider(scrapy.Spider):
83+
name = "proxy_headers_demo"
84+
85+
custom_settings = {
86+
"DOWNLOAD_HANDLERS": {
87+
"https": "scrapy_proxy_headers.HTTP11ProxyDownloadHandler"
88+
}
89+
}
90+
91+
def start_requests(self):
92+
yield scrapy.Request(
93+
url="https://api.ipify.org?format=json",
94+
meta={
95+
"proxy": "http://us.proxymesh.com:31280",
96+
"proxy_headers": {"X-ProxyMesh-Country": "US"}
97+
},
98+
callback=self.parse_ip
99+
)
100+
101+
def parse_ip(self, response):
102+
data = response.json()
103+
proxy_ip = response.headers.get(b"X-ProxyMesh-IP")
104+
105+
yield {
106+
"public_ip": data["ip"],
107+
"proxy_ip": proxy_ip.decode() if proxy_ip else None
108+
}
109+
```
110+
111+
## Proxy Headers
112+
113+
Custom headers sent during CONNECT are proxy-specific. Check your proxy provider's docs.
114+
115+
Example with [ProxyMesh](https://proxymesh.com):
116+
117+
| Header | Direction | Purpose |
118+
|--------|-----------|---------|
119+
| `X-ProxyMesh-Country` | Send | Route through specific country |
120+
| `X-ProxyMesh-IP` | Send/Receive | Request or receive sticky IP |
121+
122+
## Testing
123+
124+
```bash
125+
PROXY_URL=http://your-proxy:port python test_proxy_headers.py -v
126+
```
127+
128+
## Documentation
129+
130+
Full docs at [scrapy-proxy-headers.readthedocs.io](https://scrapy-proxy-headers.readthedocs.io/).

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ The agent:
195195
**Methods:**
196196

197197
* ``_get_agent(request, timeout)`` - Returns an agent configured with proxy headers from ``request.meta["proxy_headers"]``
198-
* ``_cb_bodydone(result, request, url)`` - Callback that merges proxy response headers into the final response
198+
* ``_cb_bodydone(result, *args)`` - Callback that merges proxy response headers into the final response (compatible with Scrapy 2.14 and 2.15+)
199199

200200
TunnelingHeadersAgent
201201
~~~~~~~~~~~~~~~~~~~~~

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "scrapy-proxy-headers"
7-
version = "0.1.6"
7+
version = "0.2.0"
88
authors = [
99
{ name="ProxyMesh", email="support@proxymesh.com" },
1010
]
1111
description = "Add custom proxy headers to HTTPS requests in Scrapy"
1212
readme = "README.md"
1313
requires-python = ">=3.8"
14+
dependencies = [
15+
"scrapy>=2.14.1",
16+
]
1417
classifiers = [
1518
"Programming Language :: Python :: 3",
1619
"Operating System :: OS Independent",

scrapy_proxy_headers/agent.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class ScrapyProxyHeadersAgent(ScrapyAgent):
103103
def __init__(self, *args, **kwargs):
104104
super().__init__(*args, **kwargs)
105105
self._agent = None
106+
self.proxy_response_headers = None
106107

107108
def _get_agent(self, request, timeout: float):
108109
self._agent = super()._get_agent(request, timeout)
@@ -116,13 +117,13 @@ def _get_agent(self, request, timeout: float):
116117

117118
return self._agent
118119

119-
def _cb_bodydone(self, result, request, url: str):
120-
r = super()._cb_bodydone(result, request, url)
120+
def _cb_bodydone(self, result, *args):
121+
# Scrapy 2.15+ changed signature from (result, request, url) to (result, url)
122+
r = super()._cb_bodydone(result, *args)
121123
if isinstance(r, Response):
122124
if self._agent and hasattr(self._agent, '_endpoint'):
123125
proxy_response_headers = getattr(self._agent._endpoint, '_proxy_response_headers', None)
124126
if proxy_response_headers:
125127
r.headers.update(proxy_response_headers)
126-
# save this for download handler
127-
r._proxy_response_headers = proxy_response_headers
128+
self.proxy_response_headers = proxy_response_headers
128129
return r

scrapy_proxy_headers/download_handler.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
22
from scrapy_proxy_headers.agent import ScrapyProxyHeadersAgent
33

4+
45
class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
56
def __init__(self, *args, **kwargs):
67
super().__init__(*args, **kwargs)
78
self._proxy_headers_by_proxy = {}
89

9-
def download_request(self, request, spider):
10+
def download_request(self, request, spider=None):
1011
"""Return a deferred for the HTTP download"""
12+
# Support both old Scrapy (spider param) and new Scrapy (self._crawler.spider)
13+
if spider is None:
14+
spider = self._crawler.spider
15+
1116
agent = ScrapyProxyHeadersAgent(
1217
contextFactory=self._contextFactory,
1318
pool=self._pool,
@@ -20,13 +25,12 @@ def download_request(self, request, spider):
2025
proxy = request.meta.get("proxy")
2126

2227
if proxy:
23-
# we need to do all this because the proxy tunnels can get re-used
24-
# when that happens, the proxy headers are not available in subsequent responses
25-
# so we need to save the proxy headers by the proxy, from the first tunnel response
26-
# so we can add them to subsequent responses
28+
# Proxy tunnels can get re-used; when that happens, proxy headers
29+
# are not available in subsequent responses. Save proxy headers by
30+
# proxy URL from the first tunnel response to add to later responses.
2731
def callback(response):
28-
if hasattr(response, '_proxy_response_headers'):
29-
self._proxy_headers_by_proxy[proxy] = response._proxy_response_headers
32+
if agent.proxy_response_headers:
33+
self._proxy_headers_by_proxy[proxy] = agent.proxy_response_headers
3034

3135
if proxy in self._proxy_headers_by_proxy:
3236
response.headers.update(self._proxy_headers_by_proxy[proxy])

0 commit comments

Comments
 (0)