-
Notifications
You must be signed in to change notification settings - Fork 741
Expand file tree
/
Copy pathtest_tiers.py
More file actions
192 lines (141 loc) · 7.09 KB
/
test_tiers.py
File metadata and controls
192 lines (141 loc) · 7.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
from __future__ import annotations
from yarl import URL
from crawlee import Request
from crawlee.proxy_configuration import ProxyConfiguration, _ProxyTierTracker
async def test_rotates_proxies_uniformly_with_no_request() -> None:
    """Check that, with no request given, proxies come back in tier order and then wrap around."""
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111', 'http://proxy:2222'],
        ['http://proxy:3333', 'http://proxy:4444'],
    ]
    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # Expected order: every URL of tier 0, then every URL of tier 1, then the first URL again (wrap-around).
    expected_urls = [url for tier in tiered_proxy_urls for url in tier]
    expected_urls.append(tiered_proxy_urls[0][0])

    for expected_url in expected_urls:
        proxy_info = await config.new_proxy_info(None, None, None)
        assert proxy_info is not None
        assert proxy_info.url == expected_url
async def test_retrying_request_makes_tier_go_up() -> None:
    """Check that repeated lookups for one request climb the tiers and that the domain keeps the reached tier."""
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111'],
        ['http://proxy:2222'],
        ['http://proxy:3333'],
        ['http://proxy:4444'],
    ]
    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # Asking for a proxy again with the very same request most probably means a retry, so each
    # consecutive call is expected to return the proxy of the next higher tier.
    retried_request = Request(url='http://some.domain/abc', unique_key='1')
    for expected_tier in range(3):
        proxy_info = await config.new_proxy_info(None, retried_request, None)
        assert proxy_info is not None
        assert proxy_info.url == tiered_proxy_urls[expected_tier][0]

    # A different request to the same domain should be served from the tier reached above.
    sibling_request = Request(url='http://some.domain/xyz', unique_key='2')
    proxy_info = await config.new_proxy_info(None, sibling_request, None)
    assert proxy_info is not None
    assert proxy_info.url == tiered_proxy_urls[2][0]
async def test_retrying_request_makes_tier_go_up_with_sessions() -> None:
    """Check that a session keeps its proxy across retries while a new session gets the raised tier."""
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111'],
        ['http://proxy:2222'],
        ['http://proxy:3333'],
        ['http://proxy:4444'],
    ]
    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)
    request = Request(url='http://some.domain/abc', unique_key='1')

    # Each entry is (session id, tier index expected on every call, number of calls).
    # Repeated calls with the same request likely mean it is being retried, yet a single session
    # should always receive the same proxy. A new session then gets a proxy from the
    # corresponding (raised) tier and keeps it as well.
    scenarios = [
        ('session_id', 0, 3),
        ('session_id2', 3, 2),
    ]

    for session_id, expected_tier, call_count in scenarios:
        for _ in range(call_count):
            proxy_info = await config.new_proxy_info(session_id, request, None)
            assert proxy_info is not None
            assert proxy_info.url == tiered_proxy_urls[expected_tier][0]
async def test_successful_request_makes_tier_go_down() -> None:
    """Check that the tier rises on retries of one request and drops again for fresh requests.

    Asking for a proxy over and over for a single request pushes the proxy tier up, because
    ProxyConfiguration assumes those calls are retries. Asking for proxies for different requests
    to the same domain afterwards brings the tier back down.
    """
    tiered_proxy_urls: list[list[str | None]] = [
        ['http://proxy:1111'],
        ['http://proxy:2222'],
        ['http://proxy:3333'],
        ['http://proxy:4444'],
    ]
    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)
    retried_request = Request(url='http://some.domain/abc', unique_key='1')

    # Phase 1: one call per tier with the same request; each call should return that tier's proxy.
    proxy_info = None
    for expected_tier in tiered_proxy_urls:
        proxy_info = await config.new_proxy_info(None, retried_request, None)
        assert proxy_info is not None
        assert proxy_info.url == expected_tier[0]

    # Phase 2: many distinct requests to the same domain. Only the proxy handed out for the
    # last one is inspected after the loop.
    for i in range(100):
        fresh_request = Request(url=f'http://some.domain/{i}', unique_key=str(i))
        proxy_info = await config.new_proxy_info(None, fresh_request, None)

    assert proxy_info is not None
    assert proxy_info.url == tiered_proxy_urls[0][0]
async def test_none_proxy_retrying_request_makes_tier_go_up() -> None:
    """Check that retrying a request moves from a `None` (no proxy) tier up to the next tier's proxy."""
    tiered_proxy_urls: list[list[str | None]] = [
        [None],
        ['http://proxy:1111'],
    ]
    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # Calling `new_proxy_info` with the same request most probably means it's being retried
    request_1 = Request(url='http://some.domain/abc', unique_key='1')

    # No proxy used.
    info = await config.new_proxy_info(None, request_1, None)
    assert info is None, (
        'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'
    )

    # Proxy should go up one tier for same request that was already sent before.
    info = await config.new_proxy_info(None, request_1, None)
    assert info is not None, (
        'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.'
    )
    assert info.url == tiered_proxy_urls[1][0]
async def test_none_proxy_rotates_proxies_uniformly_with_no_request() -> None:
    """Check that a `None` entry takes part in rotation within a tier like any other proxy URL."""
    # Annotated like the sibling tests so the mixed `None`/`str` inner list is typed explicitly.
    tiered_proxy_urls: list[list[str | None]] = [
        [None, 'http://proxy:1111'],
    ]
    config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)

    # No proxy used.
    info = await config.new_proxy_info(None, None, None)
    assert info is None, (
        'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'
    )

    # Proxy should be rotated on the same proxy tier for a new request.
    info = await config.new_proxy_info(None, None, None)
    assert info is not None, (
        'config.new_proxy_info is expected to generate non-none proxy info from non-none tiered_proxy_urls.'
    )
    assert info.url == tiered_proxy_urls[0][1]

    # Proxy rotation starts from the beginning of the proxy list after last proxy in tier was used.
    # No proxy used again.
    info = await config.new_proxy_info(None, None, None)
    assert info is None, (
        'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'
    )
def test_predict_tier_bounds_with_single_tier() -> None:
    """A tracker built with exactly one tier must keep predicting tier 0."""
    single_tier_tracker = _ProxyTierTracker([[URL('http://proxy:1111')]])
    single_tier_tracker.add_error('example.com', 0)

    # Every `predict_tier` call mutates internal state (decaying histogram, potentially shifting tiers).
    # The error score starts at 10 and decays by 1 per call, so 20 iterations cover the full decay
    # to zero and beyond.
    for _ in range(20):
        assert single_tier_tracker.predict_tier('example.com') == 0