Skip to content

Commit f5682da

Browse files
committed
Resolve some type ignores
1 parent a55a78f commit f5682da

File tree

12 files changed

+107
-49
lines changed

12 files changed

+107
-49
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# Cache
22
__pycache__
3-
.uv_cache
43
.pytest_cache
54
.ruff_cache
5+
.ty_cache
66
.uv-cache
77

88
# Virtual envs

src/crawlee/_utils/recurring_task.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ class RecurringTask:
2525
"""
2626

2727
def __init__(self, func: Callable, delay: timedelta) -> None:
28-
logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...') # ty: ignore[unresolved-attribute]
28+
logger.debug(
29+
'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
30+
func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
31+
delay,
32+
)
2933
self.func = func
3034
self.delay = delay
3135
self.task: asyncio.Task | None = None
@@ -55,7 +59,11 @@ async def _wrapper(self) -> None:
5559

5660
def start(self) -> None:
5761
"""Start the recurring task execution."""
58-
self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}') # ty: ignore[possibly-missing-attribute]
62+
name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
63+
self.task = asyncio.create_task(
64+
self._wrapper(),
65+
name=f'Task-recurring-{name}',
66+
)
5967

6068
async def stop(self) -> None:
6169
"""Stop the recurring task execution."""

src/crawlee/_utils/sitemap.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -430,10 +430,17 @@ async def parse_sitemap(
430430
up to the specified maximum depth.
431431
"""
432432
# Set default options
433-
options = options or {} # ty: ignore[invalid-assignment]
434-
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) # ty: ignore[possibly-missing-attribute]
435-
max_depth = options.get('max_depth', float('inf')) # ty: ignore[possibly-missing-attribute]
436-
sitemap_retries = options.get('sitemap_retries', 3) # ty: ignore[possibly-missing-attribute]
433+
default_timeout = timedelta(seconds=30)
434+
if options:
435+
emit_nested_sitemaps = options['emit_nested_sitemaps']
436+
max_depth = options['max_depth']
437+
sitemap_retries = options['sitemap_retries']
438+
timeout = options.get('timeout', default_timeout)
439+
else:
440+
emit_nested_sitemaps = False
441+
max_depth = float('inf')
442+
sitemap_retries = 3
443+
timeout = default_timeout
437444

438445
# Setup working state
439446
sources = list(initial_sources)
@@ -472,7 +479,7 @@ async def parse_sitemap(
472479
sitemap_retries,
473480
emit_nested_sitemaps=emit_nested_sitemaps,
474481
proxy_info=proxy_info,
475-
timeout=options.get('timeout', timedelta(seconds=30)), # ty: ignore[possibly-missing-attribute]
482+
timeout=timeout,
476483
):
477484
yield result
478485
else:

src/crawlee/browsers/_browser_pool.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,11 +138,11 @@ def with_default_plugin(
138138
kwargs: Additional arguments for default constructor.
139139
"""
140140
plugin_options: dict = defaultdict(dict)
141-
plugin_options['browser_launch_options'] = browser_launch_options or {}
141+
plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
142142
plugin_options['browser_new_context_options'] = browser_new_context_options or {}
143143

144144
if headless is not None:
145-
plugin_options['browser_launch_options']['headless'] = headless # ty: ignore[invalid-assignment]
145+
plugin_options['browser_launch_options']['headless'] = headless
146146

147147
if use_incognito_pages is not None:
148148
plugin_options['use_incognito_pages'] = use_incognito_pages

src/crawlee/browsers/_playwright_browser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ async def new_context(self, **context_options: Any) -> BrowserContext:
7878

7979
async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
8080
if self._temp_dir and self._temp_dir.exists():
81-
await asyncio.to_thread(lambda: shutil.rmtree(self._temp_dir, ignore_errors=True)) # ty: ignore[invalid-argument-type]
81+
temp_dir = self._temp_dir
82+
await asyncio.to_thread(lambda: shutil.rmtree(temp_dir, ignore_errors=True))
8283

8384
@override
8485
async def close(self, **kwargs: Any) -> None:

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,31 +27,23 @@
2727
)
2828
from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
2929
from crawlee.crawlers._parsel._parsel_parser import ParselParser
30+
from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
3031
from crawlee.statistics import Statistics, StatisticsState
3132

32-
from ._adaptive_playwright_crawler_statistics import (
33-
AdaptivePlaywrightCrawlerStatisticState,
34-
)
33+
from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
3534
from ._adaptive_playwright_crawling_context import (
3635
AdaptivePlaywrightCrawlingContext,
3736
AdaptivePlaywrightPreNavCrawlingContext,
3837
)
39-
from ._rendering_type_predictor import (
40-
DefaultRenderingTypePredictor,
41-
RenderingType,
42-
RenderingTypePredictor,
43-
)
44-
from ._result_comparator import (
45-
create_default_comparator,
46-
)
38+
from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
39+
from ._result_comparator import create_default_comparator
4740

4841
if TYPE_CHECKING:
4942
from types import TracebackType
5043

5144
from typing_extensions import Unpack
5245

5346
from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
54-
from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
5547

5648

5749
TStaticParseResult = TypeVar('TStaticParseResult')
@@ -162,7 +154,7 @@ def __init__(
162154
super().__init__(statistics=adaptive_statistics, **kwargs)
163155

164156
# Sub crawlers related.
165-
playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {} # ty: ignore[invalid-assignment]
157+
playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()
166158

167159
# Each sub crawler will use custom logger .
168160
static_logger = getLogger('Subcrawler_static')
@@ -183,7 +175,7 @@ def __init__(
183175
)
184176
playwright_crawler = PlaywrightCrawler(
185177
statistics=_NonPersistentStatistics(),
186-
**playwright_crawler_specific_kwargs, # ty: ignore[invalid-argument-type]
178+
**playwright_crawler_specific_kwargs,
187179
**basic_crawler_kwargs_for_pw_crawler,
188180
)
189181

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,7 @@
1414

1515
from crawlee import service_locator
1616
from crawlee._request import Request, RequestOptions, RequestState
17-
from crawlee._types import (
18-
BasicCrawlingContext,
19-
ConcurrencySettings,
20-
)
17+
from crawlee._types import BasicCrawlingContext, ConcurrencySettings
2118
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
2219
from crawlee._utils.docs import docs_group
2320
from crawlee._utils.robots import RobotsTxtFile
@@ -177,13 +174,12 @@ def __init__(
177174
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
178175
else:
179176
if fingerprint_generator == 'default':
180-
if not browser_type:
181-
generator_browser_type = None
182-
else:
183-
generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
177+
generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
178+
[fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
179+
)
184180

185181
fingerprint_generator = DefaultFingerprintGenerator(
186-
header_options=HeaderGeneratorOptions(browsers=generator_browser_type) # ty: ignore[invalid-argument-type]
182+
header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
187183
)
188184

189185
browser_pool = BrowserPool.with_default_plugin(
@@ -516,6 +512,7 @@ async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]:
516512

517513
async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
518514
"""Update the cookies in the page context."""
515+
# False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
519516
await page.context.add_cookies([{**cookie} for cookie in cookies]) # ty: ignore[invalid-argument-type]
520517

521518
async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:

src/crawlee/events/_event_manager.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ async def listener_wrapper(event_data: EventData) -> None:
178178
else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
179179
)
180180

181-
listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}') # ty: ignore[invalid-argument-type, unresolved-attribute]
181+
listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
182+
listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
182183
self._listener_tasks.add(listener_task)
183184

184185
try:
@@ -189,7 +190,12 @@ async def listener_wrapper(event_data: EventData) -> None:
189190
# We need to swallow the exception and just log it here, otherwise it could break the event emitter
190191
logger.exception(
191192
'Exception in the event listener',
192-
extra={'event_name': event.value, 'listener_name': listener.__name__}, # ty: ignore[unresolved-attribute]
193+
extra={
194+
'event_name': event.value,
195+
'listener_name': listener.__name__
196+
if hasattr(listener, '__name__')
197+
else listener.__class__.__name__,
198+
},
193199
)
194200
finally:
195201
logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')

src/crawlee/http_clients/_curl_impersonate.py

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
import asyncio
44
from contextlib import asynccontextmanager
5-
from typing import TYPE_CHECKING, Any
5+
from http.cookiejar import Cookie
6+
from typing import TYPE_CHECKING, Any, cast
67

78
from curl_cffi import CurlInfo
89
from curl_cffi.const import CurlHttpVersion
@@ -15,7 +16,7 @@
1516
from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
1617
from typing_extensions import override
1718

18-
from crawlee._types import HttpHeaders, HttpPayload
19+
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
1920
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
2021
from crawlee._utils.docs import docs_group
2122
from crawlee.errors import ProxyError
@@ -24,11 +25,11 @@
2425
if TYPE_CHECKING:
2526
from collections.abc import AsyncGenerator
2627
from datetime import timedelta
27-
from http.cookiejar import Cookie
2828

2929
from curl_cffi import Curl
3030
from curl_cffi.requests import Request as CurlRequest
3131
from curl_cffi.requests import Response
32+
from curl_cffi.requests.session import HttpMethod as CurlHttpMethod
3233

3334
from crawlee import Request
3435
from crawlee._types import HttpMethod
@@ -90,13 +91,15 @@ def headers(self) -> HttpHeaders:
9091
async def read(self) -> bytes:
9192
if self._response.astream_task:
9293
raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
94+
9395
return self._response.content
9496

9597
async def read_stream(self) -> AsyncGenerator[bytes, None]:
96-
if not self._response.astream_task or self._response.astream_task.done(): # ty: ignore[possibly-missing-attribute]
97-
raise RuntimeError(
98-
'Cannot read stream: either already consumed or Response not obtained from `stream` method'
99-
)
98+
if not self._response.astream_task:
99+
raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')
100+
101+
if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():
102+
raise RuntimeError('Cannot read stream, it was already consumed.')
100103

101104
async for chunk in self._response.aiter_content():
102105
yield chunk
@@ -156,7 +159,7 @@ async def crawl(
156159
try:
157160
response = await client.request(
158161
url=request.url,
159-
method=request.method.upper(), # ty: ignore[invalid-argument-type]
162+
method=self._convert_method(request.method),
160163
headers=request.headers,
161164
data=request.payload,
162165
cookies=session.cookies.jar if session else None,
@@ -203,7 +206,7 @@ async def send_request(
203206
try:
204207
response = await client.request(
205208
url=url,
206-
method=method.upper(), # ty: ignore[invalid-argument-type]
209+
method=self._convert_method(method),
207210
headers=dict(headers) if headers else None,
208211
data=payload,
209212
cookies=session.cookies.jar if session else None,
@@ -244,7 +247,7 @@ async def stream(
244247
try:
245248
response = await client.request(
246249
url=url,
247-
method=method.upper(), # ty: ignore[invalid-argument-type]
250+
method=self._convert_method(method),
248251
headers=dict(headers) if headers else None,
249252
data=payload,
250253
cookies=session.cookies.jar if session else None,
@@ -291,6 +294,40 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
291294

292295
return self._client_by_proxy_url[proxy_url]
293296

297+
def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
298+
"""Convert from Crawlee HTTP method to curl-cffi HTTP method.
299+
300+
Args:
301+
method: Crawlee HTTP method.
302+
303+
Returns:
304+
Corresponding curl-cffi HTTP method.
305+
306+
Raises:
307+
ValueError: If the provided HTTP method is not supported.
308+
"""
309+
method_upper = method.upper() # curl-cffi requires uppercase methods
310+
311+
match method_upper:
312+
case 'GET':
313+
return 'GET'
314+
case 'POST':
315+
return 'POST'
316+
case 'PUT':
317+
return 'PUT'
318+
case 'DELETE':
319+
return 'DELETE'
320+
case 'OPTIONS':
321+
return 'OPTIONS'
322+
case 'HEAD':
323+
return 'HEAD'
324+
case 'TRACE':
325+
return 'TRACE'
326+
case 'PATCH':
327+
return 'PATCH'
328+
case _:
329+
raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')
330+
294331
@staticmethod
295332
def _is_proxy_error(error: CurlRequestError) -> bool:
296333
"""Determine whether the given error is related to a proxy issue.
@@ -308,11 +345,16 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
308345

309346
@staticmethod
310347
def _get_cookies(curl: Curl) -> list[Cookie]:
311-
cookies: list[Cookie] = []
312-
for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST): # ty: ignore[not-iterable]
313-
curl_morsel = CurlMorsel.from_curl_format(curl_cookie) # ty: ignore[invalid-argument-type]
348+
cookies = list[Cookie]()
349+
350+
# Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.
351+
cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))
352+
353+
for curl_cookie in cookie_list:
354+
curl_morsel = CurlMorsel.from_curl_format(curl_cookie)
314355
cookie = curl_morsel.to_cookiejar_cookie()
315356
cookies.append(cookie)
357+
316358
return cookies
317359

318360
async def cleanup(self) -> None:

src/crawlee/storage_clients/_redis/_dataset_client.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,14 @@ async def get_data(
179179
case (True, int(), None):
180180
json_path += f'[:-{offset}]'
181181
case (True, int(), int()):
182+
# ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
182183
json_path += f'[-{offset + limit}:-{offset}]' # ty: ignore[unsupported-operator]
183184
case (False, 0, int()):
184185
json_path += f'[:{limit}]'
185186
case (False, int(), None):
186187
json_path += f'[{offset}:]'
187188
case (False, int(), int()):
189+
# ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
188190
json_path += f'[{offset}:{offset + limit}]' # ty: ignore[unsupported-operator]
189191

190192
if json_path == '$':

0 commit comments

Comments (0)