Skip to content

Commit 8b883d2

Browse files
authored
Threaded loop updates & fixes (#361)
* Close thread loop adapter from the correct thread * Handle futures in threaded loop * Remove assert * Do not stop threaded loop adapter for each test method * Update crashed browser tests to run on Windows * Tests: maybe launch in thread * CI: also run pinned dependency envs on Windows * Do not return from async tests * _maybe_future_from_coro * Add missing type hints * Add reruns=2 to all tests * Adjust expected failure message
1 parent 8cb31c1 commit 8b883d2

8 files changed

Lines changed: 89 additions & 37 deletions

File tree

.github/workflows/tests.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ jobs:
3939
- name: Run twisted tests
4040
run: tox -e py-twisted
4141

42+
- name: Run pinned dependency tests (Windows, asyncio tests)
43+
if: runner.os == 'Windows'
44+
run: tox -e py-pinned
45+
46+
- name: Run pinned dependency tests (Windows, Twisted tests)
47+
if: runner.os == 'Windows'
48+
run: tox -e py-pinned-twisted
49+
4250
- name: Upload coverage report (Linux)
4351
if: runner.os == 'Linux'
4452
env:

pylintrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ disable=
99
too-few-public-methods,
1010
too-many-arguments,
1111
too-many-instance-attributes,
12+
too-many-lines,
1213
# tests
1314
duplicate-code,
1415
import-outside-toplevel,

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
[tool.black]
22
line-length = 99
3+
4+
[tool.pytest.ini_options]
5+
reruns = "2"

scrapy_playwright/_utils.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import platform
44
import threading
5+
from dataclasses import dataclass
56
from typing import Awaitable, Dict, Iterator, Optional, Tuple, Union
67

78
import scrapy
@@ -104,6 +105,13 @@ async def _get_header_value(
104105
return None
105106

106107

108+
@dataclass
109+
class _QueueItem:
110+
coro: Awaitable
111+
promise: Deferred | asyncio.Future
112+
loop: asyncio.AbstractEventLoop | None = None
113+
114+
107115
class _ThreadedLoopAdapter:
108116
"""Utility class to start an asyncio event loop in a new thread and redirect coroutines.
109117
This allows to run Playwright in a different loop than the Scrapy crawler, allowing to
@@ -116,32 +124,60 @@ class _ThreadedLoopAdapter:
116124
_stop_events: Dict[int, asyncio.Event] = {}
117125

118126
@classmethod
119-
async def _handle_coro(cls, coro: Awaitable, dfd: Deferred) -> None:
127+
async def _handle_coro_deferred(cls, queue_item: _QueueItem) -> None:
120128
from twisted.internet import reactor
121129

130+
dfd: Deferred = queue_item.promise
131+
122132
try:
123-
result = await coro
133+
result = await queue_item.coro
124134
except Exception as exc:
125135
reactor.callFromThread(dfd.errback, failure.Failure(exc))
126136
else:
127137
reactor.callFromThread(dfd.callback, result)
128138

139+
@classmethod
140+
async def _handle_coro_future(cls, queue_item: _QueueItem) -> None:
141+
future: asyncio.Future = queue_item.promise
142+
loop: asyncio.AbstractEventLoop = queue_item.loop # type: ignore[assignment]
143+
try:
144+
result = await queue_item.coro
145+
except Exception as exc:
146+
loop.call_soon_threadsafe(future.set_exception, exc)
147+
else:
148+
loop.call_soon_threadsafe(future.set_result, result)
149+
129150
@classmethod
130151
async def _process_queue(cls) -> None:
131152
while any(not ev.is_set() for ev in cls._stop_events.values()):
132-
coro, dfd = await cls._coro_queue.get()
133-
asyncio.create_task(cls._handle_coro(coro, dfd))
153+
queue_item = await cls._coro_queue.get()
154+
if isinstance(queue_item.promise, asyncio.Future):
155+
asyncio.create_task(cls._handle_coro_future(queue_item))
156+
elif isinstance(queue_item.promise, Deferred):
157+
asyncio.create_task(cls._handle_coro_deferred(queue_item))
134158
cls._coro_queue.task_done()
135159

136160
@classmethod
137-
def _deferred_from_coro(cls, coro) -> Deferred:
161+
def _deferred_from_coro(cls, coro: Awaitable) -> Deferred:
138162
dfd: Deferred = Deferred()
139-
asyncio.run_coroutine_threadsafe(cls._coro_queue.put((coro, dfd)), cls._loop)
163+
queue_item = _QueueItem(coro=coro, promise=dfd)
164+
asyncio.run_coroutine_threadsafe(cls._coro_queue.put(queue_item), cls._loop)
140165
return dfd
141166

142167
@classmethod
143-
def start(cls, caller_id: int) -> None:
144-
cls._stop_events[caller_id] = asyncio.Event()
168+
def _future_from_coro(cls, coro: Awaitable) -> asyncio.Future:
169+
target_loop = asyncio.get_running_loop() # Scrapy thread loop
170+
future: asyncio.Future = asyncio.Future()
171+
queue_item = _QueueItem(coro=coro, promise=future, loop=target_loop)
172+
asyncio.run_coroutine_threadsafe(cls._coro_queue.put(queue_item), cls._loop)
173+
return future
174+
175+
@classmethod
176+
def start(cls, download_handler_id: int) -> None:
177+
"""Start the event loop in a new thread if not already started.
178+
Should be called from the Scrapy thread.
179+
"""
180+
cls._stop_events[download_handler_id] = asyncio.Event()
145181
if not getattr(cls, "_loop", None):
146182
policy = asyncio.DefaultEventLoopPolicy()
147183
if platform.system() == "Windows":
@@ -155,9 +191,11 @@ def start(cls, caller_id: int) -> None:
155191
asyncio.run_coroutine_threadsafe(cls._process_queue(), cls._loop)
156192

157193
@classmethod
158-
def stop(cls, caller_id: int) -> None:
159-
"""Wait until all handlers are closed to stop the event loop and join the thread."""
160-
cls._stop_events[caller_id].set()
194+
def stop(cls, download_handler_id: int) -> None:
195+
"""Wait until all handlers are closed to stop the event loop and join the thread.
196+
Should be called from the Scrapy thread.
197+
"""
198+
cls._stop_events[download_handler_id].set()
161199
if all(ev.is_set() for ev in cls._stop_events.values()):
162200
asyncio.run_coroutine_threadsafe(cls._coro_queue.join(), cls._loop)
163201
cls._loop.call_soon_threadsafe(cls._loop.stop)

scrapy_playwright/handler.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from scrapy.http.headers import Headers
3232
from scrapy.responsetypes import responsetypes
3333
from scrapy.settings import Settings
34-
from scrapy.utils.defer import deferred_from_coro, maybe_deferred_to_future
34+
from scrapy.utils.defer import deferred_from_coro
3535
from scrapy.utils.misc import load_object
3636
from scrapy.utils.reactor import verify_installed_reactor
3737
from twisted.internet.defer import Deferred, inlineCallbacks
@@ -144,18 +144,21 @@ def __init__(self, crawler: Crawler) -> None:
144144
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
145145
if _SCRAPY_ASYNC_API:
146146
super().__init__(crawler=crawler)
147-
crawler.signals.connect(self._launch, signals.engine_started)
148147
else:
149148
super().__init__( # pylint: disable=unexpected-keyword-arg
150149
settings=crawler.settings, crawler=crawler
151150
)
152-
crawler.signals.connect(self._engine_started, signals.engine_started)
153151
self.stats = crawler.stats
154152
self.config = Config.from_settings(crawler.settings)
155153

156154
if self.config.use_threaded_loop:
157155
_ThreadedLoopAdapter.start(id(self))
158156

157+
if _SCRAPY_ASYNC_API:
158+
crawler.signals.connect(self._maybe_launch_in_thread, signals.engine_started)
159+
else:
160+
crawler.signals.connect(self._engine_started, signals.engine_started)
161+
159162
self.browser_launch_lock = asyncio.Lock()
160163
self.context_launch_lock = asyncio.Lock()
161164
self.context_wrappers: Dict[str, BrowserContextWrapper] = {}
@@ -186,10 +189,17 @@ def _deferred_from_coro(self, coro: Awaitable) -> Deferred:
186189
return _ThreadedLoopAdapter._deferred_from_coro(coro)
187190
return deferred_from_coro(coro)
188191

192+
def _maybe_future_from_coro(self, coro: Awaitable) -> Awaitable | asyncio.Future:
193+
if self.config.use_threaded_loop:
194+
return _ThreadedLoopAdapter._future_from_coro(coro)
195+
return coro
196+
189197
def _engine_started(self) -> Deferred:
190-
"""Launch the browser. Use the engine_started signal as it supports returning deferreds."""
191198
return self._deferred_from_coro(self._launch())
192199

200+
async def _maybe_launch_in_thread(self) -> None:
201+
await self._maybe_future_from_coro(self._launch())
202+
193203
async def _launch(self) -> None:
194204
"""Launch Playwright manager and configured startup context(s)."""
195205
logger.info("Starting download handler")
@@ -362,7 +372,9 @@ def _set_max_concurrent_context_count(self):
362372
async def close(self) -> None:
363373
logger.info("Closing download handler")
364374
await super().close()
365-
await self._close()
375+
await self._maybe_future_from_coro(self._close())
376+
if self.config.use_threaded_loop:
377+
_ThreadedLoopAdapter.stop(id(self))
366378

367379
else:
368380

@@ -371,6 +383,8 @@ def close(self) -> Deferred: # pylint: disable=invalid-overridden-method
371383
logger.info("Closing download handler")
372384
yield super().close()
373385
yield self._deferred_from_coro(self._close())
386+
if self.config.use_threaded_loop:
387+
_ThreadedLoopAdapter.stop(id(self))
374388

375389
async def _close(self) -> None:
376390
with suppress(TargetClosedError):
@@ -383,16 +397,13 @@ async def _close(self) -> None:
383397
await self.playwright_context_manager.__aexit__()
384398
if self.playwright:
385399
await self.playwright.stop()
386-
if self.config.use_threaded_loop:
387-
_ThreadedLoopAdapter.stop(id(self))
388400

389401
if _SCRAPY_ASYNC_API:
390402

391403
async def download_request(self, request: Request) -> Response:
392404
if request.meta.get("playwright"):
393-
return await maybe_deferred_to_future(
394-
self._deferred_from_coro(self._download_request(request, self._crawler.spider))
395-
)
405+
coro = self._download_request(request)
406+
return await self._maybe_future_from_coro(coro)
396407
return await super().download_request( # pylint: disable=no-value-for-parameter
397408
request
398409
)
@@ -408,7 +419,8 @@ def download_request( # type: ignore[misc] # pylint: disable=invalid-overridden
408419
request=request, spider=spider
409420
)
410421

411-
async def _download_request(self, request: Request, spider: Spider) -> Response:
422+
async def _download_request(self, request: Request, spider: Spider | None = None) -> Response:
423+
spider = spider or self._crawler.spider
412424
counter = 0
413425
while True:
414426
try:

tests/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,9 @@ def allow_windows(test_method):
2626

2727
@wraps(test_method)
2828
async def wrapped(self, *args, **kwargs):
29-
caller_id = 1234
30-
_ThreadedLoopAdapter.start(caller_id)
29+
_ThreadedLoopAdapter.start(id(test_method))
3130
coro = test_method(self, *args, **kwargs)
3231
asyncio.run_coroutine_threadsafe(coro=coro, loop=_ThreadedLoopAdapter._loop).result()
33-
_ThreadedLoopAdapter.stop(caller_id)
3432

3533
return wrapped
3634

@@ -50,7 +48,7 @@ async def make_handler(settings_dict: Optional[dict] = None):
5048
crawler = get_crawler(settings_dict=settings)
5149
handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
5250
try:
53-
await handler._launch()
51+
await handler._maybe_launch_in_thread()
5452
except: # noqa (E722), pylint: disable=bare-except
5553
pass
5654
else:

tests/tests_asyncio/test_browser.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import asyncio
22
import logging
3-
import platform
43
import random
54
import re
65
import subprocess
@@ -106,7 +105,6 @@ async def test_connect_devtools(self):
106105
"Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
107106
) in self._caplog.record_tuples
108107

109-
@pytest.mark.flaky(reruns=3)
110108
@allow_windows
111109
async def test_connect(self):
112110
async with remote_chromium(with_devtools_protocol=False) as browser_url:
@@ -192,10 +190,7 @@ async def test_browser_closed_restart(self):
192190
== 2 # one at the beginning, one after calling Browser.close() manually
193191
)
194192

195-
@pytest.mark.skipif(
196-
platform.system() == "Windows",
197-
reason="os.kill does not work as expected on Windows",
198-
)
193+
@allow_windows
199194
async def test_browser_crashed_restart(self):
200195
spider = Spider("foo")
201196
async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
@@ -239,10 +234,7 @@ async def test_browser_crashed_restart(self):
239234
== 2 # one at the beginning, one after killing the broser process
240235
)
241236

242-
@pytest.mark.skipif(
243-
platform.system() == "Windows",
244-
reason="os.kill does not work as expected on Windows",
245-
)
237+
@allow_windows
246238
async def test_browser_crashed_do_not_restart(self):
247239
spider = Spider("foo")
248240
settings_dict = {

tests/tests_twisted/test_mixed_requests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def _check_playwright_error(failure, url):
5656
# different errors depending on the platform
5757
self.assertTrue(
5858
f"net::ERR_CONNECTION_REFUSED at {url}" in str(failure.value)
59-
or f"Page.goto: Timeout {self.timeout_ms}ms exceeded" in str(failure.value)
59+
or f"Timeout {self.timeout_ms}ms exceeded" in str(failure.value)
6060
)
6161

6262
req1 = Request(self.server.urljoin("/index.html"))

0 commit comments

Comments
 (0)