test: Reduce flakiness in tests (#1716)

Pijukatel · web-flow · commit 8c0dae65e1d7 · 2026-02-10T12:29:13.000+01:00
### Description - Properly consume the first `SystemInfoEvent` in the fixture to ensure no interference with tests - Add extra wait buffer to a flaky test sensitive to timing issues - Take into account that `asyncio.sleep` time can be slightly shorter than expected - Run resource-sensitive tests alone on Mac. The Mac executor is the most sensitive to resource-dependent tests. Reduce flakiness by running some of them alone on Mac only. This should reduce the flakiness without hiding issues too much. (The Mac executor on GitHub has its own issues, which we do not need to deal with. If a test is flaky on other platforms, it should be investigated. If it is flaky on Mac only, try running it alone first.) ### Issues - Closes: #1652
diff --git a/tests/unit/README.md b/tests/unit/README.md
@@ -1 +1,11 @@
 # Unit tests
+
+Some tests may exhibit flaky behavior in CI. The reason for the flaky behavior should be understood, as it can indicate a bug in the code or a design flaw in the test. There are also reasons related to test execution, such as tests that are not (or cannot be) properly isolated, or limited resources of the test executor.
+
+Here are some suggested approaches to mitigate flakiness, sorted in order of preference:
+  - Investigate the root cause and fix the code or test.
+  - Apply one of the pytest marks to mitigate the flakiness:
+    - `@run_alone_on_mac` - A test with this mark will run alone on the macOS executor in CI (normally several tests run in parallel, which can cause resource-sensitive tests to fail). Use for resource-sensitive tests that are known to be flaky only on macOS.
+    - `@run_alone` - A test with this mark will run alone on any executor. Use for resource-sensitive tests that are known to be flaky on all platforms, or for tests that cannot be run in parallel with other tests due to their design (this should be extremely rare).
+    - `@pytest.mark.flaky` - A test with this mark will be retried several times if it fails. Use for tests that are known to be flaky, but where the reason for the flakiness is not understood or cannot be easily mitigated.
+    - `@pytest.mark.skip` - A test with this mark will be skipped. Use when none of the above approaches mitigate the test flakiness. Marking a test as skipped should be a last resort, as it can hide potential bugs and give a false sense of security. Skipped tests should be tracked in a GitHub issue.
diff --git a/tests/unit/_autoscaling/test_snapshotter.py b/tests/unit/_autoscaling/test_snapshotter.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from datetime import datetime, timedelta, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING, cast
@@ -23,8 +24,22 @@
 
 @pytest.fixture
 async def event_manager() -> AsyncGenerator[LocalEventManager, None]:
-    # Use a long interval to avoid interference from periodic system info events during tests
-    async with LocalEventManager(system_info_interval=timedelta(hours=9999)) as event_manager:
+    # Use a long interval to avoid interference from periodic system info events during tests and ensure the first
+    # automatic event is consumed before yielding.
+
+    event_manager = LocalEventManager(system_info_interval=timedelta(hours=9999))
+
+    initial_system_info_consumed = asyncio.Event()
+
+    async def consume_automatic_system_info(_: EventSystemInfoData) -> None:
+        initial_system_info_consumed.set()
+
+    event_manager.on(event=Event.SYSTEM_INFO, listener=consume_automatic_system_info)
+
+    async with event_manager:
+        await initial_system_info_consumed.wait()
+        event_manager.off(event=Event.SYSTEM_INFO, listener=consume_automatic_system_info)
+
         yield event_manager
 
 
diff --git a/tests/unit/_statistics/test_request_max_duration.py b/tests/unit/_statistics/test_request_max_duration.py
@@ -7,6 +7,11 @@
 
 async def test_request_max_duration_tracks_maximum() -> None:
     """Test that request_max_duration correctly tracks the maximum duration, not the minimum."""
+
+    # asyncio.sleep() can sleep slightly shorter than expected https://bugs.python.org/issue31539#msg302699
+    asyncio_sleep_time_tolerance = 0.015
+    sleep_time = 0.05
+
     async with Statistics.with_default_state() as statistics:
         # Record a short request
         statistics.record_request_processing_start('request_1')
@@ -15,15 +20,15 @@ async def test_request_max_duration_tracks_maximum() -> None:
 
         # Record a longer request
         statistics.record_request_processing_start('request_2')
-        await asyncio.sleep(0.05)  # 50ms delay
+        await asyncio.sleep(sleep_time)  # 50ms delay
         statistics.record_request_processing_finish('request_2')
         second_duration = statistics.state.request_max_duration
 
         # The max duration should be updated to the longer request's duration
         assert second_duration is not None
         assert first_duration is not None
         assert second_duration >= first_duration
-        assert second_duration.total_seconds() >= 0.05
+        assert second_duration.total_seconds() >= (sleep_time - asyncio_sleep_time_tolerance)
 
         # Record another short request - max should NOT decrease
         statistics.record_request_processing_start('request_3')
diff --git a/tests/unit/_utils/test_recurring_task.py b/tests/unit/_utils/test_recurring_task.py
@@ -7,6 +7,7 @@
 import pytest
 
 from crawlee._utils.recurring_task import RecurringTask
+from tests.unit.utils import run_alone_on_mac
 
 
 @pytest.fixture
@@ -41,6 +42,7 @@ async def test_start_and_stop(function: AsyncMock, delay: timedelta) -> None:
     assert rt.task.done()
 
 
+@run_alone_on_mac
 async def test_execution(function: AsyncMock, delay: timedelta) -> None:
     task = RecurringTask(function, delay)
 
diff --git a/tests/unit/browsers/test_browser_pool.py b/tests/unit/browsers/test_browser_pool.py
@@ -5,6 +5,7 @@
 import pytest
 
 from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
+from tests.unit.utils import run_alone_on_mac
 
 if TYPE_CHECKING:
     from yarl import URL
@@ -92,6 +93,7 @@ async def test_new_page_with_each_plugin(server_url: URL) -> None:
         assert browser_pool.total_pages_count == 2
 
 
+@run_alone_on_mac
 async def test_with_default_plugin_constructor(server_url: URL) -> None:
     async with BrowserPool.with_default_plugin(headless=True, browser_type='firefox') as browser_pool:
         assert len(browser_pool.plugins) == 1
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -1373,6 +1373,7 @@ async def test_timeout_in_handler(sleep_type: str) -> None:
     # Test is skipped in older Python versions.
     from asyncio import timeout  # type:ignore[attr-defined] # noqa: PLC0415
 
+    non_realtime_system_coefficient = 2
     handler_timeout = timedelta(seconds=1)
     max_request_retries = 3
     double_handler_timeout_s = handler_timeout.total_seconds() * 2
@@ -1401,7 +1402,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
     # Timeout in pytest, because previous implementation would run crawler until following:
     # "The request queue seems to be stuck for 300.0s, resetting internal state."
-    async with timeout(max_request_retries * double_handler_timeout_s):
+    async with timeout(max_request_retries * double_handler_timeout_s * non_realtime_system_coefficient):
         await crawler.run(['https://a.placeholder.com'])
 
     assert crawler.statistics.state.requests_finished == 1
diff --git a/tests/unit/utils.py b/tests/unit/utils.py
@@ -0,0 +1,5 @@
+import sys
+
+import pytest
+
+run_alone_on_mac = pytest.mark.run_alone if sys.platform == 'darwin' else lambda x: x