MAINT: Fix flaky sleeps and MagicMock misuse in unit tests (#1874)

romanlutz · Copilot · web-flow · commit 5e8bf1d62da7 · 2026-06-02T22:13:45.000Z
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/tests/unit/auth/test_copilot_authenticator.py b/tests/unit/auth/test_copilot_authenticator.py
@@ -491,7 +491,9 @@ async def test_get_token_serializes_concurrent_requests(self, mock_env_vars, moc
         async def mock_fetch():
             nonlocal fetch_call_count
             fetch_call_count += 1
-            await asyncio.sleep(0.01)  # minimal delay to test concurrency
+            # Yield once so concurrent callers contend for the lock; the lock
+            # guarantees serialization regardless of real-time delays.
+            await asyncio.sleep(0)
             return f"token.{fetch_call_count}"
 
         def mock_load_side_effect():
diff --git a/tests/unit/cli/test_pyrit_shell.py b/tests/unit/cli/test_pyrit_shell.py
@@ -191,7 +191,9 @@ def test_run_async_raises_timeout_error(self):
         try:
 
             async def hangs():
-                await asyncio.sleep(10)
+                # Block on an Event that's never set so the coroutine truly
+                # cannot complete on its own; the timeout under test must cut it off.
+                await asyncio.Event().wait()
 
             with pytest.raises(TimeoutError, match="did not complete"):
                 s._run_async(hangs(), timeout=0.05)
diff --git a/tests/unit/exceptions/test_retry_collector.py b/tests/unit/exceptions/test_retry_collector.py
@@ -95,12 +95,12 @@ def test_contextvar_isolation_across_tasks(self) -> None:
         async def task_a() -> None:
             c = RetryCollector()
             set_retry_collector(c)
-            await asyncio.sleep(0.01)
+            await asyncio.sleep(0)
             results["a_has_collector"] = get_retry_collector() is c
             clear_retry_collector()
 
         async def task_b() -> None:
-            await asyncio.sleep(0.005)
+            await asyncio.sleep(0)
             results["b_sees_none"] = get_retry_collector() is None
 
         async def run() -> None:
diff --git a/tests/unit/executor/attack/core/test_attack_executor.py b/tests/unit/executor/attack/core/test_attack_executor.py
@@ -259,7 +259,8 @@ async def mock_execute(*, context):
             nonlocal concurrent_count, max_concurrent
             concurrent_count += 1
             max_concurrent = max(max_concurrent, concurrent_count)
-            await asyncio.sleep(0.05)
+            # Yield so other tasks bounded by the semaphore can also enter.
+            await asyncio.sleep(0)
             concurrent_count -= 1
             return create_attack_result(context.params.objective)
 
@@ -282,7 +283,8 @@ async def test_single_concurrency_serializes_execution(self):
         async def mock_execute(*, context):
             objective = context.params.objective
             execution_order.append(f"start_{objective}")
-            await asyncio.sleep(0.01)
+            # Yield once so another task could interleave if max_concurrency > 1.
+            await asyncio.sleep(0)
             execution_order.append(f"end_{objective}")
             return create_attack_result(objective)
 
@@ -455,9 +457,9 @@ async def test_attribution_parallel_safe_with_high_concurrency(self):
         async def out_of_order(context):
             attr = context._attribution
             assert attr is not None
-            # Reverse-delay tasks so completion order is inverse of input order.
-            i = int(context.params.objective.split("-")[1])
-            await asyncio.sleep(0.005 * (10 - i))
+            # Yield so all tasks run concurrently under the high-concurrency executor;
+            # the assertion verifies attribution is per-task regardless of order.
+            await asyncio.sleep(0)
             seen[context.params.objective] = attr
             return create_attack_result(context.params.objective)
 
diff --git a/tests/unit/models/test_message.py b/tests/unit/models/test_message.py
@@ -152,17 +152,23 @@ def test_duplicate_message_preserves_original_prompt_id(self, message: Message)
 
     def test_duplicate_message_creates_new_timestamp(self, message: Message) -> None:
         """Test that duplicate_message creates new timestamps."""
-        import time
+        from datetime import timedelta, timezone
+        from unittest.mock import patch
 
         original_timestamps = [piece.timestamp for piece in message.message_pieces]
+        fake_now = max(original_timestamps) + timedelta(seconds=1)
 
-        time.sleep(0.01)  # Small delay to ensure different timestamp
-        duplicated = message.duplicate_message()
+        with patch("pyrit.models.messages.message.datetime") as mock_datetime:
+            mock_datetime.now.return_value = fake_now
+            duplicated = message.duplicate_message()
 
         for dup_piece in duplicated.message_pieces:
-            # Verify timestamp is newer than all original timestamps
+            # Every duplicated piece shares the new timestamp produced by duplicate_message.
+            assert dup_piece.timestamp == fake_now
+            # And it is strictly newer than every original timestamp.
             for orig_ts in original_timestamps:
-                assert dup_piece.timestamp >= orig_ts
+                assert dup_piece.timestamp > orig_ts
+        mock_datetime.now.assert_called_once_with(tz=timezone.utc)
 
     def test_duplicate_message_is_deep_copy(self, message: Message) -> None:
         """Test that duplicate_message creates a deep copy (modifications don't affect original)."""
diff --git a/tests/unit/models/test_message_piece.py b/tests/unit/models/test_message_piece.py
@@ -3,11 +3,11 @@
 
 import os
 import tempfile
-import time
 import uuid
 import warnings
 from collections.abc import MutableSequence
 from datetime import datetime, timedelta, timezone
+from unittest.mock import patch
 
 import pytest
 from unit.mocks import MockPromptTarget, get_mock_target, get_sample_conversations
@@ -41,14 +41,16 @@ def test_id_set():
 
 
 def test_datetime_set():
-    now = datetime.now(tz=timezone.utc)
-    time.sleep(0.1)
-    entry = MessagePiece(
-        role="user",
-        original_value="Hello",
-        converted_value="Hello",
-    )
-    assert entry.timestamp > now
+    fake_now = datetime(2099, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
+    with patch("pyrit.models.messages.message_piece.datetime") as mock_datetime:
+        mock_datetime.now.return_value = fake_now
+        entry = MessagePiece(
+            role="user",
+            original_value="Hello",
+            converted_value="Hello",
+        )
+    assert entry.timestamp == fake_now
+    mock_datetime.now.assert_called_once_with(tz=timezone.utc)
 
 
 def test_converters_serialize():
diff --git a/tests/unit/output/test_helpers.py b/tests/unit/output/test_helpers.py
@@ -117,7 +117,7 @@ async def test_output_scenario_async_forwards_sort_groups_by_success_rate(mock_c
 
 async def test_output_scenario_async_unsupported_format():
     with pytest.raises(ValueError, match="Unsupported format"):
-        await output_scenario_async(MagicMock(), format="markdown")
+        await output_scenario_async(AsyncMock(), format="markdown")
 
 
 # --- output_scorer_async tests ---
@@ -150,7 +150,7 @@ async def test_output_scorer_async_with_harm_category(mock_cls):
 
 async def test_output_scorer_async_unsupported_format():
     with pytest.raises(ValueError, match="Unsupported format"):
-        await output_scorer_async(scorer_identifier=MagicMock(), format="markdown")
+        await output_scorer_async(scorer_identifier=AsyncMock(), format="markdown")
 
 
 # --- output_conversation_async tests ---
@@ -185,7 +185,7 @@ async def test_output_conversation_async_with_scores(mock_cls):
 
 async def test_output_conversation_async_unsupported_format():
     with pytest.raises(ValueError, match="Unsupported format"):
-        await output_conversation_async([MagicMock()], format="markdown")
+        await output_conversation_async([AsyncMock()], format="markdown")
 
 
 # --- output_score_async tests ---
@@ -208,4 +208,4 @@ async def test_output_score_async_pretty_default(mock_cls):
 
 async def test_output_score_async_unsupported_format():
     with pytest.raises(ValueError, match="Unsupported format"):
-        await output_score_async([MagicMock()], format="markdown")
+        await output_score_async([AsyncMock()], format="markdown")
diff --git a/tests/unit/prompt_target/test_discover_target_capabilities.py b/tests/unit/prompt_target/test_discover_target_capabilities.py
@@ -755,7 +755,9 @@ async def test_timeout_returns_false_after_retries(self) -> None:
         target = MockPromptTarget()
 
         async def _hang(**_kwargs: object) -> list[Message]:
-            await asyncio.sleep(10)
+            # Block on an Event that's never set so the probe truly cannot
+            # complete on its own; per_probe_timeout_s must cut it off.
+            await asyncio.Event().wait()
             return _ok_response()
 
         target._send_prompt_to_target_async = AsyncMock(side_effect=_hang)  # type: ignore[method-assign]
diff --git a/tests/unit/scenario/core/test_scenario.py b/tests/unit/scenario/core/test_scenario.py
@@ -1218,7 +1218,8 @@ async def run_async(*, executor, **kwargs):
                         async with lock:
                             in_flight[0] += 1
                             peak[0] = max(peak[0], in_flight[0])
-                        await asyncio.sleep(0.02)
+                        # Yield so other tasks contending for the semaphore can enter.
+                        await asyncio.sleep(0)
                         async with lock:
                             in_flight[0] -= 1
                 _stamp_scenario_linkage(
@@ -1313,7 +1314,9 @@ async def test_failure_lets_inflight_siblings_finish_but_skips_queued(
 
         async def ok_run(idx, name):
             started_calls.append(name)
-            await asyncio.sleep(0.05)
+            # Wait for the bad task to fail before this one completes, so the
+            # failure is observed mid-flight (no wall-clock dependency).
+            await bad_started.wait()
             completed_calls.append(name)
             _stamp_scenario_linkage(
                 attack_results=[sample_attack_results[idx]],
@@ -1367,7 +1370,8 @@ async def test_multiple_inflight_failures_are_grouped_into_exception_group(
         # observed (no queueing) and every failure should propagate.
         def make_fail_run(name: str):
             async def _run(*args, **kwargs):
-                await asyncio.sleep(0.01)
+                # Yield so all three workers are in-flight before any fails.
+                await asyncio.sleep(0)
                 raise RuntimeError(f"{name} boom")
 
             return AsyncMock(side_effect=_run)
diff --git a/tests/unit/score/test_scorer.py b/tests/unit/score/test_scorer.py
@@ -998,15 +998,15 @@ async def test_score_response_async_concurrent_execution():
 
     async def mock_aux_score_async(message: Message, **kwargs) -> list[Score]:
         call_order.append("aux_start")
-        # Simulate some async work
-        await asyncio.sleep(0.01)
+        # Yield so the other scorer can interleave (proves concurrent execution).
+        await asyncio.sleep(0)
         call_order.append("aux_end")
         return [MagicMock(spec=Score)]
 
     async def mock_obj_score_async(message: Message, **kwargs) -> list[Score]:
         call_order.append("obj_start")
-        # Simulate some async work
-        await asyncio.sleep(0.01)
+        # Yield so the other scorer can interleave (proves concurrent execution).
+        await asyncio.sleep(0)
         call_order.append("obj_end")
         score = MagicMock(spec=Score)
         score.get_value.return_value = True