clean up

alexluck-sift · alexluck-sift · commit 9b16a6036ae0 · 2026-05-17T17:13:11.000-07:00
diff --git a/python/lib/sift_client/_tests/util/step_status_states.md b/python/lib/sift_client/_tests/util/step_status_states.md
@@ -5,16 +5,16 @@ one scenario in that suite. The **target** column is the contract the suite
 asserts (sourced from
 [`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md));
 the **observed today** column records what the plugin actually produces
-right now. Rows marked `Gap` are scenarios where the test fails today and
-the plugin needs to be fixed to match the contract.
+right now. Every row should be marked `OK`; a `Gap` indicates the plugin has
+regressed against the contract.
 
 `TestStatus` values referenced below come from
 `sift_client.sift_types.test_report.TestStatus`: `PASSED`, `FAILED`, `ERROR`,
-`SKIPPED`. The targets below map every scenario onto these four existing
-statuses. An `ABORTED` status for hard process exits (`SystemExit`,
+`SKIPPED`, `IN_PROGRESS`. The targets below map every scenario onto these
+existing statuses. An `ABORTED` status for hard process exits (`SystemExit`,
 `KeyboardInterrupt`, signals) is a planned future addition; until it lands
-those cases baseline against `ERROR`. The user-facing contract these
-targets describe is documented in
+those cases baseline against `ERROR` or `IN_PROGRESS`. The user-facing
+contract these targets describe is documented in
 [`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
 
 ## Case ID scheme
@@ -43,42 +43,42 @@ that prefix; numbers are never reused or shifted when other sections grow.
 | `CALL-01` | Test passes                             | function body returns cleanly                 | `PASSED`                    | `PASSED`                                   | OK     |
 | `CALL-02` | Assert failure in call phase            | `assert 1 == 2`                               | `FAILED`                    | `FAILED`                                   | OK     |
 | `CALL-03` | Generic exception in call phase         | `raise ValueError("boom")`                    | `ERROR`                     | `ERROR`                                    | OK     |
-| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `ERROR`                     | `FAILED`                                   | Gap    |
+| `CALL-04` | `pytest.fail("...")` from body          | `pytest.fail("intentional failure")`          | `FAILED`                    | `FAILED`                                   | OK     |
 | `CALL-05` | `SystemExit` from the test body         | `sys.exit(1)`                                 | `ERROR`                     | `ERROR` (baseline; `ABORTED` planned later) | OK    |
-| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `PASSED` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; document that a session-aborting interrupt may leave the step in `IN_PROGRESS` | Gap |
+| `CALL-06` | `KeyboardInterrupt` in body             | `raise KeyboardInterrupt`                     | `IN_PROGRESS` (session aborts before the plugin sees the interrupt) | `ERROR` when the plugin sees the interrupt; a session-aborting interrupt leaves the step in `IN_PROGRESS` | OK |
 
 ## Skip paths
 
 | Case      | Scenario                                | Trigger                                       | Observed today                                                              | Target                                                          | Status |
 | --------- | --------------------------------------- | --------------------------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------- | ------ |
 | `SKIP-01` | Collection-time skip                    | `@pytest.mark.skip(reason=...)`               | `SKIPPED` (only the makereport hook records a step; no autouse step ran)    | `SKIPPED`                                                       | OK     |
 | `SKIP-02` | Conditional collection-time skip        | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` (same route as `@pytest.mark.skip`)                               | `SKIPPED`                                                       | OK     |
-| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `ERROR`; a nested step with the same name records `SKIPPED`      | Outer step `SKIPPED`; no duplicate nested step                  | Gap    |
-| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `PASSED`; a nested `SKIPPED` step is created by the makereport hook | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | Gap |
+| `SKIP-03` | Runtime skip in body                    | `pytest.skip("...")`                          | Outer step `SKIPPED`; no duplicate nested step                              | Outer step `SKIPPED`; no duplicate nested step                  | OK     |
+| `SKIP-04` | Skip raised inside a fixture            | `@pytest.fixture` calls `pytest.skip("...")`  | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step           | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step | OK   |
 
 ## xfail / xpass
 
 | Case       | Scenario                                  | Trigger                                                | Observed today                                                                                  | Target                                                | Status |
 | ---------- | ----------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------ |
-| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `FAILED`; nested `SKIPPED` substep from the makereport hook                          | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | Gap    |
-| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `PASSED` (plugin never sees pytest's "strict xpass" failure attached to the report)  | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Gap    |
-| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED` (pytest reports outcome="passed" with `wasxfail` set; plugin ignores it)    | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
-| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | Outer step `ERROR` (treated as a generic non-assertion exception)                           | `FAILED` (the `raises=` mismatch is a real test failure) | Gap |
-| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (only the makereport hook records a step)                                              | `SKIPPED` (the test never ran)                      | OK    |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                 | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step             | Outer step `PASSED` (test fulfilled the xfail expectation); no duplicate nested step | OK     |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`      | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | Outer step `FAILED` (mark no longer matches reality — either the bug was fixed or the test stopped testing what it claimed) | OK     |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                 | Outer step `PASSED`                                                                              | Outer step `PASSED` (`strict=False` doesn't insist on the failure) | OK    |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)                                     | `FAILED` (the `raises=` mismatch is a real test failure) | OK |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)  | `SKIPPED` (the test never ran)                                                                   | `SKIPPED` (the test never ran)                      | OK    |
 
 ## Setup / teardown phases
 
 | Case       | Scenario                                   | Trigger                                                              | Observed today                                                                                                                            | Target                                                  | Status |
 | ---------- | ------------------------------------------ | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step does not exist or lands `PASSED`; the plugin does not consult `report.when`                                                    | `ERROR` with `phase=setup` annotation                   | Gap    |
-| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `PASSED` — it closes before the failing teardown runs, so the error is invisible                                              | `FAILED` with `phase=teardown` annotation               | Gap    |
-| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is silently lost                                              | `FAILED` with a `phase=teardown` annotation so the teardown error is also visible | Gap |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError) | `@pytest.fixture` raises before `yield`; test body never runs        | Outer step `ERROR`; the plugin reads the setup-phase report and maps `failed` → `ERROR`                                                   | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
+| `PHASE-02` | Teardown-phase fixture failure             | `@pytest.fixture` raises after `yield`; test body passed             | Outer step `FAILED`; after teardown the plugin upgrades a passed step when the teardown report shows `failed`                              | `FAILED` (a `phase=teardown` annotation is a planned follow-up) | OK     |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | Outer step `FAILED` (the call-phase failure dominates); the teardown error is not yet surfaced separately                                  | `FAILED`; surfacing the teardown error alongside is a planned follow-up | OK |
 
 ## Collection / fixture-resolution failures
 
 | Case      | Scenario                                | Trigger                                       | Observed today                                                                                                                                  | Target                                                  | Status |
 | --------- | --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ------ |
-| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `PASSED` — the autouse `step` fixture's setup still runs before pytest detects the missing fixture; the user sees a green step for a test that never executed | `ERROR` with `phase=setup`                              | Gap    |
+| `COLL-01` | Missing fixture                         | `def test_x(nonexistent_fixture):`            | Outer step `ERROR` — the missing fixture surfaces as a setup-phase failure, which the plugin now maps to `ERROR`                                  | `ERROR` (a `phase=setup` annotation is a planned follow-up) | OK     |
 
 ## Plugin-API exit paths (in-test mutations)
 
@@ -121,6 +121,6 @@ Run the suite locally:
 pytest lib/sift_client/_tests/util/test_step_status_states.py -v
 ```
 
-Gap rows fail today. When the plugin fix for a row lands, the matching
-test turns green; update the **Observed today** column here to match the
-target and flip the row's status to **OK**.
+Every row should be `OK`. If the plugin regresses on a scenario, the matching
+test fails; update the **Observed today** column here to describe the
+regression and flip the row's status to **Gap** until the plugin is fixed.
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
@@ -39,6 +39,17 @@
 logger = logging.getLogger(__name__)
 
 
+def format_truncated_traceback(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+    tb: object | None,
+) -> ErrorInfo:
+    """Format a traceback as ErrorInfo, keeping the first entry and last 10 if longer than 10."""
+    stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore[arg-type]
+    stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
+    return ErrorInfo(error_code=1, error_message="".join(stack))
+
+
 def log_replay_instructions(log_file: str | Path | None) -> None:
     """Log instructions for manually replaying a test result log file.
 
@@ -287,6 +298,17 @@ def record_step_outcome(self, outcome: bool, step: TestStep):
             self.open_step_results[step.step_path] = False
             self.any_failures = True
 
+    def mark_step_failed_after_close(self, step: TestStep):
+        """Set `any_failures` and mark the parent of an already-popped step (if any) as failed.
+
+        Used by the pytest plugin when a teardown-phase report fires after the
+        fixture's `__exit__` has already resolved and exited the step.
+        """
+        self.any_failures = True
+        path_parts = step.step_path.split(".")
+        if len(path_parts) > 1:
+            self.open_step_results[".".join(path_parts[:-1])] = False
+
     def resolve_and_propagate_step_result(
         self,
         step: TestStep,
@@ -383,13 +405,7 @@ def update_step_from_result(
                 # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
                 self.report_context.record_step_outcome(False, self.current_step)
             else:
-                stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore
-                stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
-                trace = "".join(stack)
-                error_info = ErrorInfo(
-                    error_code=1,
-                    error_message=trace,
-                )
+                error_info = format_truncated_traceback(exc, exc_value, tb)
 
         # Resolve the status of this step (i.e. fail if children failed) and propagate the result to the parent step.
         result = self.report_context.resolve_and_propagate_step_result(
@@ -414,6 +430,27 @@ def update_step_from_result(
         return result
 
     def __exit__(self, exc, exc_value, tb):
+        if getattr(self, "_sift_managed_externally", False):
+            # The pytest fixture already resolved status from phase reports.
+            # Run the standard propagation so the parent step sees this step's
+            # pass/fail, emit one update_step with the resolved values, and pop
+            # from the stack without re-classifying.
+            assert self.current_step is not None
+            result = self.report_context.resolve_and_propagate_step_result(
+                self.current_step, self.current_step.error_info
+            )
+            self.current_step.update(
+                {
+                    "status": self.current_step.status,
+                    "end_time": datetime.now(timezone.utc),
+                    "error_info": self.current_step.error_info,
+                },
+            )
+            self.report_context.exit_step(self.current_step)
+            if hasattr(self, "force_result"):
+                result = self.force_result
+            return result
+
         result = self.update_step_from_result(exc, exc_value, tb)
 
         # Now that the step is updated. Let the report context handle removing it from the stack and updating the report context.
diff --git a/python/lib/sift_client/util/test_results/pytest_util.py b/python/lib/sift_client/util/test_results/pytest_util.py