From 23fbdfd90c0b9f2e94a0bdf43fb8bb2f0673ae87 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 15 May 2026 17:05:41 -0700
Subject: [PATCH 01/19] initial dev version

---
 python/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index ddb4c7d80..403c89bf8 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.16.2"
+version = "0.17.0.dev0"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From a3d9b3409491769785c76f516bdf5e11207e4dcb Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 19 May 2026 06:25:14 -0700
Subject: [PATCH 02/19] Python(feat): pytest plugin improvements (#567)

---
 python/docs/examples/pytest_plugin.md         | 173 +++++--
 python/lib/sift_client/_tests/conftest.py     |   4 -
 .../_tests/pytest_plugin/__init__.py          |   0
 .../_tests/pytest_plugin/conftest.py          |  54 +++
 .../pytest_plugin/test_configuration.py       | 394 ++++++++++++++++
 .../_tests/pytest_plugin/test_credentials.py  | 117 +++++
 .../lib/sift_client/_tests/util/conftest.py   |  35 +-
 python/lib/sift_client/pytest_plugin.py       | 436 ++++++++++++++++++
 .../sift_client/util/test_results/__init__.py | 120 +++--
 .../util/test_results/pytest_util.py          | 206 ---------
 python/pyproject.toml                         |   9 +
 11 files changed, 1255 insertions(+), 293 deletions(-)
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/__init__.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/conftest.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
 create mode 100644 python/lib/sift_client/pytest_plugin.py
 delete mode 100644 python/lib/sift_client/util/test_results/pytest_util.py

diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index cf56dd75e..3557dd9c7 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -9,11 +9,13 @@ This page walks through wiring the plugin into a project, the fixtures and
 hooks it provides, and the patterns you'll use day-to-day.
 
 !!! info "Where the plugin lives"
-    The plugin is part of `sift_client.util.test_results`. It is **not**
-    registered as a `pytest11` entry point. Projects opt in with a
-    `from sift_client.util.test_results import *` in their `conftest.py`.
-    That import is what wires up the fixtures, the CLI options, and the
-    `pytest_runtest_makereport` hook.
+    The plugin lives at `sift_client.pytest_plugin`. It is
+    **not** registered as a `pytest11` entry point. Projects opt in with a
+    `pytest_plugins` declaration in their top-level `conftest.py`. Pytest
+    then loads the module as a real plugin: the fixtures, CLI options, and
+    `pytest_runtest_makereport` hook all register through standard pytest
+    machinery, so `pytest --trace-config` lists it and
+    `pytest -p no:sift_client.pytest_plugin` disables it.
 
 ## Install
 
@@ -33,9 +35,26 @@ The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your
 
 ## Wire the plugin into `conftest.py`
 
-Two things are required: a session-scoped `sift_client` fixture (the plugin's
-`report_context` fixture resolves it by name), and a star-import that registers
-the plugin's fixtures into the conftest's namespace.
+A single `pytest_plugins` declaration in your top-level `conftest.py` is all
+that's required. The plugin ships a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+
+```python title="conftest.py"
+from dotenv import load_dotenv
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
+
+That's the whole setup. Every test in the session will now create a step on a
+single shared `TestReport`.
+
+### Customizing the `SiftClient`
+
+To construct the client differently (custom TLS, timeouts, alternate
+credentials, etc.), override the `sift_client` fixture in your conftest. The
+plugin's default falls away in favor of your definition.
 
 ```python title="conftest.py"
 import os
@@ -45,30 +64,23 @@ from dotenv import load_dotenv
 
 from sift_client import SiftClient, SiftConnectionConfig
 
-# Star-import wires fixtures + hooks + CLI options into pytest collection.
-from sift_client.util.test_results import *
-
 load_dotenv()
 
+pytest_plugins = ["sift_client.pytest_plugin"]
+
 
 @pytest.fixture(scope="session")
 def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI")
-    rest_url = os.getenv("SIFT_REST_URI")
-    api_key = os.getenv("SIFT_API_KEY")
-    
     return SiftClient(
         connection_config=SiftConnectionConfig(
-            api_key=api_key,
-            grpc_url=grpc_url,
-            rest_url=rest_url,
+            api_key=os.getenv("SIFT_API_KEY"),
+            grpc_url=os.getenv("SIFT_GRPC_URI"),
+            rest_url=os.getenv("SIFT_REST_URI"),
+            use_ssl=False,
         )
     )
 ```
 
-That's the whole setup. Every test in the session will now create a step on a
-single shared `TestReport`.
-
 ## Plugin provided fixtures
 
 | Name | Kind | Scope | Purpose |
@@ -86,17 +98,82 @@ single shared `TestReport`.
 | `--no-sift-test-results-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
 | `--sift-test-results-check-connection` | off | Make `report_context`, `step`, and `module_substep` no-op (yield `None`) when `client_has_connection` is `False`. Lets the same suite run locally without a Sift backend. |
 
-These can be set permanently in `pytest.ini`:
+These can be passed permanently via `addopts`:
 
 ```ini title="pytest.ini"
 [pytest]
 addopts = --sift-test-results-check-connection
 ```
 
+Or set the matching ini key directly (recommended for stable per-project
+configuration). Each CLI flag has a corresponding key under
+`[tool.pytest.ini_options]` in `pyproject.toml` or `[pytest]` in `pytest.ini`.
+CLI flags, when passed, override the ini values.
+
+| Ini key | Type | Equivalent CLI flag |
+|---|---|---|
+| `sift_test_results_log_file` | string (`true` / `false` / `none` / path) | `--sift-test-results-log-file=<value>` |
+| `sift_test_results_git_metadata` | bool (default `true`) | `--no-sift-test-results-git-metadata` (sets to `false`) |
+| `sift_test_results_check_connection` | bool (default `false`) | `--sift-test-results-check-connection` |
+| `sift_test_results_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
+
+The default `sift_client` fixture reads its two URIs from the environment
+first and falls back to ini keys when the env vars are unset. `SIFT_API_KEY`
+is intentionally env-only — keep it out of source control and supply it
+through `pytest-dotenv` (see [API key handling](#api-key-handling) below).
+For the URIs, the env var wins when both it and the ini key are set, so
+values injected into a CI environment continue to override those committed
+to `pyproject.toml`. There are no CLI flags for credentials.
+
+| Ini key | Environment variable | Notes |
+|---|---|---|
+| _(none)_ | `SIFT_API_KEY` | Env-only. Use `.env` + `pytest-dotenv` locally; inject from your secret store in CI. |
+| `sift_grpc_uri` | `SIFT_GRPC_URI` | Stable per-org gRPC endpoint; safe to commit. |
+| `sift_rest_uri` | `SIFT_REST_URI` | Stable per-org REST endpoint; safe to commit. |
+
+```toml title="pyproject.toml"
+[tool.pytest.ini_options]
+sift_test_results_check_connection = true
+sift_test_results_log_file = "false"
+sift_test_results_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
+```
+
+```ini title="pytest.ini"
+[pytest]
+sift_test_results_check_connection = true
+sift_test_results_log_file = false
+sift_test_results_git_metadata = false
+sift_grpc_uri = your-org.sift.example:443
+sift_rest_uri = https://your-org.sift.example
+```
+
+#### API key handling
+
+`SIFT_API_KEY` is deliberately read from the process environment only. The
+recommended workflow uses the
+[`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) plugin (already a
+dependency of `sift-stack-py`), which loads variables from a `.env` file
+into `os.environ` before tests run.
+
+1. Add `.env` to `.gitignore`.
+2. Drop your key into `.env` at the project root:
+
+    ```bash title=".env"
+    SIFT_API_KEY=sk-...your-key...
+    ```
+
+3. In CI, set `SIFT_API_KEY` directly via your provider's secret manager
+   instead of committing a `.env` file.
+
+`pytest-dotenv` picks the file up automatically; no `pytest_configure`
+glue is needed.
+
 !!! warning "FedRAMP / shared environments"
-    Pass `--sift-test-results-log-file=false` to skip the temp file + worker
-    pipeline. Create/update calls then run inline against the API instead of
-    being deferred through a subprocess.
+    Pass `--sift-test-results-log-file=false` (or set the ini key to `"false"`)
+    to skip the temp file + worker pipeline. Create/update calls then run
+    inline against the API instead of being deferred through a subprocess.
 
 ### Report metadata captured automatically
 
@@ -122,6 +199,50 @@ metadata), call `report_context.report.update({...})` from any test or
 fixture. See [Linking a Run](#linking-a-run-to-the-report) for the same
 pattern applied to `run_id`.
 
+## Controlling which tests produce reports
+
+By default every test in the session produces a Sift step. Two markers
+and one ini key let you narrow that to a specific set of tests, which is
+useful when a repo holds tests that you don't want included in the Sift test report.
+
+| Setting                                                 | Effect                                                                                       |
+|---------------------------------------------------------|----------------------------------------------------------------------------------------------|
+| `sift_test_results_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
+| `@pytest.mark.sift_include` on a test, class, or module | Force reporting on for that scope, regardless of the project default.                        |
+| `@pytest.mark.sift_exclude` on a test, class, or module | Force reporting off for that scope, regardless of the project default.                       |
+
+The closest marker determines the setting; `sift_exclude` beats `sift_include` when both apply.
+A `pytestmark` at the class or module level applies to every test in that scope.
+
+### Bulk-applying a marker to a directory
+
+To opt an entire directory in (or out) without editing each file, hook
+`pytest_collection_modifyitems` in the directory's `conftest.py`:
+
+```python title="tests/example/conftest.py"
+from pathlib import Path
+
+import pytest
+
+_HERE = Path(__file__).parent
+
+
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
+```
+
+This applies `sift_include` to every test collected under `tests/example/`.
+Combine it with `sift_test_results_autouse = false` in `pyproject.toml` to
+report only on the directories you explicitly opt in.
+
+`pytest_collection_modifyitems` receives every item in the session, not just
+this directory's, so the `relative_to` filter is what scopes the marker.
+
 ## Basic usage
 
 With the conftest in place, the simplest test needs nothing extra. The `step`
@@ -585,7 +706,7 @@ automatic skip.
 ```python title="conftest.py"
 import pytest
 
-from sift_client.util.test_results import *
+pytest_plugins = ["sift_client.pytest_plugin"]
 
 
 @pytest.fixture(autouse=True)
diff --git a/python/lib/sift_client/_tests/conftest.py b/python/lib/sift_client/_tests/conftest.py
index 5683182e5..79b079d39 100644
--- a/python/lib/sift_client/_tests/conftest.py
+++ b/python/lib/sift_client/_tests/conftest.py
@@ -78,10 +78,6 @@ def ci_pytest_tag(sift_client):
     return tag
 
 
-# Import the Sift test results fixtures the way we recommend to users.
-from sift_client.util.test_results import *  # noqa: F403
-
-
 def pytest_configure(config: pytest.Config) -> None:
     """Enable the Sift connection-check mode for the fixtures used in this test suite since we run w/ mock client in non-integration tests."""
     config.option.sift_test_results_check_connection = True
diff --git a/python/lib/sift_client/_tests/pytest_plugin/__init__.py b/python/lib/sift_client/_tests/pytest_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/lib/sift_client/_tests/pytest_plugin/conftest.py b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
new file mode 100644
index 000000000..1fbd61e46
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
@@ -0,0 +1,54 @@
+"""Shared helpers for the pytest-plugin test suite.
+
+The tests in this directory drive inner pytester sessions to exercise the
+plugin's behavior in isolation. The fixtures below produce the boilerplate
+conftests those inner sessions need:
+
+- ``write_plugin_conftest``: minimal conftest that loads the plugin
+- ``write_probe_conftest``: conftest that loads the plugin and runs a probe
+  block inside ``pytest_configure``, useful for inspecting internal state
+  without running tests against a real backend
+
+Every test in this suite invokes the inner session via
+``pytester.runpytest_subprocess(...)`` rather than ``pytester.runpytest(...)``.
+``runpytest`` runs the inner pytest in-process, which re-imports the Sift
+plugin on each test; the plugin transitively imports numpy, whose C
+extensions refuse to initialize twice in one process and raise
+``cannot load module more than once per process``. Spawning a subprocess
+gives each inner session a fresh interpreter and sidesteps that guard.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import Callable
+
+import pytest
+
+
+@pytest.fixture
+def write_plugin_conftest(pytester: pytest.Pytester) -> Callable[[], None]:
+    """Return a callable that writes a minimal conftest loading the plugin."""
+
+    def _write() -> None:
+        pytester.makeconftest('pytest_plugins = ["sift_client.pytest_plugin"]')
+
+    return _write
+
+
+@pytest.fixture
+def write_probe_conftest(pytester: pytest.Pytester) -> Callable[[str], None]:
+    """Return a callable that writes a conftest running ``probe_body`` in ``pytest_configure``.
+
+    ``probe_body`` is python source that runs at config time with ``config``
+    in scope; use ``print(...)`` calls and capture them with
+    ``result.stdout.fnmatch_lines``.
+    """
+
+    def _write(probe_body: str) -> None:
+        pytester.makeconftest(
+            'pytest_plugins = ["sift_client.pytest_plugin"]\n\n'
+            "def pytest_configure(config):\n" + textwrap.indent(textwrap.dedent(probe_body), "    ")
+        )
+
+    return _write
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
new file mode 100644
index 000000000..9b9be2d63
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
@@ -0,0 +1,394 @@
+"""Tests for the plugin's CLI/ini configuration surface.
+
+Covers flag parsing, ini-key resolution, CLI-over-ini precedence, the
+defaults that apply when nothing is set, and the marker-based gate that
+governs the autouse fixtures. Credentials are tested in
+``test_credentials.py``.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestIniConfiguration:
+    """`addini` keys configure the plugin via pyproject.toml / pytest.ini."""
+
+    def test_ini_log_file_none(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_log_file = "none"
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["RESOLVED: None"])
+
+    def test_python_false_disables_log_file(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """`config.option.sift_test_results_log_file = False` disables logging.
+
+        Conftests use this pattern (see lib/sift_client/_tests/util/conftest.py)
+        to opt their subtree out of log-file mode. Regression test for the
+        resolver case where Python `False` was previously confused with `None`
+        and silently kept the temp-file default.
+        """
+        write_probe_conftest(
+            """
+            config.option.sift_test_results_log_file = False
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["RESOLVED: None"])
+
+    def test_ini_log_file_path(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        log_path = tmp_path / "sift-run.jsonl"  # ini value expected back from the resolver
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyprojecttoml(  # literal TOML string: no backslash escapes
+            f"""
+            [tool.pytest.ini_options]
+            sift_test_results_log_file = '{log_path}'
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines([f"RESOLVED: {log_path}"])
+
+    def test_ini_check_connection_true(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _check_connection_enabled
+            print("CHECK:", _check_connection_enabled(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_check_connection = true
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["CHECK: True"])
+
+    def test_ini_git_metadata_false(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        write_probe_conftest(
+            """
+            print("INI_GIT:", config.getini("sift_test_results_git_metadata"))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_git_metadata = false
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["INI_GIT: False"])
+
+    def test_cli_overrides_ini(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """A CLI flag takes precedence over the matching ini key."""
+        cli_path = tmp_path / "cli-wins.jsonl"
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _resolve_log_file
+            print("RESOLVED:", _resolve_log_file(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_log_file = "none"
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess(
+            "-s", "--co", f"--sift-test-results-log-file={cli_path}"
+        )
+        result.stdout.fnmatch_lines([f"RESOLVED: {cli_path}"])
+
+    def test_cli_check_connection_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """The ``--sift-test-results-check-connection`` CLI flag flips the resolver to True."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _check_connection_enabled
+            print("CHECK:", _check_connection_enabled(config))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", "--sift-test-results-check-connection")
+        result.stdout.fnmatch_lines(["CHECK: True"])
+
+    def test_cli_no_git_metadata_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """The ``--no-sift-test-results-git-metadata`` CLI flag flips git_metadata to False.
+
+        Guards the negation flag's ``dest`` binding: the flag name doesn't match
+        the ini key, so a broken ``dest`` would silently fall back to the ini
+        default and pass every other test in this file.
+        """
+        write_probe_conftest(
+            """
+            print("CLI_GIT:", config.getoption("sift_test_results_git_metadata"))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", "--no-sift-test-results-git-metadata")
+        result.stdout.fnmatch_lines(["CLI_GIT: False"])
+
+    def test_defaults_when_neither_set(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import (
+                _check_connection_enabled,
+                _resolve_log_file,
+            )
+            print("RESOLVED:", _resolve_log_file(config))
+            print("CHECK:", _check_connection_enabled(config))
+            print("INI_GIT:", config.getini("sift_test_results_git_metadata"))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(
+            [
+                "RESOLVED: True",
+                "CHECK: False",
+                "INI_GIT: True",
+            ]
+        )
+
+
+# A session-scoped `report_context` stub for the autouse-gate tests. Overrides
+# the plugin's real `report_context` so the inner pytest sessions don't try to
+# talk to a Sift backend; the gate tests only need to observe whether `step`
+# resolves to a real value or to None.
+_GATE_INNER_CONFTEST = textwrap.dedent(
+    """
+    from unittest.mock import MagicMock
+
+    import pytest
+
+    pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+    @pytest.fixture(scope="session")
+    def report_context():
+        yield MagicMock()
+    """
+)
+
+
+class TestAutouseGate:
+    """`sift_include` / `sift_exclude` markers and the `sift_test_results_autouse` ini gate."""
+
+    def test_default_ini_true_activates(self, pytester: pytest.Pytester) -> None:
+        """Plugin default (ini absent) keeps the autouse fixtures active."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            def test_inner(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_default_ini_false_skips(self, pytester: pytest.Pytester) -> None:
+        """`sift_test_results_autouse = false` makes the autouse fixtures no-op by default."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_sift_include_marker_forces_on(self, pytester: pytest.Pytester) -> None:
+        """`@pytest.mark.sift_include` overrides ini-false to enable the gate."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_inner(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_sift_exclude_marker_forces_off(self, pytester: pytest.Pytester) -> None:
+        """`@pytest.mark.sift_exclude` overrides ini-true to disable the gate."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_exclude
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_exclude_beats_include(self, pytester: pytest.Pytester) -> None:
+        """When both markers are present, `sift_exclude` wins (safer default)."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            @pytest.mark.sift_exclude
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_module_pytestmark_inherits(self, pytester: pytest.Pytester) -> None:
+        """Module-level `pytestmark = pytest.mark.sift_include` covers every test in the module."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            pytestmark = pytest.mark.sift_include
+
+            def test_inner_a(step):
+                assert step is not None
+
+            def test_inner_b(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=2)
+
+    def test_bulk_apply_via_conftest_hook(self, pytester: pytest.Pytester) -> None:
+        """A subtree opts in via `pytest_collection_modifyitems`; siblings stay off.
+
+        Regression test for this repo's wiring pattern: the project default is
+        autouse-off, the integration subtree's conftest bulk-applies
+        `sift_include`, and sibling subtrees remain disabled. Verifies the
+        per-directory mechanism works in a single pytest invocation.
+        """
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_test_results_autouse = false
+            """
+        )
+        included = pytester.mkdir("included_subtree")
+        (included / "conftest.py").write_text(
+            textwrap.dedent(
+                """
+                from pathlib import Path
+
+                import pytest
+
+                _HERE = Path(__file__).parent
+
+
+                def pytest_collection_modifyitems(config, items):
+                    for item in items:
+                        try:
+                            item.path.relative_to(_HERE)
+                        except ValueError:
+                            continue
+                        item.add_marker(pytest.mark.sift_include)
+                """
+            )
+        )
+        (included / "test_included.py").write_text(
+            "def test_included(step):\n    assert step is not None\n"
+        )
+        untouched = pytester.mkdir("untouched_subtree")
+        (untouched / "test_untouched.py").write_text(
+            "def test_untouched(step):\n    assert step is None\n"
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=2)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
new file mode 100644
index 000000000..9ee628e69
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
@@ -0,0 +1,117 @@
+"""Tests for the default ``sift_client`` fixture's credential resolution.
+
+Covers the env-var-then-ini fallback for URIs, the env-only handling of
+``SIFT_API_KEY``, and the error path that names missing credentials.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    import pytest
+
+
+class TestCredentials:
+    """The default ``sift_client`` fixture's resolution of env vars and ini keys."""
+
+    def test_uris_from_ini(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The default sift_client fixture reads URI credentials from ini when env vars are unset."""
+        monkeypatch.setenv("SIFT_API_KEY", "env-key")
+        monkeypatch.delenv("SIFT_GRPC_URI", raising=False)
+        monkeypatch.delenv("SIFT_REST_URI", raising=False)
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            sift_test_results_check_connection = true
+            sift_test_results_log_file = "false"
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_credentials_loaded(sift_client):
+                cfg = sift_client.grpc_client._config
+                assert cfg.api_key == "env-key"
+                assert "ini-grpc:1234" in cfg.uri
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_env_var_overrides_ini_uri(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """When both env var and ini set a URI, the env var wins."""
+        monkeypatch.setenv("SIFT_API_KEY", "env-key")
+        monkeypatch.setenv("SIFT_GRPC_URI", "env-grpc:9999")
+        monkeypatch.delenv("SIFT_REST_URI", raising=False)
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            sift_test_results_check_connection = true
+            sift_test_results_log_file = "false"
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_env_wins(sift_client):
+                assert "env-grpc:9999" in sift_client.grpc_client._config.uri
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_api_key_ignored_from_ini(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """`sift_api_key` is not registered as an ini key; the fixture refuses to use it."""
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            monkeypatch.delenv(name, raising=False)
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_api_key = "should-be-ignored"
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            """
+        )
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "SIFT_API_KEY" in combined, combined
+
+    def test_missing_credentials_named_in_error(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A missing credential aborts with all missing names listed."""
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            monkeypatch.delenv(name, raising=False)
+        write_plugin_conftest()
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            assert name in combined, combined
diff --git a/python/lib/sift_client/_tests/util/conftest.py b/python/lib/sift_client/_tests/util/conftest.py
index 45279cca6..2f371e69e 100644
--- a/python/lib/sift_client/_tests/util/conftest.py
+++ b/python/lib/sift_client/_tests/util/conftest.py
@@ -1,14 +1,35 @@
-import pytest
+from pathlib import Path
 
+import pytest
 
-def pytest_addoption(parser: pytest.Parser) -> None:
-    existing_options = [opt.names() for opt in parser._anonymous.options]
-    # Flatten the list of lists into a single list of strings
-    flat_options = [item for sublist in existing_options for item in sublist]
-    if not any("--sift-test-results-log-file" in name for name in flat_options):
-        parser.addoption("--sift-test-results-log-file", action="store_true", default=False)
+_HERE = Path(__file__).parent
 
 
 def pytest_configure(config: pytest.Config) -> None:
     """Configure the pytest configuration to disable the Sift test results log file."""
     config.option.sift_test_results_log_file = False
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: "list[pytest.Item]") -> None:
+    """Bulk-apply ``@pytest.mark.sift_include`` to integration tests under util/.
+
+    The project-wide default in ``pyproject.toml`` is ``sift_test_results_autouse
+    = false`` so unit tests pay nothing for the globally-loaded Sift plugin.
+    Integration tests in this subtree still need the autouse fixtures, so this
+    hook flips the gate back on for any test already marked
+    ``@pytest.mark.integration``. Unit tests in the same directory (e.g.
+    ``test_cel_utils.py``) are left alone.
+
+    ``pytest_collection_modifyitems`` receives all items in the session (pytest
+    does not auto-scope it to the conftest's directory), so we filter by path
+    explicitly. ``Path.relative_to`` is the 3.8-compatible form of the path
+    containment check (``Path.is_relative_to`` arrived in 3.9).
+    """
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        if item.get_closest_marker("integration") is None:
+            continue
+        item.add_marker(pytest.mark.sift_include)
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
new file mode 100644
index 000000000..f2699a954
--- /dev/null
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -0,0 +1,436 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.sift_types.test_report import TestStatus
+from sift_client.util.test_results import ReportContext
+
+if TYPE_CHECKING:
+    from sift_client.util.test_results.context_manager import NewStep
+
+REPORT_CONTEXT: ReportContext | None = None
+
+
+@dataclass(frozen=True)
+class _Option:
+    """A single Sift plugin setting, registered as a CLI flag and/or an ini key.
+
+    ``ini_name`` is used as both the ini key and the CLI ``dest``, so a value
+    set either way lands on the same config slot. ``cli_flag=None`` makes the
+    option ini-only (e.g. the URI fallbacks).
+    """
+
+    ini_name: str
+    ini_help: str
+    cli_flag: str | None = None
+    cli_help: str | None = None
+    action: str | None = None
+    ini_type: str | None = None
+    ini_default: Any = None
+
+
+_LOG_FILE = _Option(
+    cli_flag="--sift-test-results-log-file",
+    ini_name="sift_test_results_log_file",
+    cli_help="Path to write the Sift test result log file. "
+    "Use 'true' (default) to auto-create a temp file, "
+    "False, 'false', or 'none' to disable logging, "
+    "or a file path to write to a specific location.",
+    ini_help="Default value for --sift-test-results-log-file. Same values "
+    "accepted as the CLI flag (path, 'true', 'false', 'none').",
+)
+
+_GIT_METADATA = _Option(
+    cli_flag="--no-sift-test-results-git-metadata",
+    ini_name="sift_test_results_git_metadata",
+    action="store_false",
+    cli_help="Exclude git metadata from the Sift test results. "
+    "Git metadata (repo, branch, commit) is included by default.",
+    ini_help="Include git repo/branch/commit in the report (true/false). "
+    "Defaults to true. The --no-sift-test-results-git-metadata CLI flag "
+    "overrides this when passed.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_CHECK_CONNECTION = _Option(
+    cli_flag="--sift-test-results-check-connection",
+    ini_name="sift_test_results_check_connection",
+    action="store_true",
+    cli_help="Skip the sift test-result fixtures (report_context, step, module_substep) "
+    "when the Sift client has no connection to the server. Requires a "
+    "`client_has_connection` fixture to be available in the test session.",
+    ini_help="When true, skip the sift test-result fixtures if the client has "
+    "no connection (same effect as --sift-test-results-check-connection). "
+    "Defaults to false.",
+    ini_type="bool",
+    ini_default=False,
+)
+
+_GRPC_URI = _Option(
+    ini_name="sift_grpc_uri",
+    ini_help="Sift gRPC endpoint URI. The default `sift_client` fixture "
+    "prefers the SIFT_GRPC_URI environment variable and falls back to "
+    "this ini value.",
+)
+
+_REST_URI = _Option(
+    ini_name="sift_rest_uri",
+    ini_help="Sift REST endpoint URI. The default `sift_client` fixture "
+    "prefers the SIFT_REST_URI environment variable and falls back to "
+    "this ini value.",
+)
+
+_AUTOUSE = _Option(
+    ini_name="sift_test_results_autouse",
+    ini_help="Default for the Sift autouse fixtures (report_context, step, "
+    "module_substep). When true (default), tests are included unless marked "
+    "with @pytest.mark.sift_exclude. When false, tests are skipped unless "
+    "marked with @pytest.mark.sift_include. Bulk-apply markers in a "
+    "directory's conftest via `pytest_collection_modifyitems`.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_OPTIONS: tuple[_Option, ...] = (
+    _LOG_FILE,
+    _GIT_METADATA,
+    _CHECK_CONNECTION,
+    _GRPC_URI,
+    _REST_URI,
+    _AUTOUSE,
+)
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Register Sift-specific command-line options and ini keys.
+
+    Each option can be set on the command line or under ``[tool.pytest.ini_options]``
+    in ``pyproject.toml`` (or ``[pytest]`` in ``pytest.ini``). CLI values take
+    precedence over ini values, which take precedence over the built-in default.
+    """
+    group = parser.getgroup("sift", description="Sift test results")
+    for opt in _OPTIONS:
+        if opt.cli_flag is not None:
+            cli_kwargs: dict[str, Any] = {
+                "dest": opt.ini_name,
+                "default": None,
+                "help": opt.cli_help,
+            }
+            if opt.action is not None:
+                cli_kwargs["action"] = opt.action
+            group.addoption(opt.cli_flag, **cli_kwargs)
+
+        ini_kwargs: dict[str, Any] = {"help": opt.ini_help, "default": opt.ini_default}
+        if opt.ini_type is not None:
+            ini_kwargs["type"] = opt.ini_type
+        parser.addini(opt.ini_name, **ini_kwargs)
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register the Sift gate markers so they show up in `pytest --markers`."""
+    config.addinivalue_line(
+        "markers",
+        "sift_include: force the Sift autouse fixtures to activate for this test "
+        "regardless of the `sift_test_results_autouse` ini default.",
+    )
+    config.addinivalue_line(
+        "markers",
+        "sift_exclude: force the Sift autouse fixtures to skip this test "
+        "regardless of the `sift_test_results_autouse` ini default.",
+    )
+
+
+def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
+    """Resolve the Sift gate for a node: sift_exclude > sift_include > default.
+
+    `get_closest_marker` walks the node hierarchy upward, so markers applied
+    at any level (function, class, module, package, session) are honored.
+    """
+    if node.get_closest_marker("sift_exclude"):
+        return False
+    if node.get_closest_marker("sift_include"):
+        return True
+    return default
+
+
+def _module_has_included_tests(request: pytest.FixtureRequest, default: bool) -> bool:
+    """True when at least one test in `request`'s module is gated on.
+
+    Used by the module-scoped `module_substep` fixture to decide whether to
+    activate without triggering `report_context` creation for modules where
+    every test is excluded.
+    """
+    module_path = request.path
+    for item in request.session.items:
+        if item.path != module_path:
+            continue
+        if _sift_enabled_for(item, default):
+            return True
+    return False
+
+
+def _option_or_ini(pytestconfig: pytest.Config | None, opt: _Option) -> Any:
+    """Resolve a Sift plugin setting from CLI > ini > None.
+
+    The ``addoption`` registrations use ``default=None`` so we can tell whether
+    the CLI was actually used. When the CLI didn't set a value, fall back to
+    the matching ``addini`` key.
+    """
+    if pytestconfig is None:
+        return None
+    cli = pytestconfig.getoption(opt.ini_name, default=None)
+    if cli is not None:
+        return cli
+    try:
+        return pytestconfig.getini(opt.ini_name)
+    except (KeyError, ValueError):
+        return None
+
+
+def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
+    """Determine log_file value from CLI flag or ini key.
+
+    Three signal types arrive here:
+
+    * ``None`` — unset; nothing was passed on the CLI and the ini key is
+      absent. Treat as the default "use a temp file."
+    * Python ``False`` — an explicit disable, typically set in a conftest via
+      ``config.option.sift_test_results_log_file = False``. Return ``None`` so
+      the rest of the pipeline knows to skip logging entirely.
+    * A string (from CLI or ini) — interpret ``"true"`` / ``"1"`` as the temp
+      file default, ``"false"`` / ``"none"`` as disable, anything else as a
+      file path.
+    """
+    raw = _option_or_ini(pytestconfig, _LOG_FILE)
+    if raw is False:
+        return None
+    if not raw:
+        return True
+    lower = str(raw).lower()
+    if lower in ("true", "1"):
+        return True
+    if lower in ("false", "none"):
+        return None
+    return Path(raw)
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
+    """Capture pytest outcomes so assertion failures and skips land on the Sift step."""
+    outcome = yield
+    report = outcome.get_result()
+    if report.outcome == "skipped":
+        # Skipped tests bypass the autouse `step` fixture, so we record the step manually here.
+        if REPORT_CONTEXT:
+            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
+                new_step.current_step.update({"status": TestStatus.SKIPPED})
+    setattr(item, "rep_" + report.when, call)
+
+
+def _report_context_impl(
+    sift_client: SiftClient,
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config | None = None,
+) -> Generator[ReportContext | None, None, None]:
+    args = request.config.invocation_params.args
+    test_path = Path(args[0]) if args else None
+    if test_path is not None and test_path.exists():
+        base_name = test_path.name
+        test_case: Path | str = test_path
+    else:
+        base_name = "pytest " + " ".join(args) if args else "pytest"
+        test_case = base_name
+    log_file = _resolve_log_file(pytestconfig)
+    git_metadata = _option_or_ini(pytestconfig, _GIT_METADATA)
+    include_git_metadata = True if git_metadata is None else bool(git_metadata)
+    with ReportContext(
+        sift_client,
+        name=f"{base_name} {datetime.now(timezone.utc).isoformat()}",
+        test_case=str(test_case),
+        log_file=log_file,
+        include_git_metadata=include_git_metadata,
+    ) as context:
+        global REPORT_CONTEXT
+        REPORT_CONTEXT = context
+        yield context
+
+
+def _check_connection_enabled(pytestconfig: pytest.Config | None) -> bool:
+    """Return True when the caller opted into the check-connection mode via CLI or ini."""
+    return bool(_option_or_ini(pytestconfig, _CHECK_CONNECTION))
+
+
+def _has_sift_connection(request: pytest.FixtureRequest) -> bool:
+    """Resolve the `client_has_connection` fixture lazily; only called when the check is enabled."""
+    return bool(request.getfixturevalue("client_has_connection"))
+
+
+_CREDENTIAL_KEYS: tuple[tuple[str, _Option | None], ...] = (
+    ("SIFT_API_KEY", None),  # env-only; never read from ini to keep secrets out of source control.
+    ("SIFT_GRPC_URI", _GRPC_URI),
+    ("SIFT_REST_URI", _REST_URI),
+)
+
+
+def _resolve_credential(
+    pytestconfig: pytest.Config | None, env_name: str, opt: _Option | None
+) -> str | None:
+    """Resolve a Sift credential: env var first, then ini key (if registered), else None."""
+    env_value = os.getenv(env_name)
+    if env_value:
+        return env_value
+    if opt is None or pytestconfig is None:
+        return None
+    ini_value = pytestconfig.getini(opt.ini_name)
+    return ini_value if isinstance(ini_value, str) and ini_value else None
+
+
+@pytest.fixture(scope="session")
+def sift_client(pytestconfig: pytest.Config) -> SiftClient:
+    """Default ``SiftClient`` resolved from environment variables and ini keys.
+
+    Each credential is read from its environment variable first. The URIs
+    (``SIFT_GRPC_URI``, ``SIFT_REST_URI``) additionally fall back to the
+    ``sift_grpc_uri`` / ``sift_rest_uri`` ini keys, since they are stable
+    per-org values that are safe to commit. ``SIFT_API_KEY`` is intentionally
+    env-only — use ``pytest-dotenv`` (already a project dependency) to load
+    it from a ``.env`` file kept out of version control.
+
+    Projects that need custom construction (TLS toggles, custom timeouts,
+    etc.) can override this fixture by defining their own ``sift_client``
+    in their ``conftest.py``; pytest fixture resolution prefers the local
+    definition.
+    """
+    resolved = {env: _resolve_credential(pytestconfig, env, opt) for env, opt in _CREDENTIAL_KEYS}
+    missing = [env for env, value in resolved.items() if not value]
+    if missing:
+        raise pytest.UsageError(
+            "Sift credentials missing: "
+            + ", ".join(missing)
+            + ". Set the environment variable(s) — pytest-dotenv loads them "
+            "from a `.env` file automatically — or set the URIs via "
+            "`sift_grpc_uri` / `sift_rest_uri` under `[tool.pytest.ini_options]` "
+            "in pyproject.toml, or override the sift_client fixture in your "
+            "conftest.py."
+        )
+    # `or ""` is unreachable in practice since the `missing` check above guarantees
+    # non-None values
+    return SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key=resolved.get("SIFT_API_KEY") or "",
+            grpc_url=resolved.get("SIFT_GRPC_URI") or "",
+            rest_url=resolved.get("SIFT_REST_URI") or "",
+        )
+    )
+
+
+@pytest.fixture(scope="session")
+def report_context(
+    sift_client: SiftClient, request: pytest.FixtureRequest, pytestconfig: pytest.Config
+) -> Generator[ReportContext | None, None, None]:
+    """Lazy session-scoped Sift ReportContext.
+
+    The fixture is no longer autouse; it's instantiated on the first call to
+    ``request.getfixturevalue("report_context")``, which today happens inside
+    the gated ``step`` and ``module_substep`` fixtures. If every test in the
+    session is excluded via the marker gate, this fixture is never resolved
+    and no ReportContext (and no teardown subprocess) is created.
+
+    The log file destination is controlled by ``--sift-test-results-log-file``.
+    Defaults to a temp file when not set.
+
+    When ``--sift-test-results-check-connection`` is passed, this fixture will
+    yield ``None`` if the Sift client has no connection to the server. That mode
+    requires a ``client_has_connection`` fixture to be available in the session.
+    """
+    if _check_connection_enabled(pytestconfig) and not _has_sift_connection(request):
+        yield None
+        return
+    yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
+
+
+def _step_impl(
+    report_context: ReportContext, request: pytest.FixtureRequest
+) -> Generator[NewStep | None, None, None]:
+    name = str(request.node.name)
+    existing_docstring = request.node.obj.__doc__ or None
+    with report_context.new_step(
+        name=name, description=existing_docstring, assertion_as_fail_not_error=False
+    ) as new_step:
+        yield new_step
+        if hasattr(request.node, "rep_call") and request.node.rep_call.excinfo:
+            new_step.update_step_from_result(
+                request.node.rep_call.excinfo,
+                request.node.rep_call.excinfo.value,
+                request.node.rep_call.excinfo.tb,
+            )
+
+
+@pytest.fixture(autouse=True)
+def step(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+) -> Generator[NewStep | None, None, None]:
+    """Create an outer step for the function when the Sift gate is on.
+
+    Resolves the gate via `_sift_enabled_for(request.node, ini_default)`:
+    `sift_exclude` marker forces off, `sift_include` forces on, otherwise the
+    `sift_test_results_autouse` ini default applies. When on, requests the
+    session `report_context` lazily — the first gated test in the session
+    triggers its creation, subsequent gated tests reuse it.
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _sift_enabled_for(request.node, default):
+        yield None
+        return
+    rc = request.getfixturevalue("report_context")
+    if rc is None:
+        yield None
+        return
+    yield from _step_impl(rc, request)
+
+
+@pytest.fixture(scope="module", autouse=True)
+def module_substep(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+) -> Generator[NewStep | None, None, None]:
+    """Create a per-module step when at least one test in the module is gated on.
+
+    Inspects the module's collected items rather than gating on a single marker,
+    so a module with mixed inclusion/exclusion still produces the module-level
+    step (individual `step` fixtures then decide per-test). When every test in
+    the module is excluded, the substep is skipped without requesting
+    `report_context`.
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _module_has_included_tests(request, default):
+        yield None
+        return
+    rc = request.getfixturevalue("report_context")
+    if rc is None:
+        yield None
+        return
+    yield from _step_impl(rc, request)
+
+
+@pytest.fixture(scope="session")
+def client_has_connection(sift_client):
+    """Check if the SiftClient has a connection to the Sift server.
+
+    Can be used to skip tests that require a connection to the Sift server, and is
+    consulted by the Sift fixtures when ``--sift-test-results-check-connection`` is set.
+    """
+    try:
+        sift_client.ping.ping()
+        return True
+    except Exception:
+        return False
diff --git a/python/lib/sift_client/util/test_results/__init__.py b/python/lib/sift_client/util/test_results/__init__.py
index e7a82866c..ea213056e 100644
--- a/python/lib/sift_client/util/test_results/__init__.py
+++ b/python/lib/sift_client/util/test_results/__init__.py
@@ -49,78 +49,98 @@ def main(self):
     cleanup()
 ```
 
-## Pytest Fixtures
+## Pytest Plugin
 
-The report context and steps can also be accessed in pytest by importing the `report_context` and `step` fixtures.
+The pytest plugin lives at `sift_client.pytest_plugin`. Opt in
+from your `conftest.py`:
 
-### How to use:
-- These fixtures are set to autouse and will automatically create a report and steps for each test function.
-  - If you want each module(file) to be marked as a step w/ each test as a substep, import the `module_substep` fixture as well.
-- The `report_context` fixture requires a fixture `sift_client` returning an `SiftClient` instance to be passed in.
+```python
+# conftest.py
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
 
-Note: FedRAMP users: report_context will log test results to a temp file to avoid API calls during test execution. If this is a shared environment, you can disable logging by passing ``--sift-test-results-log-file=false``.
+By default, every test in the session is recorded in a Sift report: one
+`TestReport` per session, one step per test function (`step`), and one
+parent step per test file (`module_substep`). The plugin also registers a
+default `sift_client` fixture that reads `SIFT_API_KEY`, `SIFT_GRPC_URI`,
+and `SIFT_REST_URI` from the environment. Override it by defining your own
+`sift_client` fixture in your conftest.
 
-#### Configuration
+Note for FedRAMP users: results are buffered to a temp file and uploaded by a
+subprocess at session end (no API calls during the run). Disable the buffer
+entirely with `--sift-test-results-log-file=false` for inline uploads.
 
-Import the `pytest_addoption` function to add configuration options for Test Results to the commandline or add the options to your pyproject.toml file (https://docs.pytest.org/en/stable/reference/customize.html#configuration). If ommitted, will use the default values described below.
+### Controlling which tests produce reports
 
-- Git metadata: Include git metadata (repo, branch, commit) in the test results. Default is True. You can disable it by passing `--no-sift-test-results-git-metadata`.
-- Log file: Write test results to a file. This happens automatically but you can configure specify a specific log file by passing `--sift-test-results-log-file=<path>` or disable logging by passing `--sift-test-results-log-file=false`.
-- Check connection: Pass `--sift-test-results-check-connection` (off by default) to make the `report_context`, `step`, and `module_substep` fixtures no-op when the Sift client has no connection to the server. Requires a `client_has_connection` fixture to be available.
+The autouse fixtures fire for every test by default. To narrow that:
 
-###### Example at top of your test file or in your conftest.py file:
+- Set `sift_test_results_autouse = false` in `pyproject.toml` to flip the
+  project default off, then opt tests back in with the markers below.
+- `@pytest.mark.sift_include` forces reporting on for a test, class, or
+  module. `@pytest.mark.sift_exclude` forces it off. Markers set on enclosing
+  scopes count too, and `sift_exclude` always beats `sift_include`.
+- `pytestmark` at the class or module level inherits to every test in scope.
+- For a whole directory, apply the marker in bulk from that directory's
+  `conftest.py`:
 
 ```python
-import pytest
+# tests/integration/conftest.py
+from pathlib import Path
 
-@pytest.fixture(scope="session")
-def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI", "localhost:50051")
-    rest_url = os.getenv("SIFT_REST_URI", "localhost:8080")
-    api_key = os.getenv("SIFT_API_KEY", "")
+import pytest
 
-    client = SiftClient(api_key=api_key, grpc_url=grpc_url, rest_url=rest_url)
+_HERE = Path(__file__).parent
 
-    return client
 
-from sift_client.util.test_results import *
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
 ```
 
-###### Then in your test file:
+#### Configuration
 
-```python
-# Because step was already imported and set autouse=True, this test will automatically get a step created for it.
-def test_no_includes():
-    assert condition, "Example failure"
-
-# Passing the fixtures to the test function allows you to take measurements or create substeps.
-def test_example(report_context, step):
-    # This will add a measurement to the current step for this function
-    step.measure(name="Example Measurement", value=test_string_value, bounds="expected_string_value")
-
-    with report_context.new_step(name="Example Step") as substep:
-        example_measurement = tlm.read(channel_name)
-        substep.measure(name="Substep Measurement", value=example_measurement, bounds=(min=74.9, max=75.1))
+CLI options registered by the plugin:
+
+- `--sift-test-results-log-file`: Path to write the JSONL log file. `true`
+  (default) auto-creates a temp file. `false` or `none` disables logging.
+  Any other value is treated as a file path.
+- `--no-sift-test-results-git-metadata`: Exclude git metadata (repo, branch,
+  commit) from the test report. Included by default.
+- `--sift-test-results-check-connection`: Make `report_context`, `step`, and
+  `module_substep` no-op when the client has no connection. Requires a
+  `client_has_connection` fixture (the plugin ships a default).
+
+Each option has a matching ini key for per-project configuration under
+``[tool.pytest.ini_options]`` in ``pyproject.toml`` (or ``[pytest]`` in
+``pytest.ini``). CLI flags override ini values. The
+``sift_test_results_autouse`` ini key (bool, default ``true``) sets the
+project-wide default for the gate described above. The default
+``sift_client`` fixture reads ``sift_grpc_uri`` and ``sift_rest_uri`` as
+fallbacks when the corresponding env vars are unset (env vars win when
+both are set). ``SIFT_API_KEY`` is env-only. Load it from a ``.env`` file
+via the ``pytest-dotenv`` plugin or inject it via your CI secret manager.
+
+```toml
+[tool.pytest.ini_options]
+sift_test_results_autouse = false
+sift_test_results_log_file = "false"
+sift_test_results_check_connection = true
+sift_test_results_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
 ```
+
+To disable the plugin for a single run:
+`pytest -p no:sift_client.pytest_plugin`.
 """
 
 from .context_manager import NewStep, ReportContext
-from .pytest_util import (
-    client_has_connection,
-    module_substep,
-    pytest_addoption,
-    pytest_runtest_makereport,
-    report_context,
-    step,
-)
 
 __all__ = [
     "NewStep",
     "ReportContext",
-    "client_has_connection",
-    "module_substep",
-    "pytest_addoption",
-    "pytest_runtest_makereport",
-    "report_context",
-    "step",
 ]
diff --git a/python/lib/sift_client/util/test_results/pytest_util.py b/python/lib/sift_client/util/test_results/pytest_util.py
deleted file mode 100644
index a96a47fb3..000000000
--- a/python/lib/sift_client/util/test_results/pytest_util.py
+++ /dev/null
@@ -1,206 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator
-
-import pytest
-
-from sift_client.sift_types.test_report import TestStatus
-from sift_client.util.test_results import ReportContext
-
-if TYPE_CHECKING:
-    from sift_client.client import SiftClient
-    from sift_client.util.test_results.context_manager import NewStep
-
-REPORT_CONTEXT: ReportContext | None = None
-
-
-def pytest_addoption(parser: pytest.Parser) -> None:
-    """Register Sift-specific command-line options."""
-    parser.addoption(
-        "--sift-test-results-log-file",
-        default=None,
-        help="Path to write the Sift test result log file. "
-        "Use 'true' (default) to auto-create a temp file, "
-        "False, 'false', or 'none' to disable logging, "
-        "or a file path to write to a specific location.",
-    )
-    parser.addoption(
-        "--no-sift-test-results-git-metadata",
-        action="store_false",
-        dest="sift_test_results_git_metadata",
-        default=True,
-        help="Exclude git metadata from the Sift test results. "
-        "Git metadata (repo, branch, commit) is included by default.",
-    )
-    parser.addoption(
-        "--sift-test-results-check-connection",
-        action="store_true",
-        default=False,
-        help="Skip the sift test-result fixtures (report_context, step, module_substep) "
-        "when the Sift client has no connection to the server. Requires a "
-        "`client_has_connection` fixture to be available in the test session.",
-    )
-
-
-def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
-    """Determine log_file value from --sift-test-results-log-file option."""
-    raw = None
-    if pytestconfig is not None:
-        raw = pytestconfig.getoption("--sift-test-results-log-file", default=None)
-    if raw is None:
-        return True
-    lower = str(raw).lower()
-    if lower in ("true", "1"):
-        return True
-    if lower in ("false", "none"):
-        return None
-    return Path(raw)
-
-
-@pytest.hookimpl(tryfirst=True, hookwrapper=True)
-def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """You should import this hook to capture any AssertionErrors that occur during the test. If not included, any assert failures in a test will not automatically fail the step."""
-    outcome = yield
-    report = outcome.get_result()
-    if report.outcome == "skipped":
-        # Skipped steps won't invoke the method/fixtures at all, so we need to manually record a step.
-        if REPORT_CONTEXT:
-            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
-                new_step.current_step.update({"status": TestStatus.SKIPPED})
-    setattr(item, "rep_" + report.when, call)
-
-
-def _report_context_impl(
-    sift_client: SiftClient,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config | None = None,
-) -> Generator[ReportContext | None, None, None]:
-    args = request.config.invocation_params.args
-    test_path = Path(args[0]) if args else None
-    if test_path is not None and test_path.exists():
-        base_name = test_path.name
-        test_case: Path | str = test_path
-    else:
-        base_name = "pytest " + " ".join(args) if args else "pytest"
-        test_case = base_name
-    log_file = _resolve_log_file(pytestconfig)
-    include_git_metadata = (
-        bool(pytestconfig.getoption("sift_test_results_git_metadata", default=True))
-        if pytestconfig
-        else True
-    )
-    with ReportContext(
-        sift_client,
-        name=f"{base_name} {datetime.now(timezone.utc).isoformat()}",
-        test_case=str(test_case),
-        log_file=log_file,
-        include_git_metadata=include_git_metadata,
-    ) as context:
-        # Set a global so we can access this in pytest hooks.
-        global REPORT_CONTEXT
-        REPORT_CONTEXT = context
-        yield context
-
-
-def _check_connection_enabled(pytestconfig: pytest.Config | None) -> bool:
-    """Return True when the caller opted into `--sift-test-results-check-connection`."""
-    if pytestconfig is None:
-        return False
-    return bool(pytestconfig.getoption("sift_test_results_check_connection", default=False))
-
-
-def _has_sift_connection(request: pytest.FixtureRequest) -> bool:
-    """Resolve the `client_has_connection` fixture lazily; only called when the check is enabled."""
-    return bool(request.getfixturevalue("client_has_connection"))
-
-
-@pytest.fixture(scope="session", autouse=True)
-def report_context(
-    sift_client: SiftClient, request: pytest.FixtureRequest, pytestconfig: pytest.Config
-) -> Generator[ReportContext | None, None, None]:
-    """Create a report context for the session.
-
-    The log file destination is controlled by ``--sift-test-results-log-file``.
-    Defaults to a temp file when not set.
-
-    When ``--sift-test-results-check-connection`` is passed, this fixture will no-op
-    (yield None) if the Sift client has no connection to the server. That mode
-    requires a ``client_has_connection`` fixture to be available in the session.
-    """
-    if _check_connection_enabled(pytestconfig) and not _has_sift_connection(request):
-        yield None
-        return
-    yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
-
-
-def _step_impl(
-    report_context: ReportContext, request: pytest.FixtureRequest
-) -> Generator[NewStep | None, None, None]:
-    name = str(request.node.name)
-    existing_docstring = request.node.obj.__doc__ or None
-    with report_context.new_step(
-        name=name, description=existing_docstring, assertion_as_fail_not_error=False
-    ) as new_step:
-        yield new_step
-        if hasattr(request.node, "rep_call") and request.node.rep_call.excinfo:
-            new_step.update_step_from_result(
-                request.node.rep_call.excinfo,
-                request.node.rep_call.excinfo.value,
-                request.node.rep_call.excinfo.tb,
-            )
-
-
-@pytest.fixture(autouse=True)
-def step(
-    report_context: ReportContext | None,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create an outer step for the function.
-
-    No-ops when ``--sift-test-results-check-connection`` is set and the client
-    has no connection (or when the session-scoped ``report_context`` resolved to None).
-    """
-    if report_context is None or (
-        _check_connection_enabled(pytestconfig) and not _has_sift_connection(request)
-    ):
-        yield None
-        return
-    yield from _step_impl(report_context, request)
-
-
-@pytest.fixture(scope="module", autouse=True)
-def module_substep(
-    report_context: ReportContext | None,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create a step per module.
-
-    No-ops when ``--sift-test-results-check-connection`` is set and the client
-    has no connection (or when the session-scoped ``report_context`` resolved to None).
-    """
-    if report_context is None or (
-        _check_connection_enabled(pytestconfig) and not _has_sift_connection(request)
-    ):
-        yield None
-        return
-    yield from _step_impl(report_context, request)
-
-
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client):
-    """Check if the SiftClient has a connection to the Sift server.
-
-    Can be used to skip tests that require a connection to the Sift server, and is
-    consulted by the Sift fixtures when ``--sift-test-results-check-connection`` is set.
-    """
-    has_connection = False
-    try:
-        sift_client.ping.ping()
-        has_connection = True
-    except Exception:
-        has_connection = False
-    return has_connection
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 403c89bf8..79afdf464 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -409,6 +409,15 @@ select = [
 env_files = [
     ".env"
 ]
+# `pytester` is registered globally because pytest 8+ disallows `pytest_plugins`
+# in non-top-level conftests. Only the plugin test suite uses it; activating it
+# globally is harmless since the fixture is opt-in.
+addopts = "-p pytester"
+# The Sift plugin is loaded for the whole project via `python/conftest.py`.
+# The autouse gate defaults to off here so unit tests don't use it. The
+# integration subtree (lib/sift_client/_tests/util/) opts back in via
+# `pytest.mark.sift_include` applied in its conftest.
+sift_test_results_autouse = false
 testpaths = [
     "lib/sift_py",
     "lib/sift_client/_tests",

From 74011c698be25bdf8322c318cdde7ab50542685d Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Thu, 21 May 2026 14:40:46 -0700
Subject: [PATCH 03/19] Python(feat): pytest graceful handling missing
 connection (#569)

---
 python/docs/examples/pytest_plugin.md         | 230 ++++++++---------
 .../low_level_wrappers/test_results.py        |  33 ++-
 python/lib/sift_client/_tests/conftest.py     |  12 +-
 .../_tests/pytest_plugin/conftest.py          |   9 +
 .../pytest_plugin/test_configuration.py       | 106 +++++---
 .../_tests/pytest_plugin/test_credentials.py  |   8 +-
 .../_tests/pytest_plugin/test_disabled.py     | 183 +++++++++++++
 .../_tests/pytest_plugin/test_offline.py      | 135 ++++++++++
 .../_tests/pytest_plugin/test_online.py       | 133 ++++++++++
 .../lib/sift_client/_tests/util/conftest.py   |   4 +-
 .../_tests/util/test_report_context.py        |  95 +++++++
 python/lib/sift_client/client.py              |   5 +
 python/lib/sift_client/pytest_plugin.py       | 243 +++++++++++++-----
 .../lib/sift_client/resources/test_results.py |  23 +-
 .../sift_types/_mixins/simulated.py           |  32 +++
 .../lib/sift_client/sift_types/test_report.py |  13 +-
 .../sift_client/util/test_results/__init__.py |  28 +-
 .../sift_client/util/test_results/bounds.py   | 111 ++++++--
 .../util/test_results/context_manager.py      | 131 ++++++----
 python/pyproject.toml                         |  18 +-
 python/scripts/dev                            |   3 +-
 21 files changed, 1203 insertions(+), 352 deletions(-)
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_offline.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_online.py
 create mode 100644 python/lib/sift_client/_tests/util/test_report_context.py
 create mode 100644 python/lib/sift_client/sift_types/_mixins/simulated.py

diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index 3557dd9c7..2ac298256 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -88,21 +88,22 @@ def sift_client() -> SiftClient:
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
 | `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
 | `module_substep` | fixture (autouse) | module | One step per test file with each function nested as a substep. |
-| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted only when `--sift-test-results-check-connection` is set. |
+| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
 
 ### CLI options
 
 | Flag | Default | Effect |
 |---|---|---|
-| `--sift-test-results-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. |
-| `--no-sift-test-results-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
-| `--sift-test-results-check-connection` | off | Make `report_context`, `step`, and `module_substep` no-op (yield `None`) when `client_has_connection` is `False`. Lets the same suite run locally without a Sift backend. |
+| `--sift-offline` | off (online) | Skip the session-start ping and don't contact Sift. All create/update calls go to the JSONL log file for later replay via `import-test-result-log`. Missing `SIFT_*` env vars are tolerated; placeholders are filled. |
+| `--sift-disabled` | off | Skip Sift entirely. Nothing contacts the API and no log file is written; `step.measure(...)` still evaluates bounds and returns a real pass/fail boolean. Also honored via `SIFT_DISABLED=1`. Supersedes every other flag (disabled wins over offline). |
+| `--sift-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set in online mode, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. Disabling the file is incompatible with `--sift-offline`, since offline mode needs the log file as its sole sink. |
+| `--no-sift-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
 
 These can be passed permanently via `addopts`:
 
 ```ini title="pytest.ini"
 [pytest]
-addopts = --sift-test-results-check-connection
+addopts = --sift-offline
 ```
 
 Or set the matching ini key directly (recommended for stable per-project
@@ -112,10 +113,11 @@ CLI flags, when passed, override the ini values.
 
 | Ini key | Type | Equivalent CLI flag |
 |---|---|---|
-| `sift_test_results_log_file` | string (`true` / `false` / `none` / path) | `--sift-test-results-log-file=<value>` |
-| `sift_test_results_git_metadata` | bool (default `true`) | `--no-sift-test-results-git-metadata` (sets to `false`) |
-| `sift_test_results_check_connection` | bool (default `false`) | `--sift-test-results-check-connection` |
-| `sift_test_results_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
+| `sift_log_file` | string (`true` / `false` / `none` / path) | `--sift-log-file=<value>` |
+| `sift_git_metadata` | bool (default `true`) | `--no-sift-git-metadata` (sets to `false`) |
+| `sift_offline` | bool (default `false`) | `--sift-offline` |
+| `sift_disabled` | bool (default `false`) | `--sift-disabled` (also honors `SIFT_DISABLED` env var) |
+| `sift_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
 
 The default `sift_client` fixture reads its two URIs from environment first
 and falls back to ini keys when the env vars are unset. `SIFT_API_KEY` is
@@ -133,18 +135,16 @@ flags for credentials.
 
 ```toml title="pyproject.toml"
 [tool.pytest.ini_options]
-sift_test_results_check_connection = true
-sift_test_results_log_file = "false"
-sift_test_results_git_metadata = false
+sift_offline = true
+sift_git_metadata = false
 sift_grpc_uri = "your-org.sift.example:443"
 sift_rest_uri = "https://your-org.sift.example"
 ```
 
 ```ini title="pytest.ini"
 [pytest]
-sift_test_results_check_connection = true
-sift_test_results_log_file = false
-sift_test_results_git_metadata = false
+sift_offline = true
+sift_git_metadata = false
 sift_grpc_uri = your-org.sift.example:443
 sift_rest_uri = https://your-org.sift.example
 ```
@@ -171,7 +171,7 @@ into `os.environ` before tests run.
 glue is needed.
 
 !!! warning "FedRAMP / shared environments"
-    Pass `--sift-test-results-log-file=false` (or set the ini key to `"false"`)
+    Pass `--sift-log-file=false` (or set the ini key to `"false"`)
     to skip the temp file + worker pipeline. Create/update calls then run
     inline against the API instead of being deferred through a subprocess.
 
@@ -184,7 +184,7 @@ Every report the plugin creates includes:
 - `system_operator`: `getpass.getuser()`.
 - `start_time` / `end_time`: set on session enter/exit.
 - `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
-- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-test-results-git-metadata` or when not in a git repo.
+- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-git-metadata` or when not in a git repo.
 
 Example invocations:
 
@@ -207,7 +207,7 @@ useful when a repo holds tests that you don't want included in the Sift test rep
 
 | Setting                                                 | Effect                                                                                       |
 |---------------------------------------------------------|----------------------------------------------------------------------------------------------|
-| `sift_test_results_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
+| `sift_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
 | `@pytest.mark.sift_include` on a test, class, or module | Force reporting on for that scope, regardless of the project default.                        |
 | `@pytest.mark.sift_exclude` on a test, class, or module | Force reporting off for that scope, regardless of the project default.                       |
 
@@ -237,7 +237,7 @@ def pytest_collection_modifyitems(config, items):
 ```
 
 This applies `sift_include` to every test collected under `tests/example/`.
-Combine with `sift_test_results_autouse = false` in `pyproject.toml` for
+Combine with `sift_autouse = false` in `pyproject.toml` for
 opting in to specific directories. 
 
 `pytest_collection_modifyitems` receives every item in the session, not just
@@ -657,151 +657,129 @@ The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
 pytest
 
 # Pin the log file so you can replay it later if the import worker dies
-pytest --sift-test-results-log-file=./sift-results.jsonl
+pytest --sift-log-file=./sift-results.jsonl
 ```
 
-See [Running offline](#running-offline) for the same suite running with or
-without a reachable Sift server.
+See [Running modes](#running-modes) for the offline and disabled flags,
+which let the same suite run without contacting Sift or without Sift at all.
 
-## Running offline
+## Running modes
 
-The plugin supports two offline workflows, depending on whether you want a
-Sift report at all when the test environment can't reach Sift. The first
-turns the plugin into a no-op when the server is unreachable. The second
-keeps the plugin running normally and writes every create/update to a local
-JSONL file that you upload from a connected machine afterward.
+The plugin runs in one of three modes, picked at invocation:
 
-| Pattern | Flag | Runtime behavior | Follow-up |
-|---|---|---|---|
-| Skip when offline | `--sift-test-results-check-connection` | Fixtures yield `None`, no log file, no report. Pytest still reports pass/fail. | None. |
-| Capture locally, upload later | `--sift-test-results-log-file=<path>` | Plugin writes every create/update to the JSONL file. | `import-test-result-log <path>` from a connected machine. |
+| Mode | Flag | Network | Log file | `step.measure(...)` | When to use |
+|---|---|---|---|---|---|
+| Online (default) | _(none)_ | yes (pings at session start, aborts if it fails) | optional write-through backup | real measurement against Sift | CI with Sift credentials, local dev hitting your tenant |
+| Offline | `--sift-offline` | none | required (the sole sink) | real measurement queued to log | field tests, air-gapped labs, CI without network |
+| Disabled | `--sift-disabled` | none | none | bounds eval; returns a real bool | local dev or CI that doesn't have (or want) Sift |
 
-Pattern 1 suits laptop dev and CI without Sift secrets. Pattern 2 suits
-field tests, vehicles on remote sites, and air-gapped labs.
+Pass both flags? Disabled wins. It's the "skip Sift entirely" hammer and
+supersedes everything else.
 
-### Pattern 1: skip when offline
+### Online mode (default)
 
-`--sift-test-results-check-connection` makes the plugin ping Sift once at
-session start through the `client_has_connection` fixture (which by default
-calls `sift_client.ping.ping()`). On a failed ping, `report_context`,
-`step`, and `module_substep` yield `None` for the rest of the session.
-Pytest still runs the tests and still reports pass/fail.
+`report_context` resolves `client_has_connection` at session start. The
+default implementation calls `sift_client.ping.ping()`. A failed ping
+aborts the whole session with `pytest.UsageError` and points at
+`--sift-offline` and `--sift-disabled` as escape hatches.
 
-```bash
-pytest --sift-test-results-check-connection
-```
+This is loud on purpose. A CI run that silently no-ops on a flaky network
+won't get noticed until somebody goes looking for the report, which is
+usually weeks later, which is usually too late.
 
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-test-results-check-connection
-```
+With the default `--sift-log-file` setting on, create/update calls are
+written to a JSONL log file during the run and an
+`import-test-result-log --incremental` worker replays them against Sift
+in the background. If the worker crashes mid-session (connection failure,
+API error) or is still draining its backlog at session end, the failure
+is logged at session end with an `import-test-result-log` command for
+manual recovery — test outcomes are unaffected and the local log file is
+preserved. Pass `--sift-log-file=false` to make every create/update
+synchronous against the API instead.
 
-#### Handling `None` in tests
+#### Overriding the connection check
 
-Calls on `step` raise `AttributeError` when it's `None`, so tests that take
-`step` as a parameter need a guard. The cleanest fix is to shadow the
-plugin's `step` fixture in your conftest and turn the `None` case into an
-automatic skip.
+Override `client_has_connection` when ping isn't the right signal, for
+example a token cache that's only warm when authenticated:
 
 ```python title="conftest.py"
-import pytest
+from pathlib import Path
 
-pytest_plugins = ["sift_client.pytest_plugin"]
+import pytest
 
 
-@pytest.fixture(autouse=True)
-def step(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    yield step
+@pytest.fixture(scope="session")
+def client_has_connection(sift_client) -> bool:
+    return Path("~/.sift-token-cache").expanduser().is_file()
 ```
 
-The `step` parameter on the override resolves to the plugin's fixture, not
-to the override itself. `autouse=True` is required so the skip applies to
-tests that don't request `step` directly. The same shadowing trick works
-for `module_substep` and `report_context`.
+The override is ignored under `--sift-offline` and `--sift-disabled`.
 
-For one-off tests that don't share a conftest, an inline guard works just
-as well:
+### Offline mode (`--sift-offline`)
 
-```python
-def test_battery_voltage(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    step.measure(name="battery_voltage", value=4.97, bounds={"min": 4.8, "max": 5.2})
-```
+Same fixtures, same `step.measure(...)` semantics as online. The
+difference is where the writes go: every create/update lands in a JSONL
+log file instead of hitting the Sift API. The session-start ping is
+skipped, missing `SIFT_*` env vars are tolerated (placeholders are
+filled), and the replay worker (`import-test-result-log --incremental`)
+does not get spawned at session end.
 
-If you'd rather have tests pass through silently than skip them, wrap the
-calls in a helper that no-ops on `None`:
-
-```python
-def safe_measure(step, **kwargs):
-    if step is None:
-        return True
-    return step.measure(**kwargs)
+```bash
+pytest --sift-offline --sift-log-file=./run.jsonl
 ```
 
-#### Overriding the connection check
+Once you have connectivity, replay it:
 
-The default `client_has_connection` fixture calls `sift_client.ping.ping()`.
-Override it in your conftest if pinging is the wrong signal for your
-environment, for example a token cache that's only warm when authenticated:
+```bash
+import-test-result-log ./run.jsonl
+```
 
-```python title="conftest.py"
-from pathlib import Path
+That replay creates the report, steps, and measurements against Sift.
+See [Replaying a saved log file](#replaying-a-saved-log-file) for cleanup
+and the incremental flag.
 
-import pytest
+`--sift-log-file=false` (or `none`) is rejected when `--sift-offline` is
+set. The log file is the only sink in offline mode, so without it the
+results would be lost.
 
+!!! warning "Pin the log path"
+    Without `--sift-log-file=<path>`, offline mode writes to
+    a `tempfile.NamedTemporaryFile` and only surfaces the path via a
+    `logger.info` line. Pin a known path when you intend to replay later.
 
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client) -> bool:
-    return Path("~/.sift-token-cache").expanduser().is_file()
-```
+### Disabled mode (`--sift-disabled`)
 
-The plugin only consults this fixture when `--sift-test-results-check-connection`
-is set, so an unused override has no effect on a normal run.
+The plugin stays loaded with the same fixtures and markers as the other
+modes. Nothing contacts Sift, no log file is written, and no `SIFT_*`
+env vars are required. `step.measure(...)`, `step.measure_avg(...)`,
+`step.measure_all(...)`, `step.substep(...)`, and
+`report_context.report.update({...})` all behave normally — bounds
+evaluate and you get a real pass/fail boolean back.
 
-### Pattern 2: capture locally, upload later
+Entities returned in disabled mode report `is_simulated == True` (on
+`TestReport`, `TestStep`, `TestMeasurement`, and `ReportContext`) so
+consumers and tests can branch on provenance. Offline-mode entities
+also report `is_simulated == True`.
 
-This pattern keeps the plugin running normally even when Sift is
-unreachable. The plugin writes to the log file, the worker dies on connect,
-and the file is left on disk for you to upload later. Pin the log file path
-so you can find it afterward, and don't pass
-`--sift-test-results-check-connection`, which would suppress the logging
-this pattern relies on.
+How to turn it on, in the order most projects pick:
 
 ```bash
-pytest --sift-test-results-log-file=./run.jsonl
-```
-
-What happens during the run:
+# In an .envrc, devcontainer, or CI job config
+export SIFT_DISABLED=1
 
-- Every report, step, and measurement create/update is written to
-  `run.jsonl`. The plugin doesn't contact the Sift API for any of these
-  calls; they return simulated responses keyed by UUIDs that the replay
-  later maps to real IDs.
-- The `import-test-result-log --incremental` worker subprocess starts and
-  exits early when it can't reach Sift. The session does not fail when the
-  worker exits before the run ends.
-- Tests run against a real `step` fixture, so `step.measure(...)`,
-  substeps, parametrize, fixtures, and `module_substep` behave exactly as
-  they do online. No conftest changes are needed.
+# Per-invocation kill-switch
+pytest --sift-disabled
 
-Once you have connectivity, replay the file:
-
-```bash
-import-test-result-log ./run.jsonl
+# Per-project default (uncommon; online is usually the right default)
+# pyproject.toml:
+#   [tool.pytest.ini_options]
+#   sift_disabled = true
 ```
 
-The replay creates the report, steps, and measurements against Sift in one
-batch. See [Replaying a saved log file](#replaying-a-saved-log-file) for
-details on cleanup and the incremental flag.
-
-!!! warning "Pin the log path for Pattern 2"
-    Without `--sift-test-results-log-file=<path>`, the plugin writes to a
-    `tempfile.NamedTemporaryFile` and only surfaces the path via a
-    `logger.info` line. Always pin a known path when you intend to replay
-    the file later.
+This mode is a good fit for local dev without Sift credentials and for
+library consumers who don't have a Sift tenant. It is also useful in CI for
+runs that shouldn't add noise to the report stream, like a PR job
+re-running the same suite five times in a row.
 
 ## Replaying a saved log file
 
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
index d15f86c48..ff0c2b515 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
@@ -3,7 +3,7 @@
 import logging
 import uuid
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 from google.protobuf import json_format
 from sift.test_reports.v1.test_reports_pb2 import (
@@ -68,6 +68,9 @@
 logger = logging.getLogger(__name__)
 
 
+_EntityT = TypeVar("_EntityT", TestReport, TestStep, TestMeasurement)
+
+
 class TestResultsLowLevelClient(LowLevelClientBase, WithGrpcClient):
     """Low-level client for the TestResultsAPI.
 
@@ -82,6 +85,16 @@ def __init__(self, grpc_client: GrpcClient):
         """
         super().__init__(grpc_client)
 
+    @staticmethod
+    def _mark_simulated(instance: _EntityT) -> _EntityT:
+        """Stamp an entity as having been produced by the simulate path.
+
+        Mirrors the ``__dict__`` write used by ``BaseType._apply_client_to_instance``
+        to bypass pydantic's frozen-model guard.
+        """
+        instance.__dict__["_simulated"] = True
+        return instance
+
     @staticmethod
     def simulate_create_test_report_response(
         request: CreateTestReportRequest,
@@ -387,7 +400,7 @@ async def create_test_report(
                     request,
                     response_id=simulated_proto.test_report_id,
                 )
-            return TestReport._from_proto(simulated_proto)
+            return self._mark_simulated(TestReport._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestReport(request)
         grpc_test_report = cast("CreateTestReportResponse", response).test_report
@@ -505,7 +518,9 @@ async def update_test_report(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestReport", request)
-            return self.simulate_update_test_report_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_report_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestReport(request)
         grpc_test_report = cast("UpdateTestReportResponse", response).test_report
@@ -560,7 +575,7 @@ async def create_test_step(
                     request,
                     response_id=simulated_proto.test_step_id,
                 )
-            return TestStep._from_proto(simulated_proto)
+            return self._mark_simulated(TestStep._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestStep(request)
         grpc_test_step = cast("CreateTestStepResponse", response).test_step
@@ -661,7 +676,9 @@ async def update_test_step(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestStep", request)
-            return self.simulate_update_test_step_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_step_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestStep(request)
         grpc_test_step = cast("UpdateTestStepResponse", response).test_step
@@ -716,7 +733,7 @@ async def create_test_measurement(
                     request,
                     response_id=simulated_proto.measurement_id,
                 )
-            return TestMeasurement._from_proto(simulated_proto)
+            return self._mark_simulated(TestMeasurement._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestMeasurement(
             request
@@ -861,7 +878,9 @@ async def update_test_measurement(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestMeasurement", request)
-            return self.simulate_update_test_measurement_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_measurement_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestMeasurement(
             request
diff --git a/python/lib/sift_client/_tests/conftest.py b/python/lib/sift_client/_tests/conftest.py
index 79b079d39..0b939ae39 100644
--- a/python/lib/sift_client/_tests/conftest.py
+++ b/python/lib/sift_client/_tests/conftest.py
@@ -79,5 +79,13 @@ def ci_pytest_tag(sift_client):
 
 
 def pytest_configure(config: pytest.Config) -> None:
-    """Enable the Sift connection-check mode for the fixtures used in this test suite since we run w/ mock client in non-integration tests."""
-    config.option.sift_test_results_check_connection = True
+    """Pick a Sift plugin mode based on whether integration tests are running.
+
+    Integration runs (``-m integration``) stay online with the default log-file
+    pipeline so CI exercises the JSONL write + import worker replay path that
+    production users hit. Every other run, including negated selections such as
+    ``-m "not integration"``, defaults to ``--sift-disabled`` (no credentials).
+    """
+    markexpr = config.option.markexpr or ""
+    if "integration" not in markexpr or "not integration" in markexpr:
+        config.option.sift_disabled = True
diff --git a/python/lib/sift_client/_tests/pytest_plugin/conftest.py b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
index 1fbd61e46..783a12bf4 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/conftest.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
@@ -25,6 +25,15 @@
 
 import pytest
 
+_SIFT_ENV_VARS = ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI", "SIFT_DISABLED")
+
+
+@pytest.fixture
+def clear_sift_env(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Unset every variable listed in ``_SIFT_ENV_VARS`` for the duration of the test."""
+    for name in _SIFT_ENV_VARS:
+        monkeypatch.delenv(name, raising=False)
+
 
 @pytest.fixture
 def write_plugin_conftest(pytester: pytest.Pytester) -> Callable[[], None]:
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
index 9b9be2d63..4efb9f554 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
@@ -34,7 +34,7 @@ def test_ini_log_file_none(
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_log_file = "none"
+            sift_log_file = "none"
             """
         )
         pytester.makepyfile("def test_noop(): pass")
@@ -46,7 +46,7 @@ def test_python_false_disables_log_file(
         pytester: pytest.Pytester,
         write_probe_conftest: Callable[[str], None],
     ) -> None:
-        """`config.option.sift_test_results_log_file = False` disables logging.
+        """`config.option.sift_log_file = False` disables logging.
 
         Conftests use this pattern (see lib/sift_client/_tests/util/conftest.py)
         to opt their subtree out of log-file mode. Regression test for the
@@ -55,7 +55,7 @@ def test_python_false_disables_log_file(
         """
         write_probe_conftest(
             """
-            config.option.sift_test_results_log_file = False
+            config.option.sift_log_file = False
             from sift_client.pytest_plugin import _resolve_log_file
             print("RESOLVED:", _resolve_log_file(config))
             """,
@@ -80,33 +80,54 @@ def test_ini_log_file_path(
         pytester.makepyprojecttoml(
             f"""
             [tool.pytest.ini_options]
-            sift_test_results_log_file = "{log_path}"
+            sift_log_file = "{log_path}"
             """
         )
         pytester.makepyfile("def test_noop(): pass")
         result = pytester.runpytest_subprocess("-s", "--co")
         result.stdout.fnmatch_lines([f"RESOLVED: {log_path}"])
 
-    def test_ini_check_connection_true(
+    def test_ini_offline_true(
         self,
         pytester: pytest.Pytester,
         write_probe_conftest: Callable[[str], None],
     ) -> None:
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _check_connection_enabled
-            print("CHECK:", _check_connection_enabled(config))
+            from sift_client.pytest_plugin import _is_offline
+            print("OFFLINE:", _is_offline(config))
             """,
         )
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_check_connection = true
+            sift_offline = true
             """
         )
         pytester.makepyfile("def test_noop(): pass")
         result = pytester.runpytest_subprocess("-s", "--co")
-        result.stdout.fnmatch_lines(["CHECK: True"])
+        result.stdout.fnmatch_lines(["OFFLINE: True"])
+
+    def test_ini_disabled_true(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _is_disabled
+            print("DISABLED:", _is_disabled(config))
+            """,
+        )
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_disabled = true
+            """
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co")
+        result.stdout.fnmatch_lines(["DISABLED: True"])
 
     def test_ini_git_metadata_false(
         self,
@@ -115,13 +136,13 @@ def test_ini_git_metadata_false(
     ) -> None:
         write_probe_conftest(
             """
-            print("INI_GIT:", config.getini("sift_test_results_git_metadata"))
+            print("INI_GIT:", config.getini("sift_git_metadata"))
             """,
         )
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_git_metadata = false
+            sift_git_metadata = false
             """
         )
         pytester.makepyfile("def test_noop(): pass")
@@ -145,37 +166,51 @@ def test_cli_overrides_ini(
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_log_file = "none"
+            sift_log_file = "none"
             """
         )
         pytester.makepyfile("def test_noop(): pass")
-        result = pytester.runpytest_subprocess(
-            "-s", "--co", f"--sift-test-results-log-file={cli_path}"
-        )
+        result = pytester.runpytest_subprocess("-s", "--co", f"--sift-log-file={cli_path}")
         result.stdout.fnmatch_lines([f"RESOLVED: {cli_path}"])
 
-    def test_cli_check_connection_flag(
+    def test_cli_offline_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """The ``--sift-offline`` CLI flag flips the resolver to True."""
+        write_probe_conftest(
+            """
+            from sift_client.pytest_plugin import _is_offline
+            print("OFFLINE:", _is_offline(config))
+            """,
+        )
+        pytester.makepyfile("def test_noop(): pass")
+        result = pytester.runpytest_subprocess("-s", "--co", "--sift-offline")
+        result.stdout.fnmatch_lines(["OFFLINE: True"])
+
+    def test_cli_disabled_flag(
         self,
         pytester: pytest.Pytester,
         write_probe_conftest: Callable[[str], None],
     ) -> None:
-        """The ``--sift-test-results-check-connection`` CLI flag flips the resolver to True."""
+        """The ``--sift-disabled`` CLI flag flips the resolver to True."""
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _check_connection_enabled
-            print("CHECK:", _check_connection_enabled(config))
+            from sift_client.pytest_plugin import _is_disabled
+            print("DISABLED:", _is_disabled(config))
             """,
         )
         pytester.makepyfile("def test_noop(): pass")
-        result = pytester.runpytest_subprocess("-s", "--co", "--sift-test-results-check-connection")
-        result.stdout.fnmatch_lines(["CHECK: True"])
+        result = pytester.runpytest_subprocess("-s", "--co", "--sift-disabled")
+        result.stdout.fnmatch_lines(["DISABLED: True"])
 
     def test_cli_no_git_metadata_flag(
         self,
         pytester: pytest.Pytester,
         write_probe_conftest: Callable[[str], None],
     ) -> None:
-        """The ``--no-sift-test-results-git-metadata`` CLI flag flips git_metadata to False.
+        """The ``--no-sift-git-metadata`` CLI flag flips git_metadata to False.
 
         Guards the negation flag's ``dest`` binding: the flag name doesn't match
         the ini key, so a broken ``dest`` would silently fall back to the ini
@@ -183,11 +218,11 @@ def test_cli_no_git_metadata_flag(
         """
         write_probe_conftest(
             """
-            print("CLI_GIT:", config.getoption("sift_test_results_git_metadata"))
+            print("CLI_GIT:", config.getoption("sift_git_metadata"))
             """,
         )
         pytester.makepyfile("def test_noop(): pass")
-        result = pytester.runpytest_subprocess("-s", "--co", "--no-sift-test-results-git-metadata")
+        result = pytester.runpytest_subprocess("-s", "--co", "--no-sift-git-metadata")
         result.stdout.fnmatch_lines(["CLI_GIT: False"])
 
     def test_defaults_when_neither_set(
@@ -198,12 +233,14 @@ def test_defaults_when_neither_set(
         write_probe_conftest(
             """
             from sift_client.pytest_plugin import (
-                _check_connection_enabled,
+                _is_disabled,
+                _is_offline,
                 _resolve_log_file,
             )
             print("RESOLVED:", _resolve_log_file(config))
-            print("CHECK:", _check_connection_enabled(config))
-            print("INI_GIT:", config.getini("sift_test_results_git_metadata"))
+            print("OFFLINE:", _is_offline(config))
+            print("DISABLED:", _is_disabled(config))
+            print("INI_GIT:", config.getini("sift_git_metadata"))
             """,
         )
         pytester.makepyfile("def test_noop(): pass")
@@ -211,7 +248,8 @@ def test_defaults_when_neither_set(
         result.stdout.fnmatch_lines(
             [
                 "RESOLVED: True",
-                "CHECK: False",
+                "OFFLINE: False",
+                "DISABLED: False",
                 "INI_GIT: True",
             ]
         )
@@ -238,7 +276,7 @@ def report_context():
 
 
 class TestAutouseGate:
-    """`sift_include` / `sift_exclude` markers and the `sift_test_results_autouse` ini gate."""
+    """`sift_include` / `sift_exclude` markers and the `sift_autouse` ini gate."""
 
     def test_default_ini_true_activates(self, pytester: pytest.Pytester) -> None:
         """Plugin default (ini absent) keeps the autouse fixtures active."""
@@ -253,12 +291,12 @@ def test_inner(step):
         result.assert_outcomes(passed=1)
 
     def test_default_ini_false_skips(self, pytester: pytest.Pytester) -> None:
-        """`sift_test_results_autouse = false` makes the autouse fixtures no-op by default."""
+        """`sift_autouse = false` makes the autouse fixtures no-op by default."""
         pytester.makeconftest(_GATE_INNER_CONFTEST)
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_autouse = false
+            sift_autouse = false
             """
         )
         pytester.makepyfile(
@@ -276,7 +314,7 @@ def test_sift_include_marker_forces_on(self, pytester: pytest.Pytester) -> None:
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_autouse = false
+            sift_autouse = false
             """
         )
         pytester.makepyfile(
@@ -328,7 +366,7 @@ def test_module_pytestmark_inherits(self, pytester: pytest.Pytester) -> None:
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_autouse = false
+            sift_autouse = false
             """
         )
         pytester.makepyfile(
@@ -359,7 +397,7 @@ def test_bulk_apply_via_conftest_hook(self, pytester: pytest.Pytester) -> None:
         pytester.makepyprojecttoml(
             """
             [tool.pytest.ini_options]
-            sift_test_results_autouse = false
+            sift_autouse = false
             """
         )
         included = pytester.mkdir("included_subtree")
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
index 9ee628e69..3f6d22a6e 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
@@ -31,8 +31,8 @@ def test_uris_from_ini(
             [tool.pytest.ini_options]
             sift_grpc_uri = "ini-grpc:1234"
             sift_rest_uri = "https://ini-rest"
-            sift_test_results_check_connection = true
-            sift_test_results_log_file = "false"
+            sift_offline = true
+
             """
         )
         pytester.makepyfile(
@@ -62,8 +62,8 @@ def test_env_var_overrides_ini_uri(
             [tool.pytest.ini_options]
             sift_grpc_uri = "ini-grpc:1234"
             sift_rest_uri = "https://ini-rest"
-            sift_test_results_check_connection = true
-            sift_test_results_log_file = "false"
+            sift_offline = true
+
             """
         )
         pytester.makepyfile(
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
new file mode 100644
index 000000000..cba4bc1ee
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
@@ -0,0 +1,183 @@
+"""Tests for ``--sift-disabled`` mode.
+
+Disabled mode skips Sift entirely. Autouse fixtures yield simulate-backed
+objects so test code that calls ``step.measure(...)`` keeps working without
+any Sift configuration; ``measure*`` evaluates bounds locally and returns the
+real pass/fail boolean. Nothing reaches Sift and no log file is written.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestDisabledMode:
+    def test_in_bounds_passes_out_of_bounds_fails(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Stub measure* evaluates bounds locally; pass/fail matches the real plugin."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_passes_in_bounds(step):
+                assert step.measure(name="v", value=5.0, bounds={"min": 4.8, "max": 5.2})
+
+            def test_fails_out_of_bounds(step):
+                assert step.measure(name="v", value=99.0, bounds={"max": 5.2}) is False
+
+            def test_substep_and_report_outcome(step):
+                with step.substep(name="inner") as inner:
+                    assert inner.report_outcome(name="ok", result=True) is True
+
+            def test_string_bounds(step):
+                assert step.measure(name="fw", value="1.0", bounds="1.0") is True
+                assert step.measure(name="fw", value="1.0", bounds="2.0") is False
+
+            def test_measure_avg(step):
+                assert step.measure_avg(
+                    name="bus", values=[4.97, 5.01, 5.03], bounds={"min": 4.9, "max": 5.1}
+                ) is True
+
+            def test_measure_all_outlier(step):
+                assert step.measure_all(
+                    name="p", values=[10.1, 10.2, 99.9], bounds={"max": 11.0}
+                ) is False
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=6)
+
+    def test_disabled_does_not_require_credentials(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Disabled mode needs no SIFT_* credential env vars; runs cleanly without them."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_via_env_var(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+        monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        """``SIFT_DISABLED=1`` triggers disabled mode without the CLI flag."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        monkeypatch.setenv("SIFT_DISABLED", "1")
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_supersedes_offline(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``--sift-disabled`` wins when combined with ``--sift-offline``.
+
+        Disabled is the "skip Sift entirely" hammer; passing it alongside
+        offline shouldn't error. The session runs without credentials, without
+        a log file, and without the offline-mode replay machinery.
+        """
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_runs(step):
+                assert step.measure(name="v", value=5.0, bounds={"max": 10.0}) is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled", "--sift-offline")
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_yields_stub_fixtures(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """`report_context` / `step` / `module_substep` are real instances backed by a simulate client."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            from sift_client.util.test_results import ReportContext
+            from sift_client.util.test_results.context_manager import NewStep
+
+            def test_types(step, report_context, module_substep):
+                assert isinstance(report_context, ReportContext)
+                assert report_context.is_simulated is True
+                assert report_context.report.is_simulated is True
+                assert step.current_step.is_simulated is True
+                assert isinstance(step, NewStep)
+                assert isinstance(module_substep, NewStep)
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_writes_no_log_file_even_when_path_pinned(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Disabled mode skips the log-file pipeline even when a path is pinned."""
+        log_path = tmp_path / "should-not-exist.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        assert not log_path.exists(), f"log file unexpectedly created at {log_path}"
+
+    def test_disabled_skips_client_has_connection_and_sift_client(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Disabled mode never resolves ``client_has_connection`` or ``sift_client``.
+
+        The plugin's ``report_context`` short-circuits to the stub before
+        consulting either fixture. Overrides that raise on resolution stay
+        un-triggered, so the inner test passes cleanly.
+        """
+        pytester.makeconftest(
+            """
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                raise AssertionError("sift_client should not resolve in disabled mode")
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                raise AssertionError(
+                    "client_has_connection should not resolve in disabled mode"
+                )
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_runs(step):
+                assert step.measure(name="v", value=5.0, bounds={"max": 10.0}) is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_offline.py b/python/lib/sift_client/_tests/pytest_plugin/test_offline.py
new file mode 100644
index 000000000..f0470bad3
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_offline.py
@@ -0,0 +1,135 @@
+"""Tests for ``--sift-offline`` mode.
+
+Offline mode routes every create/update through the JSONL log file without
+contacting Sift. The session-start ping is skipped, the import worker is not
+spawned, and missing ``SIFT_*`` env vars are tolerated (placeholders are
+filled). Offline + ``--sift-log-file=none`` is rejected as a
+usage error since the log file is the sole sink in this mode.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestOfflineMode:
+    def test_offline_runs_without_network(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode constructs the client locally and never pings."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_in_bounds(step):
+                assert step.measure(name="v", value=5.0, bounds={"min": 4.8, "max": 5.2})
+
+            def test_out_of_bounds(step):
+                assert step.measure(name="v", value=10.0, bounds={"max": 5.2}) is False
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-offline")
+        result.assert_outcomes(passed=2)
+
+    def test_log_file_none_incompatible_with_offline(
+        self,
+        pytester: pytest.Pytester,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``--sift-log-file=none`` + ``--sift-offline`` is a usage error."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", "--sift-log-file=none")
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "incompatible with --sift-offline" in combined, combined
+
+    def test_offline_yields_real_fixtures(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode runs a real ReportContext; entities still report `is_simulated=True` because the log-file path synthesizes responses prior to replay."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            from sift_client.util.test_results import ReportContext
+            from sift_client.util.test_results.context_manager import NewStep
+
+            def test_types(step, report_context):
+                assert isinstance(report_context, ReportContext)
+                assert isinstance(step, NewStep)
+                assert report_context.client._simulate is False
+                # log-file mode synthesizes responses, so entities are flagged simulated.
+                assert report_context.is_simulated is True
+                assert step.current_step.is_simulated is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-offline")
+        result.assert_outcomes(passed=1)
+
+    def test_offline_writes_jsonl_to_pinned_log_file(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode populates the pinned JSONL file with create/update entries."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_one(step):
+                assert step.measure(
+                    name="v", value=5.0, bounds={"min": 4.8, "max": 5.2}
+                ) is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        assert log_path.exists(), f"offline mode did not create {log_path}"
+        content = log_path.read_text()
+        assert content.strip(), "log file is empty"
+        # Each non-empty line is ``[Operation:uuid] {json}``. A successful
+        # session produces at least the report create + step create lines.
+        lines = [line for line in content.splitlines() if line.strip()]
+        assert any(line.startswith("[CreateTestReport:") for line in lines), content
+        assert any(line.startswith("[CreateTestStep:") for line in lines), content
+
+    def test_offline_skips_client_has_connection(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Offline mode never resolves ``client_has_connection``.
+
+        Override the fixture to raise on resolution. If the override is
+        invoked, the session aborts. If it isn't, the inner test passes
+        cleanly, which confirms the offline path skipped the ping check.
+        """
+        pytester.makeconftest(
+            """
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                raise AssertionError(
+                    "client_has_connection should not resolve in offline mode"
+                )
+            """
+        )
+        pytester.makepyfile("def test_runs(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline")
+        result.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_online.py b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
new file mode 100644
index 000000000..876fffb0e
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
@@ -0,0 +1,133 @@
+"""Tests for online mode (the default).
+
+Online mode requires connectivity to Sift. The plugin pings via
+``client_has_connection`` at session start and aborts with
+``pytest.UsageError`` on failure. Missing ``SIFT_API_KEY`` /
+``SIFT_GRPC_URI`` / ``SIFT_REST_URI`` env vars are reported as a usage error
+so the failure is actionable.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestOnlineMode:
+    def test_ping_failure_aborts(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Online mode with an unreachable ping aborts the session via UsageError."""
+        pytester.makeconftest(
+            """
+            import pytest
+            from unittest.mock import MagicMock
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                client = MagicMock()
+                client.ping.ping.side_effect = ConnectionError("unreachable")
+                return client
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_should_not_run():
+                assert True
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Sift ping failed" in combined, combined
+
+    def test_missing_env_vars_named_in_error(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The default ``sift_client`` fixture names missing env vars in its error."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_should_not_run():
+                pass
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0
+        combined = "\n".join(result.outlines + result.errlines)
+        for var in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            assert var in combined, combined
+
+    def test_online_resolves_client_has_connection_once(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+    ) -> None:
+        """Online mode resolves ``client_has_connection`` exactly once at session start.
+
+        Overrides the fixture to bump a counter persisted to a file the outer
+        test reads after the inner session finishes. Outcomes aren't asserted
+        because the real ``ReportContext`` constructed against a ``MagicMock``
+        client crashes downstream when Pydantic sees mock IDs; what we're
+        verifying is the ping path itself, which runs before construction.
+        """
+        counter_file = tmp_path / "ping_calls.txt"
+        pytester.makeconftest(
+            f"""
+            from pathlib import Path
+            from unittest.mock import MagicMock
+
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+            _COUNTER = Path({str(counter_file)!r})
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                return MagicMock()
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                prior = int(_COUNTER.read_text()) if _COUNTER.exists() else 0
+                _COUNTER.write_text(str(prior + 1))
+                return True
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_a(): pass
+
+            @pytest.mark.sift_include
+            def test_b(): pass
+            """
+        )
+        pytester.runpytest_subprocess()
+        assert counter_file.exists(), "client_has_connection was not resolved"
+        assert counter_file.read_text() == "1", (
+            f"expected session-scoped fixture to resolve once, got {counter_file.read_text()}"
+        )
diff --git a/python/lib/sift_client/_tests/util/conftest.py b/python/lib/sift_client/_tests/util/conftest.py
index 2f371e69e..9e255da8a 100644
--- a/python/lib/sift_client/_tests/util/conftest.py
+++ b/python/lib/sift_client/_tests/util/conftest.py
@@ -7,13 +7,13 @@
 
 def pytest_configure(config: pytest.Config) -> None:
     """Configure the pytest configuration to disable the Sift test results log file."""
-    config.option.sift_test_results_log_file = False
+    config.option.sift_log_file = False
 
 
 def pytest_collection_modifyitems(config: pytest.Config, items: "list[pytest.Item]") -> None:
     """Bulk-apply ``@pytest.mark.sift_include`` to integration tests under util/.
 
-    The project-wide default in ``pyproject.toml`` is ``sift_test_results_autouse
+    The project-wide default in ``pyproject.toml`` is ``sift_autouse
     = false`` so unit tests pay nothing for the globally-loaded Sift plugin.
     Integration tests in this subtree still need the autouse fixtures, so this
     hook flips the gate back on for any test already marked
diff --git a/python/lib/sift_client/_tests/util/test_report_context.py b/python/lib/sift_client/_tests/util/test_report_context.py
new file mode 100644
index 000000000..f12247c7a
--- /dev/null
+++ b/python/lib/sift_client/_tests/util/test_report_context.py
@@ -0,0 +1,95 @@
+"""Tier 1 tests for `ReportContext.__exit__`'s replay-worker handling.
+
+Each test substitutes the `import-test-result-log` argv with a tiny Python
+`-c` invocation that produces a controlled end-state (clean exit / hang /
+non-zero exit), then enters and exits a `ReportContext` against a
+simulate-mode `SiftClient`. This validates that real subprocess outcomes
+route to the right branch of `__exit__` without depending on the real
+replay binary or a Sift backend.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+from typing import TYPE_CHECKING
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.util.test_results import ReportContext
+
+if TYPE_CHECKING:
+    import pytest
+
+
+def _make_simulate_client() -> SiftClient:
+    """Build a SiftClient flagged for in-process simulation.
+
+    Constructor URLs are placeholders; nothing dials them because every
+    test-results write short-circuits through the simulate path.
+    """
+    client = SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key="test",
+            grpc_url="test.invalid:0",
+            rest_url="http://test.invalid",
+        )
+    )
+    client._simulate = True
+    return client
+
+
+def _make_context(command: list[str]) -> ReportContext:
+    """Build a ReportContext whose replay subprocess is the provided command.
+
+    `log_file=True` triggers the temp-file path so `_open_import_proc` fires
+    on `__enter__`. The substitute argv is swapped in via the public-ish
+    `_build_replay_command` hook so the production Popen kwargs stay
+    exercised.
+    """
+    rc = ReportContext(_make_simulate_client(), name="test", log_file=True)
+    rc._build_replay_command = lambda: command  # type: ignore[method-assign]
+    return rc
+
+
+def test_worker_clean_exit_is_silent(caplog: pytest.LogCaptureFixture) -> None:
+    """Worker exits with code 0 → __exit__ is silent (case 1)."""
+    rc = _make_context([sys.executable, "-c", "pass"])
+    with caplog.at_level(logging.ERROR):
+        with rc:
+            pass
+    assert "Import process" not in caplog.text
+    assert "replay-test-result-log" not in caplog.text
+    assert rc._import_proc is not None
+    assert rc._import_proc.returncode == 0
+
+
+def test_worker_timeout_kills_and_logs(caplog: pytest.LogCaptureFixture) -> None:
+    """Worker still running at session end → kill + log, no raise (case 2)."""
+    rc = _make_context([sys.executable, "-c", "import time; time.sleep(30)"])
+    with caplog.at_level(logging.ERROR):
+        with rc:
+            pass
+    assert rc._import_proc is not None
+    # `kill()` + `wait()` were called; process is dead.
+    assert rc._import_proc.poll() is not None
+    assert "did not exit in 1s" in caplog.text
+    assert "replay-test-result-log" in caplog.text
+
+
+def test_worker_nonzero_exit_logs_stderr_no_raise(caplog: pytest.LogCaptureFixture) -> None:
+    """Worker exits non-zero with stderr → log stderr + replay hint, no raise (case 3)."""
+    rc = _make_context(
+        [
+            sys.executable,
+            "-c",
+            "import sys; sys.stderr.write('rpc deadline exceeded'); sys.exit(2)",
+        ]
+    )
+    with caplog.at_level(logging.ERROR):
+        with rc:
+            pass
+    assert rc._import_proc is not None
+    assert rc._import_proc.returncode == 2
+    assert "exited with code 2" in caplog.text
+    assert "rpc deadline exceeded" in caplog.text
+    assert "replay-test-result-log" in caplog.text
diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py
index 95fd25b71..ff574adba 100644
--- a/python/lib/sift_client/client.py
+++ b/python/lib/sift_client/client.py
@@ -152,6 +152,11 @@ def __init__(
         WithGrpcClient.__init__(self, grpc_client=grpc_client)
         WithRestClient.__init__(self, rest_client=rest_client)
 
+        # When set, test-results writes return synthesized responses without
+        # contacting Sift. Read as `self.client._simulate` by the test-results
+        # resource methods. Used by the pytest plugin's ``--sift-disabled`` mode.
+        self._simulate: bool = False
+
         self.ping = PingAPI(self)
         self.assets = AssetsAPI(self)
         self.calculated_channels = CalculatedChannelsAPI(self)
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index f2699a954..494ded3b6 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -15,7 +15,7 @@
 if TYPE_CHECKING:
     from sift_client.util.test_results.context_manager import NewStep
 
-REPORT_CONTEXT: ReportContext | None = None
+REPORT_CONTEXT: Any = None
 
 
 @dataclass(frozen=True)
@@ -37,39 +37,53 @@ class _Option:
 
 
 _LOG_FILE = _Option(
-    cli_flag="--sift-test-results-log-file",
-    ini_name="sift_test_results_log_file",
+    cli_flag="--sift-log-file",
+    ini_name="sift_log_file",
     cli_help="Path to write the Sift test result log file. "
     "Use 'true' (default) to auto-create a temp file, "
     "False, 'false', or 'none' to disable logging, "
     "or a file path to write to a specific location.",
-    ini_help="Default value for --sift-test-results-log-file. Same values "
-    "accepted as the CLI flag (path, 'true', 'false', 'none').",
+    ini_help="Default value for --sift-log-file. Same values accepted as "
+    "the CLI flag (path, 'true', 'false', 'none').",
 )
 
 _GIT_METADATA = _Option(
-    cli_flag="--no-sift-test-results-git-metadata",
-    ini_name="sift_test_results_git_metadata",
+    cli_flag="--no-sift-git-metadata",
+    ini_name="sift_git_metadata",
     action="store_false",
     cli_help="Exclude git metadata from the Sift test results. "
     "Git metadata (repo, branch, commit) is included by default.",
     ini_help="Include git repo/branch/commit in the report (true/false). "
-    "Defaults to true. The --no-sift-test-results-git-metadata CLI flag "
-    "overrides this when passed.",
+    "Defaults to true. The --no-sift-git-metadata CLI flag overrides "
+    "this when passed.",
     ini_type="bool",
     ini_default=True,
 )
 
-_CHECK_CONNECTION = _Option(
-    cli_flag="--sift-test-results-check-connection",
-    ini_name="sift_test_results_check_connection",
+_OFFLINE = _Option(
+    cli_flag="--sift-offline",
+    ini_name="sift_offline",
     action="store_true",
-    cli_help="Skip the sift test-result fixtures (report_context, step, module_substep) "
-    "when the Sift client has no connection to the server. Requires a "
-    "`client_has_connection` fixture to be available in the test session.",
-    ini_help="When true, skip the sift test-result fixtures if the client has "
-    "no connection (same effect as --sift-test-results-check-connection). "
-    "Defaults to false.",
+    cli_help="Run without contacting Sift. All create/update calls are written "
+    "to a JSONL log file for later replay via `import-test-result-log`. "
+    "No session-start ping is attempted.",
+    ini_help="When true, run in offline mode (same effect as --sift-offline). Defaults to false.",
+    ini_type="bool",
+    ini_default=False,
+)
+
+_DISABLED = _Option(
+    cli_flag="--sift-disabled",
+    ini_name="sift_disabled",
+    action="store_true",
+    cli_help="Disable Sift integration entirely. Nothing contacts the API "
+    "and no log file is written. `step.measure(...)` still returns real "
+    "pass/fail booleans. Returned entities expose `is_simulated == True`. "
+    "Also honored via the `SIFT_DISABLED` env var. Supersedes every other "
+    "flag.",
+    ini_help="When true, run in disabled mode (same effect as --sift-disabled). "
+    "Also honored via the SIFT_DISABLED env var. Supersedes every other "
+    "setting. Defaults to false.",
     ini_type="bool",
     ini_default=False,
 )
@@ -89,7 +103,7 @@ class _Option:
 )
 
 _AUTOUSE = _Option(
-    ini_name="sift_test_results_autouse",
+    ini_name="sift_autouse",
     ini_help="Default for the Sift autouse fixtures (report_context, step, "
     "module_substep). When true (default), tests are included unless marked "
     "with @pytest.mark.sift_exclude. When false, tests are skipped unless "
@@ -102,7 +116,8 @@ class _Option:
 _OPTIONS: tuple[_Option, ...] = (
     _LOG_FILE,
     _GIT_METADATA,
-    _CHECK_CONNECTION,
+    _OFFLINE,
+    _DISABLED,
     _GRPC_URI,
     _REST_URI,
     _AUTOUSE,
@@ -139,15 +154,25 @@ def pytest_configure(config: pytest.Config) -> None:
     config.addinivalue_line(
         "markers",
         "sift_include: force the Sift autouse fixtures to activate for this test "
-        "regardless of the `sift_test_results_autouse` ini default.",
+        "regardless of the `sift_autouse` ini default.",
     )
     config.addinivalue_line(
         "markers",
         "sift_exclude: force the Sift autouse fixtures to skip this test "
-        "regardless of the `sift_test_results_autouse` ini default.",
+        "regardless of the `sift_autouse` ini default.",
     )
 
 
+def _is_offline(pytestconfig: pytest.Config | None) -> bool:
+    return bool(_option_or_ini(pytestconfig, _OFFLINE))
+
+
+def _is_disabled(pytestconfig: pytest.Config | None) -> bool:
+    if bool(_option_or_ini(pytestconfig, _DISABLED)):
+        return True
+    return os.getenv("SIFT_DISABLED", "").lower() in ("1", "true", "yes")
+
+
 def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
     """Resolve the Sift gate for a node: sift_exclude > sift_include > default.
 
@@ -203,13 +228,23 @@ def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool |
     * ``None`` — unset; nothing was passed on the CLI and the ini key is
       absent. Treat as the default "use a temp file."
     * Python ``False`` — an explicit disable, typically set in a conftest via
-      ``config.option.sift_test_results_log_file = False``. Return ``None`` so
+      ``config.option.sift_log_file = False``. Return ``None`` so
       the rest of the pipeline knows to skip logging entirely.
     * A string (from CLI or ini) — interpret ``"true"`` / ``"1"`` as the temp
       file default, ``"false"`` / ``"none"`` as disable, anything else as a
       file path.
+
+    Raises ``pytest.UsageError`` when logging is disabled (``False``, ``"false"``, or
+    ``"none"``) while offline mode is on, since the log file is its sole sink.
     """
     raw = _option_or_ini(pytestconfig, _LOG_FILE)
+    disabled = raw is False or (isinstance(raw, str) and raw.lower() in ("false", "none"))
+    if disabled and _is_offline(pytestconfig):
+        raise pytest.UsageError(
+            "--sift-log-file=none is incompatible with --sift-offline; offline "
+            "mode requires a log file. Pin one with --sift-log-file=<path>, or "
+            "drop --sift-log-file=none to use a temp file."
+        )
     if raw is False:
         return None
     if not raw:
@@ -239,7 +274,7 @@ def _report_context_impl(
     sift_client: SiftClient,
     request: pytest.FixtureRequest,
     pytestconfig: pytest.Config | None = None,
-) -> Generator[ReportContext | None, None, None]:
+) -> Generator[ReportContext, None, None]:
     args = request.config.invocation_params.args
     test_path = Path(args[0]) if args else None
     if test_path is not None and test_path.exists():
@@ -248,7 +283,13 @@ def _report_context_impl(
     else:
         base_name = "pytest " + " ".join(args) if args else "pytest"
         test_case = base_name
-    log_file = _resolve_log_file(pytestconfig)
+    # Mode → ReportContext flags:
+    #   online (default): log_file=<temp or user path>, replay_log_file=True
+    #   --sift-offline:   log_file=<temp or user path>, replay_log_file=False
+    #   --sift-disabled:  log_file=False,               replay_log_file=False
+    disabled = sift_client._simulate
+    offline = False if disabled else _is_offline(pytestconfig)
+    log_file: str | Path | bool | None = False if disabled else _resolve_log_file(pytestconfig)
     git_metadata = _option_or_ini(pytestconfig, _GIT_METADATA)
     include_git_metadata = True if git_metadata is None else bool(git_metadata)
     with ReportContext(
@@ -257,28 +298,46 @@ def _report_context_impl(
         test_case=str(test_case),
         log_file=log_file,
         include_git_metadata=include_git_metadata,
+        replay_log_file=not (disabled or offline),
     ) as context:
         global REPORT_CONTEXT
         REPORT_CONTEXT = context
         yield context
 
 
-def _check_connection_enabled(pytestconfig: pytest.Config | None) -> bool:
-    """Return True when the caller opted into the check-connection mode via CLI or ini."""
-    return bool(_option_or_ini(pytestconfig, _CHECK_CONNECTION))
-
-
-def _has_sift_connection(request: pytest.FixtureRequest) -> bool:
-    """Resolve the `client_has_connection` fixture lazily; only called when the check is enabled."""
-    return bool(request.getfixturevalue("client_has_connection"))
-
-
 _CREDENTIAL_KEYS: tuple[tuple[str, _Option | None], ...] = (
     ("SIFT_API_KEY", None),  # env-only; never read from ini to keep secrets out of source control.
     ("SIFT_GRPC_URI", _GRPC_URI),
     ("SIFT_REST_URI", _REST_URI),
 )
 
+# Placeholder credentials used in --sift-offline mode when env/ini values
+# are missing. Offline mode never makes network calls, so the values are
+# only syntactically required by SiftConnectionConfig.
+_OFFLINE_DEFAULTS = {
+    "SIFT_API_KEY": "offline",
+    "SIFT_GRPC_URI": "offline.invalid:0",
+    "SIFT_REST_URI": "http://offline.invalid",
+}
+
+
+def _build_disabled_client() -> SiftClient:
+    """Construct a SiftClient for ``--sift-disabled`` mode.
+
+    Tagged with ``_simulate=True`` so test-results writes short-circuit through
+    the existing low-level simulate path without contacting Sift. The URLs are
+    syntactically valid but unreachable; nothing dials them.
+    """
+    client = SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key="disabled",
+            grpc_url="disabled.invalid:0",
+            rest_url="http://disabled.invalid",
+        )
+    )
+    client._simulate = True
+    return client
+
 
 def _resolve_credential(
     pytestconfig: pytest.Config | None, env_name: str, opt: _Option | None
@@ -308,10 +367,19 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
     etc.) can override this fixture by defining their own ``sift_client``
     in their ``conftest.py``; pytest fixture resolution prefers the local
     definition.
+
+    In ``--sift-offline`` mode the missing-credential check is relaxed:
+    real env vars and ini values still win when set (so the client is
+    constructible against a real backend even though no calls are made), but
+    anything still missing is filled with a placeholder. In ``--sift-disabled``
+    mode the credential resolution is skipped entirely and placeholders are
+    always used.
     """
+    if _is_disabled(pytestconfig):
+        return _build_disabled_client()
     resolved = {env: _resolve_credential(pytestconfig, env, opt) for env, opt in _CREDENTIAL_KEYS}
     missing = [env for env, value in resolved.items() if not value]
-    if missing:
+    if missing and not _is_offline(pytestconfig):
         raise pytest.UsageError(
             "Sift credentials missing: "
             + ", ".join(missing)
@@ -319,8 +387,11 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
             "from a `.env` file automatically — or set the URIs via "
             "`sift_grpc_uri` / `sift_rest_uri` under `[tool.pytest.ini_options]` "
             "in pyproject.toml, or override the sift_client fixture in your "
-            "conftest.py."
+            "conftest.py, or pass --sift-offline / --sift-disabled to run "
+            "without contacting Sift."
         )
+    for env in missing:
+        resolved[env] = _OFFLINE_DEFAULTS[env]
     # `or ""` is unreachable in practice since the `missing` check above guarantees
     # non-None values
     return SiftClient(
@@ -334,32 +405,61 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
 
 @pytest.fixture(scope="session")
 def report_context(
-    sift_client: SiftClient, request: pytest.FixtureRequest, pytestconfig: pytest.Config
-) -> Generator[ReportContext | None, None, None]:
+    request: pytest.FixtureRequest, pytestconfig: pytest.Config
+) -> Generator[ReportContext, None, None]:
     """Lazy session-scoped Sift ReportContext.
 
-    The fixture is no longer autouse; it's instantiated on the first call to
-    ``request.getfixturevalue("report_context")``, which today happens inside
-    the gated ``step`` and ``module_substep`` fixtures. If every test in the
-    session is excluded via the marker gate, this fixture is never resolved
-    and no ReportContext (and no teardown subprocess) is created.
-
-    The log file destination is controlled by ``--sift-test-results-log-file``.
-    Defaults to a temp file when not set.
-
-    When ``--sift-test-results-check-connection`` is passed, this fixture will
-    yield ``None`` if the Sift client has no connection to the server. That mode
-    requires a ``client_has_connection`` fixture to be available in the session.
+    The fixture is no longer autouse; it's instantiated on the first call
+    to ``request.getfixturevalue("report_context")``, which today happens
+    inside the gated ``step`` and ``module_substep`` fixtures. If every
+    test in the session is excluded via the marker gate, this fixture is
+    never resolved and no ReportContext (or teardown subprocess) is created.
+
+    What gets yielded depends on the mode:
+
+    * ``--sift-disabled``: a real ``ReportContext`` against a placeholder
+      ``SiftClient`` with ``_simulate=True``. Every test-results write
+      returns a synthesized response without contacting Sift; no log file
+      is written; the replay subprocess never spawns. Test code that calls
+      ``step.measure(...)`` keeps working because bounds are evaluated as
+      usual and routed through the simulate path.
+    * ``--sift-offline``: a real ReportContext, but the session-start ping
+      is skipped, all create/update calls go to the JSONL log file, and
+      the import-test-result-log replay subprocess is not spawned at
+      session end.
+    * default (online): verify connectivity via ``client_has_connection``
+      before constructing the context. A failed ping aborts the session
+      with ``pytest.UsageError`` and points at ``--sift-offline`` and
+      ``--sift-disabled`` as escape hatches.
+
+    The log-file destination is controlled by
+    ``--sift-log-file``; defaults to a temp file when unset.
     """
-    if _check_connection_enabled(pytestconfig) and not _has_sift_connection(request):
-        yield None
+    if _is_disabled(pytestconfig):
+        yield from _report_context_impl(
+            _build_disabled_client(), request, pytestconfig=pytestconfig
+        )
         return
+    sift_client = request.getfixturevalue("sift_client")
+    if not _is_offline(pytestconfig):
+        try:
+            request.getfixturevalue("client_has_connection")
+        except pytest.UsageError:
+            raise
+        except Exception as exc:
+            grpc_config = getattr(getattr(sift_client, "grpc_client", None), "_config", None)
+            grpc_url = getattr(grpc_config, "uri", "<unknown>")
+            raise pytest.UsageError(
+                f"Sift ping failed against {grpc_url}: {exc}. "
+                "Pass --sift-offline to run without contacting Sift, or "
+                "--sift-disabled to skip Sift entirely."
+            ) from exc
     yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
 
 
 def _step_impl(
     report_context: ReportContext, request: pytest.FixtureRequest
-) -> Generator[NewStep | None, None, None]:
+) -> Generator[NewStep, None, None]:
     name = str(request.node.name)
     existing_docstring = request.node.obj.__doc__ or None
     with report_context.new_step(
@@ -383,18 +483,18 @@ def step(
 
     Resolves the gate via `_sift_enabled_for(request.node, ini_default)`:
     `sift_exclude` marker forces off, `sift_include` forces on, otherwise the
-    `sift_test_results_autouse` ini default applies. When on, requests the
+    `sift_autouse` ini default applies. When on, requests the
     session `report_context` lazily — the first gated test in the session
-    triggers its creation, subsequent gated tests reuse it.
+    triggers its creation; subsequent gated tests reuse it. In
+    ``--sift-disabled`` mode the report context is backed by a placeholder
+    ``SiftClient`` with ``_simulate=True`` set, so every write returns a
+    synthesized response without contacting Sift.
     """
     default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
     if not _sift_enabled_for(request.node, default):
         yield None
         return
     rc = request.getfixturevalue("report_context")
-    if rc is None:
-        yield None
-        return
     yield from _step_impl(rc, request)
 
 
@@ -416,21 +516,22 @@ def module_substep(
         yield None
         return
     rc = request.getfixturevalue("report_context")
-    if rc is None:
-        yield None
-        return
     yield from _step_impl(rc, request)
 
 
 @pytest.fixture(scope="session")
-def client_has_connection(sift_client):
-    """Check if the SiftClient has a connection to the Sift server.
-
-    Can be used to skip tests that require a connection to the Sift server, and is
-    consulted by the Sift fixtures when ``--sift-test-results-check-connection`` is set.
+def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRequest) -> bool:
+    """Verify the ``SiftClient`` can reach Sift via ``/ping``.
+
+    Consulted at session start by ``report_context`` in online mode, which only
+    reacts to exceptions: a failed ping raises and aborts the session with
+    ``pytest.UsageError``; the return value is not inspected. Override this
+    fixture in your conftest to use a different reachability signal (e.g. a
+    cached auth token), raising when Sift is unreachable. Returns ``False`` in
+    ``--sift-disabled`` mode without constructing a client.
     """
-    try:
-        sift_client.ping.ping()
-        return True
-    except Exception:
+    if _is_disabled(pytestconfig):
         return False
+    sift_client = request.getfixturevalue("sift_client")
+    sift_client.ping.ping()
+    return True
diff --git a/python/lib/sift_client/resources/test_results.py b/python/lib/sift_client/resources/test_results.py
index 22e984b5e..9e88b6081 100644
--- a/python/lib/sift_client/resources/test_results.py
+++ b/python/lib/sift_client/resources/test_results.py
@@ -96,6 +96,7 @@ async def create(
         created_report = await self._low_level_client.create_test_report(
             test_report=test_report,
             log_file=log_file,
+            simulate=self.client._simulate,
         )
         return self._finalize(created_report, log_file)
 
@@ -271,7 +272,7 @@ async def update(
         update.resource_id = test_report_id
         existing = test_report if isinstance(test_report, TestReport) else None
         updated_test_report = await self._low_level_client.update_test_report(
-            update, log_file=log_file, existing=existing
+            update, log_file=log_file, existing=existing, simulate=self.client._simulate
         )
         return self._finalize(updated_test_report, log_file)
 
@@ -319,7 +320,7 @@ async def create_step(
         if isinstance(test_step, dict):
             test_step = TestStepCreate.model_validate(test_step)
         test_step_result = await self._low_level_client.create_test_step(
-            test_step, log_file=log_file
+            test_step, log_file=log_file, simulate=self.client._simulate
         )
         return self._finalize(test_step_result, log_file)
 
@@ -450,7 +451,7 @@ async def update_step(
         update.resource_id = test_step_id
         existing = test_step if isinstance(test_step, TestStep) else None
         updated_test_step = await self._low_level_client.update_test_step(
-            update, log_file=log_file, existing=existing
+            update, log_file=log_file, existing=existing, simulate=self.client._simulate
         )
         return self._finalize(updated_test_step, log_file)
 
@@ -484,10 +485,10 @@ async def create_measurement(
         if isinstance(test_measurement, dict):
             test_measurement = TestMeasurementCreate.model_validate(test_measurement)
         test_measurement_result = await self._low_level_client.create_test_measurement(
-            test_measurement, log_file=log_file
+            test_measurement, log_file=log_file, simulate=self.client._simulate
         )
         measurement = self._finalize(test_measurement_result, log_file)
-        if update_step and log_file is None:
+        if update_step and log_file is None and not self.client._simulate:
             step = await self.get_step(test_step=test_measurement_result.test_step_id)
             if step.status == TestStatus.PASSED and not measurement.passed:
                 await self.update_step(test_step=step, update={"status": TestStatus.FAILED})
@@ -508,7 +509,7 @@ async def create_measurements(
             A tuple of (measurements_created_count, measurement_ids).
         """
         return await self._low_level_client.create_test_measurements(
-            test_measurements, log_file=log_file
+            test_measurements, log_file=log_file, simulate=self.client._simulate
         )
 
     async def list_measurements(
@@ -621,10 +622,16 @@ async def update_measurement(
 
         update.resource_id = test_measurement.id_
         updated_test_measurement = await self._low_level_client.update_test_measurement(
-            update, log_file=log_file, existing=test_measurement
+            update, log_file=log_file, existing=test_measurement, simulate=self.client._simulate
         )
         updated_test_measurement = self._finalize(updated_test_measurement, log_file)
-        if update_step and log_file is None and update.passed is not None and not update.passed:
+        if (
+            update_step
+            and log_file is None
+            and not self.client._simulate
+            and update.passed is not None
+            and not update.passed
+        ):
             step = await self.get_step(test_step=updated_test_measurement.test_step_id)
             if step.status == TestStatus.PASSED:
                 await self.update_step(test_step=step, update={"status": TestStatus.FAILED})
diff --git a/python/lib/sift_client/sift_types/_mixins/simulated.py b/python/lib/sift_client/sift_types/_mixins/simulated.py
new file mode 100644
index 000000000..bdc2c572a
--- /dev/null
+++ b/python/lib/sift_client/sift_types/_mixins/simulated.py
@@ -0,0 +1,32 @@
+"""Mixin that exposes ``is_simulated`` on test-results entity types."""
+
+from __future__ import annotations
+
+
+class SimulatedMixin:
+    """Mixin for sift_types whose response can be produced by the simulate path.
+
+    The low-level wrapper stamps ``_simulated=True`` on entities it returns from
+    a simulated branch (see ``TestResultsLowLevelClient._mark_simulated``). This
+    mixin exposes that flag as a read-only ``is_simulated`` property so
+    consumers and tests can detect when an instance was synthesized rather than
+    round-tripped through Sift.
+
+    Inheriting classes are expected to declare a private field
+    ``_simulated: bool = False`` so pydantic tracks the default correctly.
+    """
+
+    _simulated: bool
+
+    @property
+    def is_simulated(self) -> bool:
+        """True when this instance was returned from the simulate path.
+
+        Set by the low-level wrapper when the call short-circuited to a
+        synthesized response (either ``SiftClient._simulate`` mode or per-call
+        ``log_file`` / ``simulate=True``). False for entities returned from a
+        normal online call or constructed manually outside the SDK. Offline
+        mode also reports True since responses are synthesized prior to
+        replay.
+        """
+        return self._simulated
diff --git a/python/lib/sift_client/sift_types/test_report.py b/python/lib/sift_client/sift_types/test_report.py
index ecc24f52f..c4abfc548 100644
--- a/python/lib/sift_client/sift_types/test_report.py
+++ b/python/lib/sift_client/sift_types/test_report.py
@@ -36,6 +36,7 @@
     ModelUpdate,
 )
 from sift_client.sift_types._mixins.file_attachments import FileAttachmentsMixin
+from sift_client.sift_types._mixins.simulated import SimulatedMixin
 from sift_client.sift_types.channel import Channel
 from sift_client.util.metadata import metadata_dict_to_proto, metadata_proto_to_dict
 
@@ -153,7 +154,7 @@ def to_proto(self) -> TestStepProto:
         return proto
 
 
-class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin):
+class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin, SimulatedMixin):
     """TestStep model representing a step in a test."""
 
     test_report_id: str
@@ -169,6 +170,8 @@ class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin):
     metadata: dict[str, str | float | bool] | None = None
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(cls, proto: TestStepProto, sift_client: SiftClient | None = None) -> TestStep:
@@ -383,7 +386,7 @@ def to_proto(self) -> TestMeasurementProto:
         return proto
 
 
-class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"]):
+class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"], SimulatedMixin):
     """TestMeasurement model representing a measurement in a test."""
 
     measurement_type: TestMeasurementType
@@ -404,6 +407,8 @@ class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"]):
 
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(
@@ -599,7 +604,7 @@ def _to_proto(self) -> ErrorInfoProto:
         )
 
 
-class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
+class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin, SimulatedMixin):
     """TestReport model representing a test report."""
 
     status: TestStatus
@@ -617,6 +622,8 @@ class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
     is_archived: bool
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(
diff --git a/python/lib/sift_client/util/test_results/__init__.py b/python/lib/sift_client/util/test_results/__init__.py
index ea213056e..ddce0326c 100644
--- a/python/lib/sift_client/util/test_results/__init__.py
+++ b/python/lib/sift_client/util/test_results/__init__.py
@@ -68,13 +68,13 @@ def main(self):
 
 Note: FedRAMP users: results are buffered to a temp file and uploaded by a
 subprocess at session end (no API calls during the run). Disable the buffer
-entirely with `--sift-test-results-log-file=false` for inline uploads.
+entirely with `--sift-log-file=false` for inline uploads.
 
 ### Controlling which tests produce reports
 
 The autouse fixtures fire for every test by default. To narrow that:
 
-- Set `sift_test_results_autouse = false` in `pyproject.toml` to flip the
+- Set `sift_autouse = false` in `pyproject.toml` to flip the
   project default off, then opt tests back in below.
 - `@pytest.mark.sift_include` forces reporting on for a test, class, or
   module. `@pytest.mark.sift_exclude` forces it off. Closest marker wins.
@@ -105,19 +105,24 @@ def pytest_collection_modifyitems(config, items):
 
 CLI options registered by the plugin:
 
-- `--sift-test-results-log-file`: Path to write the JSONL log file. `true`
+- `--sift-offline`: Run without contacting Sift. All create/update calls are
+  written to the JSONL log file for later replay via `import-test-result-log`.
+  No session-start ping is attempted.
+- `--sift-disabled`: Skip Sift entirely. Nothing contacts the API and no
+  log file is written. `step.measure(...)` still evaluates bounds and
+  returns a real pass/fail boolean. Returned entities expose
+  ``is_simulated == True``. Also honored via the `SIFT_DISABLED` env
+  var. Supersedes every other flag.
+- `--sift-log-file`: Path to write the JSONL log file. `true`
   (default) auto-creates a temp file. `false` or `none` disables logging.
   Any other value is treated as a file path.
-- `--no-sift-test-results-git-metadata`: Exclude git metadata (repo, branch,
+- `--no-sift-git-metadata`: Exclude git metadata (repo, branch,
   commit) from the test report. Included by default.
-- `--sift-test-results-check-connection`: Make `report_context`, `step`, and
-  `module_substep` no-op when the client has no connection. Requires a
-  `client_has_connection` fixture (the plugin ships a default).
 
 Each option has a matching ini key for per-project configuration under
 ``[tool.pytest.ini_options]`` in ``pyproject.toml`` (or ``[pytest]`` in
 ``pytest.ini``). CLI flags override ini values. The
-``sift_test_results_autouse`` ini key (bool, default ``true``) sets the
+``sift_autouse`` ini key (bool, default ``true``) sets the
 project-wide default for the gate described above. The default
 ``sift_client`` fixture reads ``sift_grpc_uri`` and ``sift_rest_uri`` as
 fallbacks when the corresponding env vars are unset (env vars win when
@@ -126,10 +131,9 @@ def pytest_collection_modifyitems(config, items):
 
 ```toml
 [tool.pytest.ini_options]
-sift_test_results_autouse = false
-sift_test_results_log_file = "false"
-sift_test_results_check_connection = true
-sift_test_results_git_metadata = false
+sift_autouse = false
+sift_offline = true
+sift_git_metadata = false
 sift_grpc_uri = "your-org.sift.example:443"
 sift_rest_uri = "https://your-org.sift.example"
 ```
diff --git a/python/lib/sift_client/util/test_results/bounds.py b/python/lib/sift_client/util/test_results/bounds.py
index ef5c67ce5..b734cc126 100644
--- a/python/lib/sift_client/util/test_results/bounds.py
+++ b/python/lib/sift_client/util/test_results/bounds.py
@@ -1,5 +1,10 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
 from sift_client.sift_types.test_report import (
     NumericBounds,
     TestMeasurement,
@@ -8,6 +13,55 @@
     TestMeasurementUpdate,
 )
 
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+def to_numpy_array(
+    values: list[float | int] | NDArray[np.float64] | pd.Series,
+) -> NDArray[np.float64]:
+    """Normalize a list / ndarray / pandas Series into a numpy array.
+
+    Shared by ``measure_avg`` and ``measure_all`` on ``NewStep`` so the
+    accepted input types stay in sync across measurement variants.
+    """
+    if isinstance(values, list):
+        return np.array(values)
+    if isinstance(values, np.ndarray):
+        return values
+    if isinstance(values, pd.Series):
+        return values.to_numpy()
+    raise ValueError(f"Invalid value type: {type(values)}")
+
+
+def out_of_bounds_mask(
+    arr: NDArray[np.float64],
+    bounds: dict[str, float] | NumericBounds,
+) -> NDArray[np.bool_]:
+    """Return a boolean mask selecting elements of ``arr`` that violate ``bounds``.
+
+    Raises ``ValueError`` when ``bounds`` has neither ``min`` nor ``max`` set.
+    """
+    if isinstance(bounds, dict):
+        bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
+    mask: NDArray[np.bool_] | None = None
+    if bounds.min is not None:
+        mask = arr < bounds.min
+    if bounds.max is not None:
+        above = arr > bounds.max
+        mask = mask | above if mask is not None else above
+    if mask is None:
+        raise ValueError("No bounds provided")
+    return mask
+
+
+def all_within_bounds(
+    arr: NDArray[np.float64],
+    bounds: dict[str, float] | NumericBounds,
+) -> bool:
+    """Return True when every element of ``arr`` is within ``bounds``."""
+    return bool(arr[out_of_bounds_mask(arr, bounds)].size == 0)
+
 
 def assign_value_to_measurement(
     measurement: TestMeasurement | TestMeasurementCreate | TestMeasurementUpdate,
@@ -32,6 +86,38 @@ def assign_value_to_measurement(
         raise ValueError(f"Invalid value type: {type(value)}")
 
 
+def value_passes_bounds(
+    value: float | str | bool,
+    bounds: dict[str, float] | NumericBounds | str | bool | None,
+) -> bool:
+    """Evaluate a value against bounds without recording a measurement."""
+    if bounds is None:
+        return True
+    if isinstance(bounds, dict):
+        bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
+    if isinstance(bounds, bool):
+        if isinstance(value, str):
+            return str(value).lower() == str(bounds).lower()
+        return bool(value) == bounds
+    if isinstance(bounds, str):
+        if not (isinstance(value, str) or isinstance(value, bool)):
+            raise ValueError("Value must be a string if bounds provided is a string")
+        if isinstance(value, bool):
+            return str(value).lower() == str(bounds).lower()
+        return value == bounds
+    # NumericBounds
+    try:
+        if bounds.min is not None and bounds.min > value:  # type: ignore[operator]
+            return False
+        if bounds.max is not None and bounds.max < value:  # type: ignore[operator]
+            return False
+    except TypeError:
+        raise TypeError(
+            f"Value must be a float or int to evaluate numeric bounds but gave {type(value)}"
+        ) from None
+    return True
+
+
 def evaluate_measurement_bounds(
     measurement: TestMeasurement | TestMeasurementCreate | TestMeasurementUpdate,
     value: float | str | bool,
@@ -53,31 +139,10 @@ def evaluate_measurement_bounds(
 
     if isinstance(bounds, dict):
         bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
-    if isinstance(bounds, bool):
-        if isinstance(value, str):
-            measurement.passed = str(value).lower() == str(bounds).lower()
-        else:
-            measurement.passed = bool(value) == bounds
-        return bool(measurement.passed)
-    elif isinstance(bounds, str):
-        if not (isinstance(value, str) or isinstance(value, bool)):
-            raise ValueError("Value must be a string if bounds provided is a string")
+    if isinstance(bounds, str) and not isinstance(bounds, bool):
         measurement.string_expected_value = bounds
-        if isinstance(value, bool):
-            measurement.passed = str(value).lower() == str(bounds).lower()
-        else:
-            measurement.passed = value == bounds
     elif isinstance(bounds, NumericBounds):
         measurement.numeric_bounds = bounds
-        measurement.passed = True
-        try:
-            if measurement.numeric_bounds.min is not None:
-                measurement.passed = measurement.passed and measurement.numeric_bounds.min <= value  # type: ignore
-            if measurement.numeric_bounds.max is not None:
-                measurement.passed = measurement.passed and measurement.numeric_bounds.max >= value  # type: ignore
-        except TypeError:
-            raise TypeError(
-                f"Value must be a float or int to evaluate numeric bounds but gave {type(value)}"
-            ) from None
 
+    measurement.passed = value_passes_bounds(value, bounds)
     return bool(measurement.passed)
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 354f8564d..3d375814a 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -13,7 +13,6 @@
 from typing import TYPE_CHECKING
 
 import numpy as np
-import pandas as pd
 
 from sift_client.sift_types.test_report import (
     ErrorInfo,
@@ -28,9 +27,12 @@
 )
 from sift_client.util.test_results.bounds import (
     evaluate_measurement_bounds,
+    out_of_bounds_mask,
+    to_numpy_array,
 )
 
 if TYPE_CHECKING:
+    import pandas as pd
     from numpy.typing import NDArray
 
     from sift_client.client import SiftClient
@@ -118,6 +120,7 @@ def __init__(
         test_case: str | None = None,
         log_file: str | Path | bool | None = None,
         include_git_metadata: bool = False,
+        replay_log_file: bool = True,
     ):
         """Initialize a new report context.
 
@@ -128,10 +131,18 @@ def __init__(
             system_operator: The operator of the test system. Will default to the current user if not provided.
             test_case: The name of the test case. Will default to the basename of the file containing the test if not provided.
             log_file: If True, create a temp log file. If a path, use that path.
-                All create/update operations will be logged to this file.
+                If False/None, no log file is written and create/update calls
+                go directly to the API.
             include_git_metadata: If True, include git metadata in the report.
+            replay_log_file: When True (the default) and ``log_file`` is set,
+                spawn ``import-test-result-log --incremental`` to push log
+                entries to Sift in the background during the session. When
+                False, the log file is just a record and no worker is spawned.
+                Replay happens later via ``replay-test-result-log <path>``.
+                Has no effect when ``log_file`` is None.
         """
         self.client = client
+        self.replay_log_file = replay_log_file
         self.step_is_open = False
         self.step_stack = []
         self.step_number_at_depth = {}
@@ -163,28 +174,41 @@ def __init__(
         )
         self.report = client.test_results.create(create, log_file=self.log_file)
 
+    def _build_replay_command(self) -> list[str]:
+        """Build the argv for the import-test-result-log replay subprocess.
+
+        Factored out for testability — tests substitute commands that exit
+        with controlled returncodes / stderr to exercise the ``__exit__``
+        branches without depending on the real replay binary.
+        """
+        return [
+            "import-test-result-log",
+            "--incremental",
+            str(self.log_file),
+            "--grpc-url",
+            self.client.grpc_client._config.uri,
+            "--rest-url",
+            self.client.rest_client._config.base_url,
+            "--api-key",
+            self.client.grpc_client._config.api_key,
+        ]
+
     def _open_import_proc(self):
-        """Open a subprocess to import the log file."""
+        """Open a subprocess to import the log file.
+
+        ``stderr`` is captured so a worker crash mid-session can surface its
+        error at session end via ``__exit__`` rather than failing silently.
+        """
         with _quiet_fork_stderr():
             self._import_proc = subprocess.Popen(
-                [
-                    "import-test-result-log",
-                    "--incremental",
-                    str(self.log_file),
-                    "--grpc-url",
-                    self.client.grpc_client._config.uri,
-                    "--rest-url",
-                    self.client.rest_client._config.base_url,
-                    "--api-key",
-                    self.client.grpc_client._config.api_key,
-                ],
+                self._build_replay_command(),
                 stdin=subprocess.PIPE,
                 stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
             )
 
     def __enter__(self):
-        if self.log_file:
+        if self.log_file and self.replay_log_file:
             self._open_import_proc()
         return self
 
@@ -199,17 +223,49 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.report.update(update)
 
         if self._import_proc is not None:
+            # Three outcomes for the replay worker at session end. None of
+            # them fail the session — tests already ran and their outcome
+            # is independent of delivery. The local log file is the source
+            # of recovery for both failure modes via
+            # `replay-test-result-log <path>`:
+            #   1. Exits cleanly (returncode 0). Silent.
+            #   2. Still running after the 1s grace window (TimeoutExpired).
+            #      Healthy worker with a large backlog; kill and surface
+            #      replay instructions.
+            #   3. Exited with non-zero. Connection failures and API call
+            #      errors land here — the worker's replay loop has no retry,
+            #      so the first failed RPC crashes the subprocess. Log the
+            #      captured stderr at ERROR with replay instructions.
             try:
-                self._import_proc.communicate(timeout=1)
+                _, stderr_bytes = self._import_proc.communicate(timeout=1)
             except subprocess.TimeoutExpired:
-                logger.error("Import process did not exit in 10s, killing it")
+                logger.error("Import process did not exit in 1s, killing it")
                 self._import_proc.kill()
                 self._import_proc.wait()
                 log_replay_instructions(self.log_file)
-                raise
+                return True  # A slow replay worker must not fail the session
+            if self._import_proc.returncode != 0:
+                stderr_text = (
+                    stderr_bytes.decode("utf-8", errors="replace").strip() if stderr_bytes else ""
+                )
+                logger.error(
+                    "Import process exited with code %d. stderr: %s",
+                    self._import_proc.returncode,
+                    stderr_text or "<empty>",
+                )
+                log_replay_instructions(self.log_file)
 
         return True
 
+    @property
+    def is_simulated(self) -> bool:
+        """True when this context's report came from the simulate path.
+
+        Delegates to ``self.report.is_simulated``; see ``TestReport.is_simulated``
+        for the full semantics.
+        """
+        return self.report.is_simulated
+
     def new_step(
         self,
         name: str,
@@ -505,15 +561,7 @@ def measure_avg(
         returns: The true if the average of the values is within the bounds, false otherwise.
         """
         timestamp = timestamp if timestamp else datetime.now(timezone.utc)
-        np_array = None
-        if isinstance(values, list):
-            np_array = np.array(values)
-        elif isinstance(values, np.ndarray):
-            np_array = values
-        elif isinstance(values, pd.Series):
-            np_array = values.to_numpy()
-        else:
-            raise ValueError(f"Invalid value type: {type(values)}")
+        np_array = to_numpy_array(values)
         avg = float(np.mean(np_array))
         result = self.measure(
             name=name,
@@ -561,31 +609,8 @@ def measure_all(
         returns: The true if all values are within the bounds, false otherwise.
         """
         timestamp = timestamp if timestamp else datetime.now(timezone.utc)
-        np_array = None
-        if isinstance(values, list):
-            np_array = np.array(values)
-        elif isinstance(values, np.ndarray):
-            np_array = values
-        elif isinstance(values, pd.Series):
-            np_array = values.to_numpy()
-        else:
-            raise ValueError(f"Invalid value type: {type(values)}")
-
-        numeric_bounds = bounds
-        if isinstance(numeric_bounds, dict):
-            numeric_bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))  # type: ignore
-
-        # Construct a mask of the values that are outside the bounds.
-        mask = None
-        if numeric_bounds.min is not None:
-            mask = np_array < numeric_bounds.min
-        if numeric_bounds.max is not None:
-            val_above_max = np_array > numeric_bounds.max
-            mask = mask | val_above_max if mask is not None else val_above_max
-        if mask is None:
-            raise ValueError("No bounds provided")
-
-        rows_outside_bounds = np_array[mask]
+        np_array = to_numpy_array(values)
+        rows_outside_bounds = np_array[out_of_bounds_mask(np_array, bounds)]
         for row in rows_outside_bounds:
             self.measure(
                 name=name,
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 79afdf464..a2cd6a410 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -412,12 +412,18 @@ env_files = [
 # `pytester` is registered globally because pytest 8+ disallows `pytest_plugins`
 # in non-top-level conftests. Only the plugin test suite uses it; activating it
 # globally is harmless since the fixture is opt-in.
-addopts = "-p pytester"
-# The Sift plugin is loaded for the whole project via `python/conftest.py`.
-# The autouse gate defaults to off here so unit tests don't use it. The
-# integration subtree (lib/sift_client/_tests/util/) opts back in via
-# `pytest.mark.sift_include` applied in its conftest.
-sift_test_results_autouse = false
+# The Sift pytest plugin is loaded so the project's own integration tests can
+# use its fixtures. Unit-test runs are flipped to `--sift-disabled` mode by
+# `lib/sift_client/_tests/conftest.py`.
+# `--import-mode=importlib` loads test files by path with unique synthetic
+# module names. The default `prepend` mode would try to import
+# `lib/sift_client/_tests/conftest.py` as `sift_client._tests.conftest`, which
+# fails because `_tests` is excluded from the wheel (see packages.find above).
+addopts = "-p pytester -p sift_client.pytest_plugin --import-mode=importlib"
+# The autouse gate defaults to off so unit tests don't use the Sift
+# fixtures. The integration subtree (lib/sift_client/_tests/util/) opts
+# back in via `pytest.mark.sift_include` applied in its conftest.
+sift_autouse = false
 testpaths = [
     "lib/sift_py",
     "lib/sift_client/_tests",
diff --git a/python/scripts/dev b/python/scripts/dev
index 510d66d95..ce572dba4 100755
--- a/python/scripts/dev
+++ b/python/scripts/dev
@@ -220,4 +220,5 @@ case "$1" in
         ;;
 esac
 
-exit 0
+# Leave the script's exit code as the subcommand's. A trailing `exit 0` here
+# silently masked ruff / mypy / pytest failures from the pre-push hook.

From 654a59c237b73786a420f9e8d081bdbe9dfb2a49 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 22 May 2026 17:42:06 -0700
Subject: [PATCH 04/19] Python(feat): hierarchical pytest report tree
 (packages, modules, classes, parametrize) (#570)

---
 python/docs/examples/index.md                 |   1 +
 python/docs/examples/pytest_plugin.md         | 120 ++-
 .../docs/examples/pytest_plugin_quickstart.md | 177 ++++
 python/examples/pytest_plugin/.env.example    |   3 +
 python/examples/pytest_plugin/README.md       | 119 +++
 python/examples/pytest_plugin/conftest.py     |  15 +
 python/examples/pytest_plugin/pytest.ini      |  11 +
 .../tests/pytest_only/__init__.py             |   7 +
 .../pytest_only/test_pytest_only_demo.py      |  49 +
 .../pytest_plugin/tests/with_sift/__init__.py |   8 +
 .../tests/with_sift/test_with_sift_demo.py    | 159 ++++
 .../_tests/pytest_plugin/_fakes.py            | 132 +++
 .../_tests/pytest_plugin/test_disabled.py     |   5 +-
 .../_tests/pytest_plugin/test_hierarchy.py    | 889 ++++++++++++++++++
 .../_tests/util/test_report_context.py        |  58 +-
 .../_tests/util/test_test_results_utils.py    |  40 +
 python/lib/sift_client/pytest_plugin.py       | 506 +++++++++-
 .../sift_client/util/test_results/__init__.py |  11 +-
 .../util/test_results/context_manager.py      |  73 +-
 python/mkdocs.yml                             |   2 +
 python/pyproject.toml                         |   6 +-
 21 files changed, 2239 insertions(+), 152 deletions(-)
 create mode 100644 python/docs/examples/pytest_plugin_quickstart.md
 create mode 100644 python/examples/pytest_plugin/.env.example
 create mode 100644 python/examples/pytest_plugin/README.md
 create mode 100644 python/examples/pytest_plugin/conftest.py
 create mode 100644 python/examples/pytest_plugin/pytest.ini
 create mode 100644 python/examples/pytest_plugin/tests/pytest_only/__init__.py
 create mode 100644 python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
 create mode 100644 python/examples/pytest_plugin/tests/with_sift/__init__.py
 create mode 100644 python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/_fakes.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py

diff --git a/python/docs/examples/index.md b/python/docs/examples/index.md
index b6a964b35..936a35cfd 100644
--- a/python/docs/examples/index.md
+++ b/python/docs/examples/index.md
@@ -7,6 +7,7 @@ This section contains interactive Jupyter notebook examples demonstrating how to
 - **[Basic Usage](basic.ipynb)** - Introduction to the Sift Python client, covering basic operations and API usage
 - **[Data Ingestion](ingestion.ipynb)** - Learn how to ingest telemetry data into Sift using various methods
 - **[Pytest Plugin](pytest_plugin.md)** - Turn a pytest run into a Sift TestReport with measurements, nested steps, and pass/fail outcomes
+- **[Pytest Plugin Quickstart](pytest_plugin_quickstart.md)** - Guided tour of the runnable demo project under `python/examples/pytest_plugin/`
 
 ## Running Examples Locally
 
diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index 2ac298256..c464e564e 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -87,7 +87,8 @@ def sift_client() -> SiftClient:
 |---|---|---|---|
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
 | `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
-| `module_substep` | fixture (autouse) | module | One step per test file with each function nested as a substep. |
+| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently — see [ini options](#ini-options). |
+| `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
 | `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
 
 ### CLI options
@@ -118,6 +119,10 @@ CLI flags, when passed, override the ini values.
 | `sift_offline` | bool (default `false`) | `--sift-offline` |
 | `sift_disabled` | bool (default `false`) | `--sift-disabled` (also honors `SIFT_DISABLED` env var) |
 | `sift_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
+| `sift_package_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each Python package (directory with `__init__.py`) in the test path. |
+| `sift_module_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each test module (file). |
+| `sift_class_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each test class, including nested classes. |
+| `sift_parametrize_nesting` | bool (default `true`) | _(ini-only)_ — cluster parametrized tests under shared parents (`test_x → axis=value`) instead of flat leaves (`test_x[value]`). |
 
 The default `sift_client` fixture reads its two URIs from environment first
 and falls back to ini keys when the env vars are unset. `SIFT_API_KEY` is
@@ -302,8 +307,8 @@ outcomes into `TestStatus`:
 | Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
 
 A failure or error at any depth propagates upward: the parent substep, the
-function step, the module step (if `module_substep` is active), and the
-session report all get marked failed.
+function step, the class/module/package steps above it, and the session
+report all get marked failed.
 
 ## Nested steps
 
@@ -339,12 +344,14 @@ Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …)
 assigned by `ReportContext`. Sibling substeps within the same parent
 auto-increment; opening a new top-level step starts a new branch.
 
-### One step per file
+### Mirroring the test layout
 
-`module_substep` is autouse and module-scoped. When it's active (it's pulled
-in by the star-import in `conftest.py`), each file becomes a parent step and
-every function in it nests one level down. Its name is the test file's
-basename and its description is the module's docstring (if any).
+The plugin opens a parent step for each Python package (`__init__.py`
+directory), test file, and test class above every test, plus a parent step
+for each `@pytest.mark.parametrize` axis. Every layer is on by default and
+can be disabled individually via ini flags (`sift_package_step`,
+`sift_module_step`, `sift_class_step`, `sift_parametrize_nesting`).
+Class/module/package docstrings become the matching step's description.
 
 ### Linking a Run to the report
 
@@ -384,50 +391,43 @@ TestReport
 └── test_temperature
 ```
 
-### One step per file with `module_substep`
+### Modules nested under a package
 
-`module_substep` is autouse and module-scoped. Every file becomes a parent
-step and every function in it nests one level down.
+Two test files under the same Python package (directory with `__init__.py`)
+share that package step as their parent.
 
-```python title="test_battery.py"
+```python title="suites/__init__.py"
+```
+
+```python title="suites/test_battery.py"
 def test_voltage(step): ...
 def test_current(step): ...
 ```
 
-```python title="test_thermal.py"
+```python title="suites/test_thermal.py"
 def test_idle_temp(step): ...
 def test_load_temp(step): ...
 ```
 
 ```text title="Sift report"
 TestReport
-├── test_battery.py
-│   ├── test_voltage
-│   └── test_current
-└── test_thermal.py
-    ├── test_idle_temp
-    └── test_load_temp
+└── suites
+    ├── test_battery.py
+    │   ├── test_voltage
+    │   └── test_current
+    └── test_thermal.py
+        ├── test_idle_temp
+        └── test_load_temp
 ```
 
-### Test classes
+### Test classes (and nested classes)
 
-Pytest classes (`class TestFoo: ...`) do not create a parent step on their
-own. The plugin keys off the test node's `name`, which is just the method
-name. To group a class's methods under a class-level step, add a class-scoped
-fixture that opens a step with `report_context.new_step(...)`:
+`class TestFoo:` and `class TestOuter: class TestInner:` produce class and
+nested class steps automatically — no manual fixture needed.
 
 ```python title="test_charging.py"
-import pytest
-
-
 class TestCharging:
-    @pytest.fixture(scope="class", autouse=True)
-    def class_step(self, report_context):
-        with report_context.new_step(
-            name="TestCharging",
-            description="Charging subsystem",
-        ) as parent:
-            yield parent
+    """Charging subsystem."""
 
     def test_starts_at_zero(self, step): ...
     def test_reaches_full(self, step): ...
@@ -436,23 +436,20 @@ class TestCharging:
 
 ```text title="Sift report"
 TestReport
-└── TestCharging
-    ├── test_starts_at_zero
-    ├── test_reaches_full
-    └── test_thermal_throttle
+└── test_charging.py
+    └── TestCharging
+        ├── test_starts_at_zero
+        ├── test_reaches_full
+        └── test_thermal_throttle
 ```
 
-!!! note "Combining with `module_substep`"
-    `module_substep` and a class-scoped step both open at module/class scope,
-    so they each grab the next sibling slot under the report and the inner
-    one nests under the outer. If you want both layers (file → class →
-    method), make the class step itself open via the active outer step
-    rather than the report root.
+The class's docstring becomes the step description.
 
 ### Parametrized tests
 
-Each parametrize case is a distinct pytest node, so each gets its own step.
-The step name includes the parameter id pytest generates.
+Parametrized tests cluster under a parent step named after the test function,
+with one inner parent per parametrize axis, nested outer-to-inner in the order
+the decorators are written. Stacked parametrize produces nested step levels.
 
 ```python
 @pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
@@ -462,11 +459,36 @@ def test_rail(step, voltage):
 
 ```text title="Sift report"
 TestReport
-├── test_rail[3.3]
-├── test_rail[5.0]
-└── test_rail[12.0]
+└── test_module.py
+    └── test_rail
+        ├── voltage=3.3
+        ├── voltage=5.0
+        └── voltage=12.0
 ```
 
+Stacked parametrize:
+
+```python
+@pytest.mark.parametrize("voltage", ["high", "low"])
+@pytest.mark.parametrize("component", ["motor", "valve"])
+def test_iso(step, voltage, component): ...
+```
+
+```text title="Sift report"
+TestReport
+└── test_module.py
+    └── test_iso
+        ├── voltage='high'
+        │   ├── component='motor'
+        │   └── component='valve'
+        └── voltage='low'
+            ├── component='motor'
+            └── component='valve'
+```
+
+Set `sift_parametrize_nesting = false` in `pytest.ini` to fall back to flat
+leaf names (`test_rail[3.3]`).
+
 ### Helper functions
 
 Helpers called from a test do not auto-create a step. The plugin only sees
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
new file mode 100644
index 000000000..54328c707
--- /dev/null
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -0,0 +1,177 @@
+# Pytest Plugin Quickstart
+
+A walkthrough of the runnable demo at
+[`python/examples/pytest_plugin/`](https://github.com/sift-stack/sift/tree/main/python/examples/pytest_plugin).
+The demo is a self-contained pytest project that exercises every layer of the
+plugin's step tree: packages, modules, classes (including nested), parametrize
+axes, manual substeps, and gate markers. It also includes a tests directory
+that uses no Sift APIs at all, to show how the autouse fixtures capture plain
+pytest tests for free.
+
+For a conceptual reference (fixtures, ini flags, status semantics), see
+[Pytest Plugin](pytest_plugin.md).
+
+## Project layout
+
+```
+examples/pytest_plugin/
+├── conftest.py                            # registers the plugin
+├── pytest.ini                             # available ini knobs (all commented at defaults)
+├── .env.example                           # credential template
+└── tests/
+    ├── pytest_only/                       # subpackage step
+    │   ├── __init__.py
+    │   └── test_pytest_only_demo.py       # plain pytest, no Sift APIs
+    └── with_sift/                         # subpackage step
+        ├── __init__.py
+        └── test_with_sift_demo.py         # measurements, substeps, classes, parametrize, gates
+```
+
+Every Python package (directory with `__init__.py`), test file, and test class
+that contains a test becomes its own parent step in the report tree.
+
+## `conftest.py`
+
+A single `pytest_plugins` declaration loads the plugin; `load_dotenv()` is
+optional and just lets the default `sift_client` fixture pick up
+`SIFT_API_KEY` / `SIFT_GRPC_URI` / `SIFT_REST_URI` from a local `.env`.
+
+```python title="conftest.py"
+--8<-- "examples/pytest_plugin/conftest.py"
+```
+
+## `pytest.ini`
+
+Every knob is commented out at its default value. To opt out of a behavior (for
+example, one layer of the step tree), uncomment its line and set it to `false`.
+
+```ini title="pytest.ini"
+--8<-- "examples/pytest_plugin/pytest.ini"
+```
+
+## `.env.example`
+
+```bash title=".env.example"
+--8<-- "examples/pytest_plugin/.env.example"
+```
+
+## The pytest_only module
+
+Plain pytest tests with no `sift_client` imports, no `step` fixture, no
+markers. Each one still becomes a leaf step in the report tree. The plugin's
+autouse fixtures capture pass/fail automatically.
+
+```python title="tests/pytest_only/test_pytest_only_demo.py"
+--8<-- "examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py"
+```
+
+## The with_sift module
+
+Exercises the plugin's full surface: numeric / string / bool bounds, nested
+`step.substep`, `@pytest.mark.sift_exclude`, class steps with docstring
+descriptions, nested classes, stacked `@pytest.mark.parametrize`, and
+`step.report_outcome`.
+
+```python title="tests/with_sift/test_with_sift_demo.py"
+--8<-- "examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py"
+```
+
+## Run it
+
+### Without Sift credentials
+
+```bash
+cd python/examples/pytest_plugin
+pytest --sift-disabled -v
+```
+
+`--sift-disabled` makes the plugin a no-op transport: `step.measure(...)`
+still evaluates bounds and returns a real pass/fail boolean, but nothing
+contacts Sift and no log file is written. Useful for previewing the report
+tree or unit-testing measurement logic.
+
+### Against a real Sift org
+
+```bash
+cp .env.example .env
+# Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
+pytest -v
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+### Offline (record now, replay later)
+
+```bash
+pytest --sift-offline --sift-log-file=/tmp/sift-demo.jsonl -v
+# Later, from anywhere with credentials:
+import-test-result-log /tmp/sift-demo.jsonl
+```
+
+## Expected report tree
+
+With the plugin's defaults (every layer enabled), the demo produces:
+
+```
+TestReport (FAILED, since failures propagate up from leaves)
+├── pytest_only                         ← package step (FAILED)
+│   └── test_pytest_only_demo.py        ← module step (FAILED)
+│       ├── test_passes                                              PASSED
+│       ├── test_uses_a_pytest_fixture                               PASSED
+│       ├── test_assertion_failure_marks_step_failed                 FAILED
+│       ├── test_skipped                                             SKIPPED
+│       ├── test_unexpected_exception_marks_step_errored             ERROR
+│       ├── test_parametrize_without_step
+│       │   ├── value='v1'                                           PASSED
+│       │   └── value='v2'                                           PASSED
+│       └── TestPytestClass
+│           └── test_method                                          PASSED
+└── with_sift                           ← package step (FAILED)
+    └── test_with_sift_demo.py          ← module step (FAILED)
+        ├── test_measurements                                        PASSED
+        ├── test_substeps                                            PASSED
+        │   ├── phase_1
+        │   └── phase_2
+        │       └── phase_2a
+        │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
+        ├── test_measure_series                                      PASSED
+        ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
+        ├── test_assert_measurements_passed_at_end                   FAILED  (pytest FAILED)
+        ├── test_report_level_metadata                               PASSED
+        └── TestClassStep
+            ├── test_parametrize
+            │   ├── axis_a='a1'
+            │   │   ├── axis_b='b1'                                  PASSED
+            │   │   └── axis_b='b2'                                  PASSED
+            │   └── axis_a='a2'
+            │       ├── axis_b='b1'                                  PASSED
+            │       └── axis_b='b2'                                  PASSED
+            └── TestNested
+                └── test_report_outcome
+                    └── check                                        PASSED
+```
+
+The `pytest_only` module deliberately includes one failing, one skipped, and
+one erroring test so the demo shows every `TestStatus` mapping (`FAILED` for
+assertions, `SKIPPED` for `pytest.skip`, `ERROR` for any other exception).
+The `with_sift` module shows two patterns for handling measurement results:
+`test_failed_measurement_marks_sift_step_failed` lets the test keep passing
+in pytest while the Sift step is `FAILED` (useful when measurements are
+diagnostic data you want to collect regardless of outcome); and
+`test_assert_measurements_passed_at_end` takes every measurement first and
+then asserts `step.measurements_passed` once at the end, so every
+measurement still lands in the report even when one fails. The end-of-test
+assertion is the recommended pattern: asserting on an individual
+`step.measure(...)` call short-circuits on the first failure and skips
+every measurement that follows. Expected
+pytest output is `16 passed, 3 failed, 1 skipped`.
+
+Flip any of the `sift_*_step` / `sift_parametrize_nesting` flags in
+`pytest.ini` to `false` to collapse a layer.
+
+## Next steps
+
+- [Pytest Plugin](pytest_plugin.md): conceptual reference covering fixtures,
+  ini flags, status semantics, and layout-mapping examples.
+- The demo's [README](https://github.com/sift-stack/sift/blob/main/python/examples/pytest_plugin/README.md)
+  on GitHub mirrors this page and is the canonical source.
diff --git a/python/examples/pytest_plugin/.env.example b/python/examples/pytest_plugin/.env.example
new file mode 100644
index 000000000..a8c028598
--- /dev/null
+++ b/python/examples/pytest_plugin/.env.example
@@ -0,0 +1,3 @@
+SIFT_API_KEY=your-api-key
+SIFT_GRPC_URI=your-org.grpc.example.com
+SIFT_REST_URI=https://your-org.rest.example.com
diff --git a/python/examples/pytest_plugin/README.md b/python/examples/pytest_plugin/README.md
new file mode 100644
index 000000000..c74a9c939
--- /dev/null
+++ b/python/examples/pytest_plugin/README.md
@@ -0,0 +1,119 @@
+# Pytest plugin demo
+
+A self-contained pytest project that exercises every feature of
+`sift_client.pytest_plugin`: package / module / class / parametrize step
+nesting, nested classes, manual substeps, `step.measure(...)` against
+numeric / string / bool bounds, gate markers, and the ini opt-outs.
+
+```
+examples/pytest_plugin/
+├── conftest.py                            # registers the plugin
+├── pytest.ini                             # available ini knobs (all commented at defaults)
+├── .env.example                           # credential template (copy to .env for local runs)
+└── tests/
+    ├── pytest_only/                       # subpackage step: `pytest_only` opens a parent step
+    │   ├── __init__.py
+    │   └── test_pytest_only_demo.py       # plain pytest tests with no Sift APIs
+    └── with_sift/                         # subpackage step: `with_sift` opens a parent step
+        ├── __init__.py
+        └── test_with_sift_demo.py         # measurements, substeps, classes, nested classes,
+                                            # stacked parametrize, sift_exclude marker
+```
+
+Every layer of organization shows up in the report tree: Python packages
+(directories with `__init__.py`), modules (test files), classes (including
+nested classes), and parametrize axes each open a parent step. Flip
+`sift_package_step`, `sift_module_step`, `sift_class_step`, or
+`sift_parametrize_nesting` to `false` in `pytest.ini` to disable this behavior.
+
+## Run it
+
+**Against a real Sift org**:
+
+```bash
+cp .env.example .env
+# Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
+pytest -v
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+**Offline (record now, replay later - intended for offline environments)**:
+
+```bash
+pytest --sift-offline --sift-log-file=/tmp/sift-demo.jsonl -v
+# Later, from anywhere with credentials:
+import-test-result-log /tmp/sift-demo.jsonl
+```
+
+## What the report tree looks like
+
+With the plugin's defaults (everything in `pytest.ini` left commented), running
+this demo produces a tree like:
+
+```
+TestReport (FAILED, since failures propagate up from leaves)
+├── pytest_only                         ← package step (FAILED)
+│   └── test_pytest_only_demo.py        ← module step (FAILED)
+│       ├── test_passes                                              PASSED
+│       ├── test_uses_a_pytest_fixture                               PASSED
+│       ├── test_assertion_failure_marks_step_failed                 FAILED
+│       ├── test_skipped                                             SKIPPED
+│       ├── test_unexpected_exception_marks_step_errored             ERROR
+│       ├── test_parametrize_without_step
+│       │   ├── value='v1'                                           PASSED
+│       │   └── value='v2'                                           PASSED
+│       └── TestPytestClass
+│           └── test_method                                          PASSED
+└── with_sift                           ← package step (FAILED)
+    └── test_with_sift_demo.py          ← module step (FAILED)
+        ├── test_measurements                                        PASSED
+        ├── test_substeps                                            PASSED
+        │   ├── phase_1
+        │   └── phase_2
+        │       └── phase_2a
+        │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
+        ├── test_measure_series                                      PASSED
+        ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
+        ├── test_assert_measurements_passed_at_end                   FAILED  (pytest FAILED)
+        ├── test_report_level_metadata                               PASSED
+        └── TestClassStep
+            ├── test_parametrize
+            │   ├── axis_a='a1'
+            │   │   ├── axis_b='b1'                                  PASSED
+            │   │   └── axis_b='b2'                                  PASSED
+            │   └── axis_a='a2'
+            │       ├── axis_b='b1'                                  PASSED
+            │       └── axis_b='b2'                                  PASSED
+            └── TestNested
+                └── test_report_outcome
+                    └── check                                        PASSED
+```
+
+The `pytest_only` module deliberately includes one failing, one skipped, and
+one erroring test so the demo shows every `TestStatus` mapping (`FAILED` for
+assertions, `SKIPPED` for `pytest.skip`, `ERROR` for any other exception).
+The `with_sift` module shows two patterns for handling measurement results:
+`test_failed_measurement_marks_sift_step_failed` lets the test keep passing
+in pytest while the Sift step is `FAILED` (useful when measurements are
+diagnostic data you want to collect regardless of outcome); and
+`test_assert_measurements_passed_at_end` takes every measurement first and
+then asserts `step.measurements_passed` once at the end, so every
+measurement still lands in the report even when one fails. The end-of-test
+assertion is the recommended pattern: asserting on an individual
+`step.measure(...)` call short-circuits on the first failure and skips
+every measurement that follows. Expected
+pytest output is `16 passed, 3 failed, 1 skipped`.
+
+Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
+`pytest.ini` to `false` to collapse a layer.
+
+## What each file demonstrates
+
+| File | Feature |
+|---|---|
+| `conftest.py` | Plugin registration via `pytest_plugins`; optional `load_dotenv()` |
+| `pytest.ini` | `sift_autouse`, the four nesting flags, and the git metadata flag, all commented out at their defaults |
+| `tests/pytest_only/test_pytest_only_demo.py` | Plain pytest tests with no Sift APIs. The plugin captures pass/fail automatically; covers functions, fixtures, parametrize, classes, plus one each of `AssertionError` (FAILED), `pytest.skip` (SKIPPED), and a raised `ValueError` (ERROR) |
+| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `assert step.measurements_passed` end-of-test pattern that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
+| `tests/{pytest_only,with_sift}/__init__.py` | Each Python package (directory with `__init__.py`) becomes a parent step in the report tree |
diff --git a/python/examples/pytest_plugin/conftest.py b/python/examples/pytest_plugin/conftest.py
new file mode 100644
index 000000000..88253bd73
--- /dev/null
+++ b/python/examples/pytest_plugin/conftest.py
@@ -0,0 +1,15 @@
+"""Project-level conftest for the pytest plugin demo.
+
+A single ``pytest_plugins`` declaration is enough to load the plugin — its
+fixtures, hooks, and CLI options register through standard pytest machinery
+from there. ``load_dotenv()`` is optional; it just lets the default
+``sift_client`` fixture pick up ``SIFT_API_KEY`` / ``SIFT_GRPC_URI`` /
+``SIFT_REST_URI`` from a local ``.env`` when running against a real Sift org.
+These can also be set as environment variables using your preferred method.
+"""
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
diff --git a/python/examples/pytest_plugin/pytest.ini b/python/examples/pytest_plugin/pytest.ini
new file mode 100644
index 000000000..90a1a824b
--- /dev/null
+++ b/python/examples/pytest_plugin/pytest.ini
@@ -0,0 +1,11 @@
+[pytest]
+# Defaults give you the full step tree: every package, module, class, and
+# parametrize axis becomes a parent step. These are the available ini options
+# and their defaults.
+#
+# sift_autouse = true              # autouse fixtures (default: true)
+# sift_package_step = true         # Python package (dir with __init__.py) parent step (default: true)
+# sift_module_step = true          # module (test file) parent step (default: true)
+# sift_class_step = true           # class parent step incl. nested (default: true)
+# sift_parametrize_nesting = true  # parametrize parent steps (default: true)
+# sift_git_metadata = true         # git repo/branch/commit included on the report (default: true)
diff --git a/python/examples/pytest_plugin/tests/pytest_only/__init__.py b/python/examples/pytest_plugin/tests/pytest_only/__init__.py
new file mode 100644
index 000000000..939562d5f
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/pytest_only/__init__.py
@@ -0,0 +1,7 @@
+"""Subpackage of plain pytest tests with no Sift awareness.
+
+Demonstrates that the plugin captures any test's pass/fail with no opt-in
+needed — no ``step`` fixture, no markers, no imports from ``sift_client``.
+The package directory itself becomes a parent step in the report tree (via
+``sift_package_step``, on by default).
+"""
diff --git a/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py b/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
new file mode 100644
index 000000000..77790d301
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
@@ -0,0 +1,49 @@
+"""Plain pytest tests are automatically captured by the plugin as steps.
+
+No imports from ``sift_client`` or fixture usage required. Each test
+becomes a step in the report tree: passing tests resolve to ``PASSED``,
+failing tests to ``FAILED``. This allows integrating existing tests
+with Sift Test Results without modification.
+"""
+
+import pytest
+
+
+def test_passes():
+    """Functions become steps in the report tree. The function docstring is used as the step description."""
+    assert 1 + 1 == 2
+
+
+@pytest.mark.parametrize("value", ["v1", "v2"])
+def test_parametrize_without_step(value):
+    """Parametrized tests are nested under a common step with sub steps for each permutation."""
+    assert value.startswith("v")
+
+
+class TestPytestClass:
+    """Test classes are turned into parent steps for their methods. Class docstrings are used as the step description."""
+
+    def test_method(self):
+        assert True
+
+
+def test_uses_a_pytest_fixture(tmp_path):
+    """Normal pytest fixtures keep working; the plugin doesn't intercept them."""
+    (tmp_path / "marker").write_text("ok")
+    assert (tmp_path / "marker").read_text() == "ok"
+
+
+def test_assertion_failure_marks_step_failed():
+    """An ``AssertionError`` resolves the Sift step as ``FAILED`` (no traceback attached)."""
+    assert 1 + 1 == 3
+
+
+@pytest.mark.skip(reason="Demonstrating the skip outcome")
+def test_skipped():
+    """Skipped tests resolve as ``SKIPPED`` in the Sift report."""
+    pass
+
+
+def test_unexpected_exception_marks_step_errored():
+    """Non-``AssertionError`` exceptions resolve the Sift step as ``ERROR`` with the traceback attached."""
+    raise ValueError("simulated environmental failure")
diff --git a/python/examples/pytest_plugin/tests/with_sift/__init__.py b/python/examples/pytest_plugin/tests/with_sift/__init__.py
new file mode 100644
index 000000000..6fd60c38d
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/with_sift/__init__.py
@@ -0,0 +1,8 @@
+"""Subpackage of tests that use the Sift plugin APIs explicitly.
+
+Demonstrates ``step.measure`` (numeric / string / bool bounds), nested
+``step.substep``, gate markers, class and nested-class step nesting, stacked
+parametrize, and ``step.report_outcome``. The package directory itself
+becomes a parent step in the report tree (via ``sift_package_step``, on by
+default).
+"""
diff --git a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
new file mode 100644
index 000000000..34bf602b7
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
@@ -0,0 +1,159 @@
+"""End-to-end demo of the test-results features: measurements, substeps,
+exclusion, classes, nested classes, and stacked parametrize."""
+
+import pytest
+
+
+def test_measurements(step) -> None:
+    """Measurements are the first-class method for recording numeric, string, or bool bounds criteria and their outcomes. These show up in report steps.
+    ``step.measure`` accepts numeric (min/max), string, or bool bounds.
+    Choose names that provide sufficient context but are general enough that similar/identical measurements
+    across steps or reports can be compared.
+    """
+    step.measure(name="numeric_value", value=1.5, bounds={"min": 0.0, "max": 2.0})
+    step.measure(name="string_label", value="ok", bounds="ok")
+    step.measure(name="bool_flag", value=True, bounds=True)
+
+    # Descriptions and metadata can also be provided to measurements.
+    step.measure(
+        name="numeric_value_2",
+        value=1.5,
+        bounds={"min": 0.0, "max": 2.0},
+        description="Numeric that represents X, Y, Z",
+        metadata={"subsystem": "A"},
+    )
+
+    # If you plan to link the pytest report to a Sift Run, you can also assign related channels for easy plotting in the app
+    step.measure(
+        name="numeric_value",
+        value=1.5,
+        bounds={"min": 0.0, "max": 2.0},
+        channel_names=["channel_1", "channel_2"],
+    )
+
+
+def test_substeps(step) -> None:
+    """``step.substep(...)`` opens child steps inside one test; substeps nest arbitrarily.
+    This can be useful for grouping related measurements or for creating a more natural report structure
+    without the need to create a new test, class, etc.
+
+    Metadata can be attached at the step level by passing ``metadata=...`` to
+    ``substep``; the same keyword is accepted by ``report_context.new_step``
+    and propagates to the resulting ``TestStep``.
+    """
+    with step.substep(name="phase_1", metadata={"phase_index": 1}) as s1:
+        s1.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    with step.substep(name="phase_2", metadata={"phase_index": 2}) as s2:
+        with s2.substep(name="phase_2a") as s2a:
+            s2a.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+
+def test_measure_series(step) -> None:
+    """``measure_avg`` and ``measure_all`` are the series variants of ``measure``.
+
+    Both accept a list, numpy array, or pandas series of numeric values.
+    ``measure_avg`` records one row holding the mean of the series and
+    bounds-checks it. ``measure_all`` evaluates every value individually and
+    records one row per out-of-bounds element (in-bounds values are NOT
+    recorded, keeping the report compact).
+    """
+    voltages = [4.95, 5.02, 5.01, 4.98, 5.00]
+    step.measure_avg(
+        name="voltage_mean",
+        values=voltages,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+    # All values are in-bounds here, so measure_all records nothing extra;
+    # change one to e.g. 6.0 to see an out-of-bounds row appear.
+    step.measure_all(
+        name="voltage_samples",
+        values=voltages,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+
+
+def test_failed_measurement_marks_sift_step_failed(step) -> None:
+    """An out-of-bounds measurement marks the Sift step as ``FAILED``
+    without raising. The pytest test still passes (no assertion, no
+    exception); the Sift report records bounds compliance while pytest
+    records control flow.
+
+    Use this pattern when measurements are diagnostic data you want to
+    collect alongside the test result, even when some readings fall outside
+    spec. See ``test_assert_measurements_passed_at_end`` below for the recommended way
+    to also fail pytest when any measurement is out of bounds.
+    """
+    step.measure(
+        name="voltage",
+        value=99.0,  # outside the bounds below; marks the step FAILED in Sift
+        bounds={"min": 0.0, "max": 10.0},
+        unit="V",
+    )
+
+
+def test_assert_measurements_passed_at_end(step) -> None:
+    """Recommended pattern: take every measurement first, then assert
+    ``step.measurements_passed`` once at the end.
+
+    Asserting on individual ``step.measure(...)`` calls raises
+    ``AssertionError`` on the first failure, so any measurements after the
+    failing one never run and never land in the Sift report. The end-of-test
+    assertion is strictly better for diagnostic completeness: every
+    measurement is recorded, including the failures, and the aggregate
+    result is then folded into the pytest outcome.
+
+    The ``b`` measurement below is deliberately out of bounds. ``c`` still
+    runs and is recorded; only the final ``assert`` fires.
+    """
+    step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
+    step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})  # out of bounds
+    step.measure(name="c", value=1.5, bounds={"min": 0.0, "max": 2.0})  # still recorded
+    assert step.measurements_passed, "one or more measurements out of bounds"
+
+
+def test_report_level_metadata(step, report_context) -> None:
+    """Attach metadata to the run-wide ``TestReport`` via ``report_context.report.update(...)``.
+
+    The same ``update({...})`` pattern works for any field on
+    ``TestReportUpdate`` (``run_id``, ``serial_number``, ``part_number``,
+    ``system_operator``, ``metadata``, ...). Useful for linking a session
+    to a Sift Run or tagging the report with build / operator info.
+    """
+    report_context.report.update(
+        {
+            "metadata": {
+                "build_id": "v1.2.3",
+                "operator": "ci",
+            }
+        }
+    )
+    step.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+
+@pytest.mark.sift_exclude
+def test_excluded() -> None:
+    """``sift_exclude`` runs the test in pytest but produces no Sift step."""
+    assert True
+
+
+class TestClassStep:
+    """A test class becomes its own step in the report tree.
+
+    This docstring becomes the description of the ``TestClassStep`` step.
+    """
+
+    @pytest.mark.parametrize("axis_a", ["a1", "a2"])
+    @pytest.mark.parametrize("axis_b", ["b1", "b2"])
+    def test_parametrize(self, step, axis_a: str, axis_b: str) -> None:
+        """Stacked parametrize nests outer-to-inner in decorator-on-page order."""
+        step.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    class TestNested:
+        """Nested classes produce nested class steps."""
+
+        def test_report_outcome(self, step) -> None:
+            """``step.report_outcome`` records a non-numeric pass/fail substep."""
+            step.report_outcome(name="check", result=True, reason="value matched")
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_fakes.py b/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
new file mode 100644
index 000000000..460100daa
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
@@ -0,0 +1,132 @@
+"""Test doubles for the pytester-driven pytest-plugin tests.
+
+The fake ``ReportContext`` is a drop-in for the real one that records every
+step creation to a JSON file at session exit. Used by ``test_parametrize.py``
+to assert the step tree produced by an inner pytester pytest run.
+"""
+
+from __future__ import annotations
+
+import itertools
+import json
+from typing import TYPE_CHECKING, Any
+from unittest.mock import MagicMock
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class FakeStep:
+    def __init__(self, id_: str, name: str, parent_step_id: str | None, step_path: str) -> None:
+        self.id_ = id_
+        self.name = name
+        self.parent_step_id = parent_step_id
+        self.step_path = step_path
+        self.status: Any = None
+        self.description: Any = None
+        self.error_info: Any = None
+
+    def update(self, fields: dict[str, Any]) -> None:
+        for k, v in fields.items():
+            setattr(self, k, v)
+
+
+class FakeReport:
+    def __init__(self) -> None:
+        self.id_ = "report-id"
+
+    def update(self, fields: dict[str, Any]) -> None:
+        pass
+
+
+class FakeReportContext:
+    def __init__(self, steps_file: Path) -> None:
+        self.steps_file = steps_file
+        self.report = FakeReport()
+        self.client = MagicMock()
+        self.step_stack: list[FakeStep] = []
+        self.step_number_at_depth: dict[int, int] = {}
+        self.open_step_results: dict[str, bool] = {}
+        self.any_failures = False
+        self.log_file: Path | None = None
+        self.steps: list[dict[str, Any]] = []
+        self._ids = itertools.count(1)
+
+    def __enter__(self) -> FakeReportContext:
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.steps_file.write_text(json.dumps(self.steps))
+
+    def new_step(
+        self,
+        name: str,
+        description: str | None = None,
+        assertion_as_fail_not_error: bool = True,
+        metadata: dict[str, Any] | None = None,
+    ) -> Any:
+        # Reuse the real NewStep machinery — it talks to this fake via the
+        # methods below.
+        from sift_client.util.test_results.context_manager import NewStep
+
+        return NewStep(
+            self,  # type: ignore[arg-type]
+            name=name,
+            description=description,
+            assertion_as_fail_not_error=assertion_as_fail_not_error,
+            metadata=metadata,
+        )
+
+    def get_next_step_path(self) -> str:
+        top = self.step_stack[-1] if self.step_stack else None
+        path = top.step_path if top else ""
+        next_n = self.step_number_at_depth.get(len(self.step_stack), 0) + 1
+        prefix = f"{path}." if path else ""
+        return f"{prefix}{next_n}"
+
+    def create_step(
+        self,
+        name: str,
+        description: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> FakeStep:
+        step_path = self.get_next_step_path()
+        parent = self.step_stack[-1] if self.step_stack else None
+        step = FakeStep(
+            id_=f"step-{next(self._ids)}",
+            name=name,
+            parent_step_id=parent.id_ if parent else None,
+            step_path=step_path,
+        )
+        self.step_number_at_depth[len(self.step_stack)] = (
+            self.step_number_at_depth.get(len(self.step_stack), 0) + 1
+        )
+        self.step_stack.append(step)
+        self.open_step_results[step.step_path] = True
+        self.steps.append(
+            {
+                "id": step.id_,
+                "name": name,
+                "parent_step_id": step.parent_step_id,
+                "step_path": step_path,
+            }
+        )
+        return step
+
+    def record_step_outcome(self, outcome: bool, step: FakeStep) -> None:
+        if not outcome:
+            self.open_step_results[step.step_path] = False
+            self.any_failures = True
+
+    def resolve_and_propagate_step_result(self, step: FakeStep, error_info: Any = None) -> bool:
+        result = self.open_step_results.get(step.step_path, True)
+        if error_info:
+            result = False
+        return result
+
+    def exit_step(self, step: FakeStep) -> None:
+        self.step_number_at_depth[len(self.step_stack)] = 0
+        stack_top = self.step_stack.pop()
+        self.open_step_results.pop(step.step_path)
+        if stack_top.id_ != step.id_:
+            raise ValueError("popped step was not the top of the stack")
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
index cba4bc1ee..90a5fcb56 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
@@ -109,20 +109,19 @@ def test_disabled_yields_stub_fixtures(
         clear_sift_env: None,
         write_plugin_conftest: Callable[[], None],
     ) -> None:
-        """`report_context` / `step` / `module_substep` are real instances backed by a simulate client."""
+        """`report_context` / `step` are real instances backed by a simulate client."""
         write_plugin_conftest()
         pytester.makepyfile(
             """
             from sift_client.util.test_results import ReportContext
             from sift_client.util.test_results.context_manager import NewStep
 
-            def test_types(step, report_context, module_substep):
+            def test_types(step, report_context):
                 assert isinstance(report_context, ReportContext)
                 assert report_context.is_simulated is True
                 assert report_context.report.is_simulated is True
                 assert step.current_step.is_simulated is True
                 assert isinstance(step, NewStep)
-                assert isinstance(module_substep, NewStep)
             """
         )
         result = pytester.runpytest_subprocess("--sift-disabled")
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
new file mode 100644
index 000000000..cecad2df8
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -0,0 +1,889 @@
+"""Tests for the plugin's hierarchy-step nesting behavior.
+
+Covers every layer the plugin opens parent steps for — packages, modules,
+classes (including nested), parametrize axes — plus the ini opt-out flags,
+failure-cleanup semantics, and the drain helper.
+
+Each test spins up an inner pytest run via ``pytester`` whose conftest swaps
+in a ``FakeReportContext`` (from ``_fakes.py``) that records every step
+creation to a JSON file. The outer test reads that file and asserts the
+resulting step tree.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path as _Path
+from textwrap import dedent
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+_STEPS_FILE_ENV = "SIFT_FAKE_STEPS_FILE"
+
+# ``_fakes.py`` is excluded from the wheel by ``pyproject.toml``'s
+# ``packages.find`` rule that strips ``sift_client._tests``. The inner
+# pytester subprocess uses the installed package and cannot import from
+# ``sift_client._tests``. Embed the fake source directly into the inner
+# conftest so the subprocess gets a fully self-contained module to load.
+_FAKES_SOURCE = (_Path(__file__).parent / "_fakes.py").read_text()
+
+_INNER_CONFTEST = f"""
+{_FAKES_SOURCE}
+
+import os
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+@pytest.fixture(scope="session")
+def sift_client():
+    return MagicMock()
+
+
+@pytest.fixture(scope="session", autouse=True)
+def report_context(sift_client):
+    import sift_client.pytest_plugin as plugin_module
+    steps_file = Path(os.environ[{_STEPS_FILE_ENV!r}])
+    with FakeReportContext(steps_file) as ctx:
+        plugin_module.REPORT_CONTEXT = ctx
+        yield ctx
+"""
+
+
+@pytest.fixture
+def steps_file(pytester: pytest.Pytester, monkeypatch: pytest.MonkeyPatch) -> Path:
+    path = pytester.path / "captured_steps.json"
+    pytester.makeconftest(_INNER_CONFTEST)
+    monkeypatch.setenv(_STEPS_FILE_ENV, str(path))
+    return path
+
+
+def _by_name(steps: list[dict]) -> dict[str, list[dict]]:
+    out: dict[str, list[dict]] = {}
+    for s in steps:
+        out.setdefault(s["name"], []).append(s)
+    return out
+
+
+def _ancestor_names(steps: list[dict], leaf: dict) -> list[str]:
+    """Return step names from ``leaf`` up to the root, following ``parent_step_id`` links."""
+    index = {s["id"]: s for s in steps}
+    names = [leaf["name"]]
+    parent_id = leaf["parent_step_id"]
+    while parent_id and parent_id in index:
+        node = index[parent_id]
+        names.append(node["name"])
+        parent_id = node["parent_step_id"]
+    return names
+
+
+def test_class_methods_cluster_under_class_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_klass=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert len(by_name["TestFoo"]) == 1
+    class_id = by_name["TestFoo"][0]["id"]
+    assert by_name["test_a"][0]["parent_step_id"] == class_id
+    assert by_name["test_b"][0]["parent_step_id"] == class_id
+
+
+def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, steps_file: Path) -> None:
+    pytester.makepyfile(
+        test_nested=dedent(
+            """
+            class TestOuter:
+                class TestInner:
+                    def test_a(self):
+                        pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert len(by_name["TestOuter"]) == 1
+    assert len(by_name["TestInner"]) == 1
+    leaf = by_name["test_a"][0]
+    assert _ancestor_names(steps, leaf) == [
+        "test_a",
+        "TestInner",
+        "TestOuter",
+        "test_nested.py",
+    ]
+
+
+def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, steps_file: Path) -> None:
+    pytester.makepyfile(
+        test_cp=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    class_id = by_name["TestFoo"][0]["id"]
+    test_a_id = by_name["test_a"][0]["id"]
+    assert by_name["test_a"][0]["parent_step_id"] == class_id
+    assert by_name["v=1"][0]["parent_step_id"] == test_a_id
+    assert by_name["v=2"][0]["parent_step_id"] == test_a_id
+
+
+def test_two_sibling_classes_in_module(pytester: pytest.Pytester, steps_file: Path) -> None:
+    pytester.makepyfile(
+        test_sib=dedent(
+            """
+            class TestA:
+                def test_x(self):
+                    pass
+
+            class TestB:
+                def test_y(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    mod_id = by_name["test_sib.py"][0]["id"]
+    assert by_name["TestA"][0]["parent_step_id"] == mod_id
+    assert by_name["TestB"][0]["parent_step_id"] == mod_id
+    # Sanity: each class is opened exactly once (no duplicate parents).
+    assert len(by_name["TestA"]) == 1
+    assert len(by_name["TestB"]) == 1
+
+
+def test_mixed_class_and_free_function(pytester: pytest.Pytester, steps_file: Path) -> None:
+    pytester.makepyfile(
+        test_mix=dedent(
+            """
+            class TestA:
+                def test_x(self):
+                    pass
+
+            def test_free():
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    mod_id = by_name["test_mix.py"][0]["id"]
+    # Class method parents to TestA; free function parents directly to module.
+    assert by_name["TestA"][0]["parent_step_id"] == mod_id
+    assert by_name["test_x"][0]["parent_step_id"] == by_name["TestA"][0]["id"]
+    assert by_name["test_free"][0]["parent_step_id"] == mod_id
+
+
+def test_class_with_all_excluded_methods_no_class_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_excl=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.sift_exclude
+                def test_a(self):
+                    pass
+
+                @pytest.mark.sift_exclude
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert "TestFoo" not in by_name
+    assert "test_a" not in by_name
+    assert "test_b" not in by_name
+
+
+def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, steps_file: Path) -> None:
+    """A class-level ``sift_exclude`` marker suppresses the class step and every method step."""
+    pytester.makepyfile(
+        test_clsexcl=dedent(
+            """
+            import pytest
+
+            @pytest.mark.sift_exclude
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    pytester.runpytest_subprocess("-v").assert_outcomes(passed=2)
+    by_name = _by_name(json.loads(steps_file.read_text()))
+    assert "TestFoo" not in by_name
+    assert "test_a" not in by_name
+    assert "test_b" not in by_name
+
+
+def test_class_docstring_becomes_step_description(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_doc=dedent(
+            '''
+            class TestFoo:
+                """Class docstring."""
+
+                def test_a(self):
+                    pass
+            '''
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # The fake's ``create_step`` records only id/name/parent/path, so the
+    # description text is not observable in the captured JSON. Until the fake
+    # records it, this test can only confirm that a documented class still
+    # yields the expected class step in the leaf's ancestor chain.
+    leaf = by_name["test_a"][0]
+    assert _ancestor_names(steps, leaf)[:3] == ["test_a", "TestFoo", "test_doc.py"]
+
+
+def test_transition_between_class_chains_drains_parametrize(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_trans=dedent(
+            """
+            import pytest
+
+            class TestA:
+                @pytest.mark.parametrize("v", [1])
+                def test_x(self, v):
+                    pass
+
+            class TestB:
+                @pytest.mark.parametrize("w", [2])
+                def test_y(self, w):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Each class opens exactly once; parametrize parents under the right class.
+    assert len(by_name["TestA"]) == 1
+    assert len(by_name["TestB"]) == 1
+    test_x_id = by_name["test_x"][0]["id"]
+    test_y_id = by_name["test_y"][0]["id"]
+    assert by_name["v=1"][0]["parent_step_id"] == test_x_id
+    assert by_name["w=2"][0]["parent_step_id"] == test_y_id
+    # Confirm full chain: leaves trace up through correct class.
+    chain_x = _ancestor_names(steps, by_name["v=1"][0])
+    chain_y = _ancestor_names(steps, by_name["w=2"][0])
+    assert "TestA" in chain_x
+    assert "TestB" not in chain_x
+    assert "TestB" in chain_y
+    assert "TestA" not in chain_y
+
+
+# ---------------------------------------------------------------------------
+# Failure-cleanup tests
+# ---------------------------------------------------------------------------
+
+
+def test_drain_step_stack_continues_past_failing_exit() -> None:
+    """Lenient mode: a misbehaving ``__exit__`` must not block the rest of the stack."""
+    from sift_client.pytest_plugin import (
+        SiftPytestStepDrainWarning,
+        _drain_step_stack,
+    )
+
+    class _Good:
+        def __init__(self) -> None:
+            self.closed = False
+
+        def __exit__(self, *_: object) -> None:
+            self.closed = True
+
+    class _Bad:
+        def __exit__(self, *_: object) -> None:
+            raise RuntimeError("boom")
+
+    g1, g2, bad = _Good(), _Good(), _Bad()
+    stack: list[tuple[str, object]] = [("g1", g1), ("bad", bad), ("g2", g2)]
+    with pytest.warns(SiftPytestStepDrainWarning, match="boom"):
+        _drain_step_stack(stack)
+    assert stack == []
+    assert g1.closed
+    assert g2.closed
+
+
+def test_drain_step_stack_strict_drains_fully_then_raises() -> None:
+    """Strict mode: drain every frame, then raise with the FIRST failure chained."""
+    from sift_client.pytest_plugin import (
+        SiftPytestStepDrainError,
+        _drain_step_stack,
+    )
+
+    class _Good:
+        def __init__(self) -> None:
+            self.closed = False
+
+        def __exit__(self, *_: object) -> None:
+            self.closed = True
+
+    class _Bad:
+        def __init__(self, label: str) -> None:
+            self.label = label
+
+        def __exit__(self, *_: object) -> None:
+            raise RuntimeError(f"boom-{self.label}")
+
+    g, b1, b2 = _Good(), _Bad("first"), _Bad("second")
+    # Stack drains LIFO: pop order is b2, b1, g. So b2's failure is the first
+    # one collected and surfaces in __cause__.
+    stack: list[tuple[str, object]] = [("g", g), ("b1", b1), ("b2", b2)]
+    with pytest.raises(SiftPytestStepDrainError, match="2 step.*'b2'") as exc_info:
+        _drain_step_stack(stack, swallow_errors=False)
+    # Stack fully drained even though it raised.
+    assert stack == []
+    assert g.closed
+    # Original exception chained for debuggability.
+    assert isinstance(exc_info.value.__cause__, RuntimeError)
+    assert "boom-second" in str(exc_info.value.__cause__)
+
+
+def test_failing_test_in_class_does_not_orphan_class_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """A failing class method must not block the class step from cleaning up.
+
+    Sibling methods in the same class must still parent to the same class
+    step, and a later class in the module must open as a sibling (not nested
+    under an orphan).
+    """
+    pytester.makepyfile(
+        test_fail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2, failed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert len(by_name["TestFoo"]) == 1
+    assert len(by_name["TestBar"]) == 1
+    foo_id = by_name["TestFoo"][0]["id"]
+    bar_id = by_name["TestBar"][0]["id"]
+    mod_id = by_name["test_fail.py"][0]["id"]
+    assert by_name["test_a"][0]["parent_step_id"] == foo_id
+    assert by_name["test_b"][0]["parent_step_id"] == foo_id
+    assert by_name["test_c"][0]["parent_step_id"] == bar_id
+    # Both classes are siblings under the same module — TestBar didn't get
+    # nested under an orphan TestFoo.
+    assert by_name["TestFoo"][0]["parent_step_id"] == mod_id
+    assert by_name["TestBar"][0]["parent_step_id"] == mod_id
+
+
+def test_failing_parametrized_method_in_class_closes_full_chain(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """A failing parametrized class method must not orphan its parametrize parents."""
+    pytester.makepyfile(
+        test_pfail=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    if v == 1:
+                        raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2, failed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    foo_id = by_name["TestFoo"][0]["id"]
+    test_a_id = by_name["test_a"][0]["id"]
+    # Both parametrize leaves parent to the same test_a; test_b parents
+    # directly to TestFoo (no parametrize parent leaked across methods).
+    assert by_name["v=1"][0]["parent_step_id"] == test_a_id
+    assert by_name["v=2"][0]["parent_step_id"] == test_a_id
+    assert by_name["test_b"][0]["parent_step_id"] == foo_id
+
+
+# ---------------------------------------------------------------------------
+# Opt-out flag tests
+# ---------------------------------------------------------------------------
+
+
+def _write_ini(pytester: pytest.Pytester, **overrides: object) -> None:
+    """Write ``pytest.ini`` containing a ``[pytest]`` section with one line per override."""
+    # One ``key = value`` line per keyword override, in call order.
+    settings = [f"{key} = {value}" for key, value in overrides.items()]
+    content = "\n".join(["[pytest]", *settings]) + "\n"
+    pytester.makefile(".ini", pytest=content)
+
+
+def test_sift_class_step_false_skips_class_steps(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    _write_ini(pytester, sift_class_step="false")
+    pytester.makepyfile(
+        test_noclass=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert "TestFoo" not in by_name
+    mod_id = by_name["test_noclass.py"][0]["id"]
+    assert by_name["test_a"][0]["parent_step_id"] == mod_id
+    assert by_name["test_b"][0]["parent_step_id"] == mod_id
+
+
+def test_sift_module_step_false_skips_module_step(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    _write_ini(pytester, sift_module_step="false")
+    pytester.makepyfile(
+        test_nomod=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert "test_nomod.py" not in by_name
+    # TestFoo attaches to the report root (no parent recorded by the fake).
+    assert by_name["TestFoo"][0]["parent_step_id"] is None
+    assert by_name["test_a"][0]["parent_step_id"] == by_name["TestFoo"][0]["id"]
+
+
+def test_sift_parametrize_nesting_false_keeps_flat_leaves(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    _write_ini(pytester, sift_parametrize_nesting="false")
+    pytester.makepyfile(
+        test_flat=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [1, 2])
+            def test_a(v):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # No parametrize parent step.
+    assert "test_a" not in by_name
+    assert "v=1" not in by_name
+    # Leaves use the bracket-mangled pytest names.
+    assert "test_a[1]" in by_name
+    assert "test_a[2]" in by_name
+    mod_id = by_name["test_flat.py"][0]["id"]
+    assert by_name["test_a[1]"][0]["parent_step_id"] == mod_id
+    assert by_name["test_a[2]"][0]["parent_step_id"] == mod_id
+
+
+def test_sift_module_step_false_still_drains_across_modules(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """sift_module_step=false must not merge same-named classes across modules.
+
+    The hierarchy chain always includes the module ancestor for identity
+    (even when it's not rendered as a step), so two modules each declaring
+    ``class TestFoo`` produce two distinct ``TestFoo`` frames in the diff.
+    """
+    _write_ini(pytester, sift_module_step="false")
+    pytester.makepyfile(
+        test_a=dedent(
+            """
+            class TestFoo:
+                def test_x(self):
+                    pass
+            """
+        ),
+        test_b=dedent(
+            """
+            class TestFoo:
+                def test_y(self):
+                    pass
+            """
+        ),
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Two distinct TestFoo class steps — one per module — not a shared frame.
+    assert len(by_name["TestFoo"]) == 2
+    foo_ids = {s["id"] for s in by_name["TestFoo"]}
+    # Each test method parents to a different TestFoo id.
+    test_x_parent = by_name["test_x"][0]["parent_step_id"]
+    test_y_parent = by_name["test_y"][0]["parent_step_id"]
+    assert test_x_parent in foo_ids
+    assert test_y_parent in foo_ids
+    assert test_x_parent != test_y_parent
+
+
+def test_package_step_default_opens_for_init_dirs(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Default: a directory with ``__init__.py`` produces a parent package step."""
+    pytester.mkpydir("pkg_a")
+    (pytester.path / "pkg_a" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert "pkg_a" in by_name
+    pkg_id = by_name["pkg_a"][0]["id"]
+    mod = by_name["test_x.py"][0]
+    assert mod["parent_step_id"] == pkg_id
+
+
+def test_same_named_packages_in_different_dirs_do_not_merge(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Two packages with the same display name but different paths must stay distinct.
+
+    The hierarchy diff compares on ``nodeid`` (identity), not just the
+    display name — so a ``utils`` package under ``proj_a/`` and another
+    under ``proj_b/`` (where ``proj_a/`` and ``proj_b/`` are bare
+    directories that pytest treats as ``pytest.Dir`` nodes and the chain
+    walker skips) produce two distinct ``utils`` parent steps in the report
+    tree, not a silent merge.
+    """
+    (pytester.path / "proj_a" / "utils").mkdir(parents=True)
+    (pytester.path / "proj_a" / "utils" / "__init__.py").touch()
+    (pytester.path / "proj_a" / "utils" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    (pytester.path / "proj_b" / "utils").mkdir(parents=True)
+    (pytester.path / "proj_b" / "utils" / "__init__.py").touch()
+    (pytester.path / "proj_b" / "utils" / "test_y.py").write_text(
+        dedent(
+            """
+            def test_two():
+                pass
+            """
+        )
+    )
+    # ``importlib`` import mode is required so two packages with the same
+    # name on disk don't collide during sys.path-based import.
+    result = pytester.runpytest_subprocess("-v", "--import-mode=importlib")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Two distinct ``utils`` package steps — one per project.
+    assert len(by_name["utils"]) == 2
+    utils_ids = {s["id"] for s in by_name["utils"]}
+    # Each module step parents to a different ``utils`` instance.
+    parent_x = by_name["test_x.py"][0]["parent_step_id"]
+    parent_y = by_name["test_y.py"][0]["parent_step_id"]
+    assert parent_x in utils_ids
+    assert parent_y in utils_ids
+    assert parent_x != parent_y
+
+
+def test_sift_package_step_false_skips_package_steps(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """With ``sift_package_step=false`` the directory step is suppressed."""
+    _write_ini(pytester, sift_package_step="false")
+    pytester.mkpydir("pkg_a")
+    (pytester.path / "pkg_a" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert "pkg_a" not in by_name
+    # The module step still opens and is now the top-level frame.
+    assert by_name["test_x.py"][0]["parent_step_id"] is None
+
+
+def test_all_three_flags_false_matches_legacy_behavior(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    _write_ini(
+        pytester,
+        sift_module_step="false",
+        sift_class_step="false",
+        sift_parametrize_nesting="false",
+    )
+    pytester.makepyfile(
+        test_legacy=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # No module, class, or parametrize parents — just bracket-mangled leaves.
+    assert "test_legacy.py" not in by_name
+    assert "TestFoo" not in by_name
+    assert "test_a" not in by_name
+    assert "test_a[1]" in by_name
+    assert "test_a[2]" in by_name
+    assert by_name["test_a[1]"][0]["parent_step_id"] is None
+    assert by_name["test_a[2]"][0]["parent_step_id"] is None
+
+
+# ---------------------------------------------------------------------------
+# Parametrize nesting
+# ---------------------------------------------------------------------------
+
+
+def test_single_parametrize_clusters_under_originalname(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_rail=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [3.3, 5.0])
+            def test_rail(v):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Module step + one shared `test_rail` parent + two leaves.
+    assert len(by_name["test_rail.py"]) == 1
+    assert len(by_name["test_rail"]) == 1
+    assert len(by_name["v=3.3"]) == 1
+    assert len(by_name["v=5.0"]) == 1
+    test_rail_id = by_name["test_rail"][0]["id"]
+    assert by_name["v=3.3"][0]["parent_step_id"] == test_rail_id
+    assert by_name["v=5.0"][0]["parent_step_id"] == test_rail_id
+
+
+def test_stacked_parametrize_nests_outer_to_inner(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    """Stacked ``parametrize`` marks nest top decorator outermost, bottom decorator as leaves."""
+    pytester.makepyfile(
+        test_iso=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("voltage", ["high", "low"])
+            @pytest.mark.parametrize("component", ["motor", "ducer"])
+            def test_iso(voltage, component):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=4)
+    by_name = _by_name(json.loads(steps_file.read_text()))
+    # One `test_iso` parent, two `voltage='…'` parents, four `component='…'` leaves.
+    assert len(by_name["test_iso"]) == 1
+    assert len(by_name["voltage='high'"]) == 1
+    assert len(by_name["voltage='low'"]) == 1
+    assert len(by_name["component='motor'"]) == 2  # one per voltage
+    assert len(by_name["component='ducer'"]) == 2
+    test_iso_id = by_name["test_iso"][0]["id"]
+    voltage_high = by_name["voltage='high'"][0]
+    voltage_low = by_name["voltage='low'"][0]
+    assert voltage_high["parent_step_id"] == test_iso_id
+    assert voltage_low["parent_step_id"] == test_iso_id
+    # Each component leaf parents to one of the voltage parents.
+    voltage_ids = {
+        voltage_high["id"],
+        voltage_low["id"],
+    }
+    for leaf in by_name["component='motor'"] + by_name["component='ducer'"]:
+        assert leaf["parent_step_id"] in voltage_ids
+
+
+def test_fixture_parametrization_participates(pytester: pytest.Pytester, steps_file: Path) -> None:
+    pytester.makepyfile(
+        test_widget=dedent(
+            """
+            import pytest
+
+            @pytest.fixture(params=["a", "b"])
+            def widget(request):
+                return request.param
+
+            def test_widget(widget):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    assert len(by_name["test_widget"]) == 1
+    parent_id = by_name["test_widget"][0]["id"]
+    assert by_name["widget='a'"][0]["parent_step_id"] == parent_id
+    assert by_name["widget='b'"][0]["parent_step_id"] == parent_id
+
+
+def test_module_boundary_isolates_parametrize_stack(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_a=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [1, 2])
+            def test_one(v):
+                pass
+            """
+        ),
+        test_b=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("w", ["x", "y"])
+            def test_two(w):
+                pass
+            """
+        ),
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=4)
+    steps = json.loads(steps_file.read_text())
+    by_name = _by_name(steps)
+    # Each module step contains its own `test_one`/`test_two` parametrize subtree.
+    mod_a = by_name["test_a.py"][0]
+    mod_b = by_name["test_b.py"][0]
+    assert by_name["test_one"][0]["parent_step_id"] == mod_a["id"]
+    assert by_name["test_two"][0]["parent_step_id"] == mod_b["id"]
+
+
+def test_leaf_parent_chain_terminates_at_report(
+    pytester: pytest.Pytester, steps_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_chain=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("a", [1])
+            @pytest.mark.parametrize("b", ["x"])
+            def test_chain(a, b):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_subprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = json.loads(steps_file.read_text())
+    leaf = next(s for s in steps if s["name"].startswith("b="))
+    chain = _ancestor_names(steps, leaf)
+    # leaf b=… → a=… → test_chain → test_chain.py (module step) → root
+    assert chain == ["b='x'", "a=1", "test_chain", "test_chain.py"]
diff --git a/python/lib/sift_client/_tests/util/test_report_context.py b/python/lib/sift_client/_tests/util/test_report_context.py
index f12247c7a..e92e57bb8 100644
--- a/python/lib/sift_client/_tests/util/test_report_context.py
+++ b/python/lib/sift_client/_tests/util/test_report_context.py
@@ -10,16 +10,15 @@
 
 from __future__ import annotations
 
-import logging
 import sys
-from typing import TYPE_CHECKING
+import warnings
+
+import pytest
 
 from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.errors import SiftWarning
 from sift_client.util.test_results import ReportContext
 
-if TYPE_CHECKING:
-    import pytest
-
 
 def _make_simulate_client() -> SiftClient:
     """Build a SiftClient flagged for in-process simulation.
@@ -38,46 +37,50 @@ def _make_simulate_client() -> SiftClient:
     return client
 
 
-def _make_context(command: list[str]) -> ReportContext:
+def _make_context(command: list[str], *, timeout: float = 0.5) -> ReportContext:
     """Build a ReportContext whose replay subprocess is the provided command.
 
-    `log_file=True` triggers the temp-file path so `_open_import_proc` fires
-    on `__enter__`. The substitute argv is swapped in via the public-ish
-    `_build_replay_command` hook so the production Popen kwargs stay
-    exercised.
+    ``log_file=True`` triggers the temp-file path so ``_open_import_proc`` fires
+    on ``__enter__``. The substitute argv is swapped in via the public-ish
+    ``_build_replay_command`` hook so the production Popen kwargs stay
+    exercised. ``timeout`` overrides the worker grace window so tests don't
+    wait the full production timeout for the timeout branch to trigger.
     """
     rc = ReportContext(_make_simulate_client(), name="test", log_file=True)
     rc._build_replay_command = lambda: command  # type: ignore[method-assign]
+    rc._import_proc_timeout = timeout
     return rc
 
 
-def test_worker_clean_exit_is_silent(caplog: pytest.LogCaptureFixture) -> None:
-    """Worker exits with code 0 → __exit__ is silent (case 1)."""
+def test_worker_clean_exit_is_silent() -> None:
+    """Worker exits with code 0 → __exit__ emits no SiftWarning (case 1)."""
     rc = _make_context([sys.executable, "-c", "pass"])
-    with caplog.at_level(logging.ERROR):
+    with warnings.catch_warnings(record=True) as recorded:
+        warnings.simplefilter("always")
         with rc:
             pass
-    assert "Import process" not in caplog.text
-    assert "replay-test-result-log" not in caplog.text
+    sift_warnings = [w for w in recorded if issubclass(w.category, SiftWarning)]
+    assert sift_warnings == []
     assert rc._import_proc is not None
     assert rc._import_proc.returncode == 0
 
 
-def test_worker_timeout_kills_and_logs(caplog: pytest.LogCaptureFixture) -> None:
-    """Worker still running at session end → kill + log, no raise (case 2)."""
-    rc = _make_context([sys.executable, "-c", "import time; time.sleep(30)"])
-    with caplog.at_level(logging.ERROR):
+def test_worker_timeout_kills_and_warns() -> None:
+    """Worker still running at session end → kill + SiftWarning, no raise (case 2)."""
+    rc = _make_context([sys.executable, "-c", "import time; time.sleep(30)"], timeout=0.2)
+    with pytest.warns(SiftWarning) as recorded:
         with rc:
             pass
     assert rc._import_proc is not None
     # `kill()` + `wait()` were called; process is dead.
     assert rc._import_proc.poll() is not None
-    assert "did not exit in 1s" in caplog.text
-    assert "replay-test-result-log" in caplog.text
+    messages = "\n".join(str(w.message) for w in recorded)
+    assert "did not exit in 0.2s" in messages
+    assert "import-test-result-log" in messages
 
 
-def test_worker_nonzero_exit_logs_stderr_no_raise(caplog: pytest.LogCaptureFixture) -> None:
-    """Worker exits non-zero with stderr → log stderr + replay hint, no raise (case 3)."""
+def test_worker_nonzero_exit_warns_stderr_no_raise() -> None:
+    """Worker exits non-zero with stderr → SiftWarning with stderr + replay hint, no raise (case 3)."""
     rc = _make_context(
         [
             sys.executable,
@@ -85,11 +88,12 @@ def test_worker_nonzero_exit_logs_stderr_no_raise(caplog: pytest.LogCaptureFixtu
             "import sys; sys.stderr.write('rpc deadline exceeded'); sys.exit(2)",
         ]
     )
-    with caplog.at_level(logging.ERROR):
+    with pytest.warns(SiftWarning) as recorded:
         with rc:
             pass
     assert rc._import_proc is not None
     assert rc._import_proc.returncode == 2
-    assert "exited with code 2" in caplog.text
-    assert "rpc deadline exceeded" in caplog.text
-    assert "replay-test-result-log" in caplog.text
+    messages = "\n".join(str(w.message) for w in recorded)
+    assert "exited with code 2" in messages
+    assert "rpc deadline exceeded" in messages
+    assert "import-test-result-log" in messages
diff --git a/python/lib/sift_client/_tests/util/test_test_results_utils.py b/python/lib/sift_client/_tests/util/test_test_results_utils.py
index 256803769..4fd6ab112 100644
--- a/python/lib/sift_client/_tests/util/test_test_results_utils.py
+++ b/python/lib/sift_client/_tests/util/test_test_results_utils.py
@@ -385,6 +385,46 @@ def test_report_outcome(self, report_context, step):
         if not initial_any_failures:
             report_context.any_failures = False
 
+    def test_measurements_passed_property(self, report_context, step):
+        """``step.measurements_passed`` counts only direct ``measure*`` calls
+        on this step, and stays True when only a substep or ``report_outcome``
+        records a failure.
+        """
+        current_step_path = step.current_step.step_path
+        initial_open_step_result = report_context.open_step_results.get(current_step_path, True)
+        initial_any_failures = report_context.any_failures
+
+        # No measurements yet, vacuously True.
+        assert step.measurements_passed is True
+
+        # In-bounds measurement keeps it True.
+        step.measure(name="ok", value=1.0, bounds={"min": 0.0, "max": 2.0})
+        assert step.measurements_passed is True
+
+        # A failing report_outcome doesn't flip measurements_passed because
+        # it isn't a direct measure() call on this step.
+        step.report_outcome("substep-fail", False, "deliberately failing")
+        assert step.measurements_passed is True
+
+        # Out-of-bounds measurement flips ``measurements_passed`` False.
+        step.measure(name="bad", value=99.0, bounds={"min": 0.0, "max": 2.0})
+        assert step.measurements_passed is False
+
+        # measure_avg / measure_all go through ``measure`` internally and
+        # also increment the counter on out-of-bounds values.
+        step.measure_avg(
+            name="bad_avg",
+            values=[50.0, 60.0, 70.0],  # mean 60 is well outside [0, 2]
+            bounds={"min": 0.0, "max": 2.0},
+        )
+        assert step.measurements_passed is False
+
+        # Restore state.
+        if initial_open_step_result:
+            report_context.open_step_results[current_step_path] = True
+        if not initial_any_failures:
+            report_context.any_failures = False
+
     def test_bad_assert(self, report_context, step):
         # Capture current state of report context's failures so we can keep things passed at a high level if the test's induced failures happen as expected.
         current_step_path = step.current_step.step_path
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 494ded3b6..7c4c1c2f5 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -1,22 +1,212 @@
 from __future__ import annotations
 
 import os
+import warnings
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Tuple
 
 import pytest
 
 from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import TestStatus
 from sift_client.util.test_results import ReportContext
 
+
+class SiftPytestPluginWarning(SiftWarning):
+    """Base warning for issues raised by the Sift pytest plugin."""
+
+
+class SiftPytestStepDrainWarning(SiftPytestPluginWarning):
+    """A step's ``__exit__`` raised while the plugin was draining its stack.
+
+    Surfaced at module-teardown or session-end so the drain can continue and
+    pytest test outcomes stay unaffected; the underlying exception is included
+    in the message for debugging.
+    """
+
+
+class SiftPytestStepDrainError(RuntimeError):
+    """Raised when mid-session drain fails — signals a likely upstream invariant break."""
+
+
 if TYPE_CHECKING:
     from sift_client.util.test_results.context_manager import NewStep
 
 REPORT_CONTEXT: Any = None
 
+_STASH_MISSING = object()
+
+_PARAMETRIZE_PATH_KEY = pytest.StashKey[Tuple[str, ...]]()
+# Each frame: (path_key, open step). Frames are shared across sibling test items
+# and drained at session end.
+_PARAMETRIZE_STACK: list[tuple[str, Any]] = []
+
+_HIERARCHY_KEY = pytest.StashKey[Tuple[Tuple[str, str, "str | None", bool], ...]]()
+# Outer-to-inner frames for the item's collection-tree ancestors. Each chain
+# entry is ``(identity, name, doc, rendered)``:
+#   - ``identity``: a globally-unique key (``node.nodeid``) used for diff
+#     comparison. Two ancestors at the same depth with the same display name
+#     but reached via different paths (e.g., ``proj_a/utils`` and
+#     ``proj_b/utils`` in a monorepo) get distinct identities, so they never
+#     silently merge in the diff.
+#   - ``name``: the human-readable step name used when ``rendered`` opens the
+#     Sift step.
+#   - ``doc``: docstring used for the step description if rendered.
+#   - ``rendered``: True iff the corresponding ``sift_*_step`` ini flag is on.
+#     Non-rendered frames participate in the diff but do not call
+#     ``rc.new_step(...)`` — they appear with ``ns=None`` in the stack.
+#
+# Stack entries: ``(identity, name, open_step_or_None)``. Frames are shared
+# across sibling test items and drained at session end. Drained AFTER
+# _PARAMETRIZE_STACK since parametrize parents nest inside hierarchy parents.
+_HIERARCHY_STACK: list[tuple[str, str, Any]] = []
+
+
+def _drain_step_stack(stack: list, *, swallow_errors: bool = True) -> None:
+    """Pop and close every frame.
+
+    With ``swallow_errors=True`` (default, used at teardown / session end),
+    per-frame failures are surfaced as ``SiftPytestStepDrainWarning`` so a
+    single misbehaving ``__exit__`` can't block the rest of the stack from
+    cleaning up or cascade out of pytest's finalizer chain.
+
+    With ``swallow_errors=False`` (mid-session, when a hierarchy-chain change
+    forces parametrize parents to close), the stack is still fully drained but
+    the first per-frame exception is wrapped at the end in a
+    ``SiftPytestStepDrainError`` so a real upstream invariant violation
+    surfaces as a test error instead of a silenceable warning.
+    """
+    errors: list[tuple[str, BaseException]] = []
+    while stack:
+        entry = stack.pop()
+        # Tolerate either ``(name, ns)`` (parametrize stack) or
+        # ``(identity, name, ns)`` (hierarchy stack) entries.
+        name, ns = entry[-2], entry[-1]
+        if ns is None:
+            # Non-rendered diff-only frame (e.g. a Package frame when
+            # ``sift_package_step=false``); nothing to close.
+            continue
+        try:
+            ns.__exit__(None, None, None)
+        except Exception as exc:
+            if swallow_errors:
+                warnings.warn(
+                    f"Sift plugin: closing step {name!r} during drain raised "
+                    f"{type(exc).__name__}: {exc}",
+                    SiftPytestStepDrainWarning,
+                    stacklevel=2,
+                )
+            else:
+                errors.append((name, exc))
+    if errors:
+        first_name, first_exc = errors[0]
+        raise SiftPytestStepDrainError(
+            f"Sift plugin: {len(errors)} step(s) raised while draining mid-session; "
+            f"first failure on {first_name!r}: {type(first_exc).__name__}: {first_exc}"
+        ) from first_exc
+
+
+def _drain_parametrize_stack(*, swallow_errors: bool = True) -> None:
+    _drain_step_stack(_PARAMETRIZE_STACK, swallow_errors=swallow_errors)
+
+
+def _drain_hierarchy_stack(*, swallow_errors: bool = True) -> None:
+    _drain_step_stack(_HIERARCHY_STACK, swallow_errors=swallow_errors)
+
+
+def _close_frame(name: str, ns: Any) -> None:
+    """Close a single frame, warning on per-frame failure.
+
+    Used by the mid-session hierarchy-stack pop and the rollback paths so a
+    misbehaving ``__exit__`` neither shadows the original exception nor leaks
+    sibling frames. ``ns=None`` indicates a non-rendered diff-only frame; skip.
+    """
+    if ns is None:
+        return
+    try:
+        ns.__exit__(None, None, None)
+    except Exception as exc:
+        warnings.warn(
+            f"Sift plugin: closing step {name!r} raised {type(exc).__name__}: {exc}",
+            SiftPytestStepDrainWarning,
+            stacklevel=2,
+        )
+
+
+def _build_parametrize_path(item: pytest.Item) -> tuple[str, ...]:
+    """Outer-to-inner step display names for a parametrized item.
+
+    Pytest stores ``callspec.params`` with the BOTTOM decorator's axis first;
+    the Sift step tree treats the TOP decorator as outermost, so we reverse.
+    """
+    callspec = getattr(item, "callspec", None)
+    if callspec is None or not callspec.params:
+        return ()
+    originalname = getattr(item, "originalname", item.name)
+    frames: list[str] = [originalname]
+    for name, value in reversed(callspec.params.items()):
+        frames.append(f"{name}={value!r}")
+    return tuple(frames)
+
+
+def _build_hierarchy_chain(
+    item: pytest.Item | pytest.Collector,
+    config: pytest.Config,
+) -> tuple[tuple[str, str, str | None, bool], ...]:
+    """Outer-to-inner ``(identity, name, docstring, rendered)`` for collection ancestors.
+
+    Walks ``item.parent`` upward and ALWAYS collects every ``pytest.Package``,
+    ``pytest.Module``, and ``pytest.Class`` ancestor — they all participate in
+    the diff that keeps the report tree coherent across tests, so two
+    same-named ancestors reached via different paths (e.g., ``proj_a/utils``
+    and ``proj_b/utils`` in a monorepo where the ``proj_*`` dirs are
+    ``pytest.Dir`` nodes the walker skips) cannot silently merge.
+
+    The ``identity`` field is ``node.nodeid`` — globally unique per collected
+    node. The diff compares on identity, not the display ``name``.
+
+    The ``rendered`` flag is True iff the layer's ini flag is on
+    (``sift_package_step`` / ``sift_module_step`` / ``sift_class_step``).
+    Non-rendered frames participate in the diff for identity but don't open a
+    Sift step.
+
+    The ``node.obj`` access is a pytest property that imports the underlying
+    Python object and can raise *any* exception (ImportError, custom
+    metaclass errors, descriptor ``__doc__`` properties that throw). Guard
+    broadly so a misbehaving collector doesn't abort the whole collection
+    phase — that frame's docstring just becomes ``None``.
+    """
+    include_package = bool(_option_or_ini(config, _PACKAGE_STEP))
+    include_module = bool(_option_or_ini(config, _MODULE_STEP))
+    include_class = bool(_option_or_ini(config, _CLASS_STEP))
+
+    chain: list[tuple[str, str, str | None, bool]] = []
+    # ``node.parent`` is typed as the internal ``_pytest.nodes.Node`` which
+    # isn't part of pytest's public API; widen to ``Any`` for the walk.
+    node: Any = item
+    while node is not None:
+        if isinstance(node, pytest.Class):
+            rendered = include_class
+        elif isinstance(node, pytest.Module):
+            rendered = include_module
+        elif isinstance(node, pytest.Package):
+            rendered = include_package
+        else:
+            node = node.parent
+            continue
+        try:
+            doc = (
+                (getattr(node, "obj", None) and getattr(node.obj, "__doc__", None)) or ""
+            ).strip() or None
+        except Exception:
+            doc = None
+        chain.append((node.nodeid, node.name, doc, rendered))
+        node = node.parent
+    return tuple(reversed(chain))
+
 
 @dataclass(frozen=True)
 class _Option:
@@ -105,10 +295,45 @@ class _Option:
 _AUTOUSE = _Option(
     ini_name="sift_autouse",
     ini_help="Default for the Sift autouse fixtures (report_context, step, "
-    "module_substep). When true (default), tests are included unless marked "
-    "with @pytest.mark.sift_exclude. When false, tests are skipped unless "
-    "marked with @pytest.mark.sift_include. Bulk-apply markers in a "
-    "directory's conftest via `pytest_collection_modifyitems`.",
+    "_hierarchy_parents, _parametrize_parents). When true (default), tests "
+    "are included unless marked with @pytest.mark.sift_exclude. When false, "
+    "tests are skipped unless marked with @pytest.mark.sift_include. "
+    "Bulk-apply markers in a directory's conftest via "
+    "`pytest_collection_modifyitems`.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_PACKAGE_STEP = _Option(
+    ini_name="sift_package_step",
+    ini_help="When true (default), open a parent step for each Python package "
+    "(directory with an ``__init__.py``) in the test path. Set to false to "
+    "flatten package grouping.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_MODULE_STEP = _Option(
+    ini_name="sift_module_step",
+    ini_help="When true (default), open a per-module parent step. Set to false "
+    "to skip module-level grouping in the report tree.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_CLASS_STEP = _Option(
+    ini_name="sift_class_step",
+    ini_help="When true (default), open per-class parent steps (including nested "
+    "classes). Set to false to keep class methods at module level.",
+    ini_type="bool",
+    ini_default=True,
+)
+
+_PARAMETRIZE_NESTING = _Option(
+    ini_name="sift_parametrize_nesting",
+    ini_help="When true (default), parametrized tests nest under shared parent "
+    "steps (e.g. test_a -> v=1, v=2). Set to false to keep the flat per-test "
+    "leaf naming (test_a[1], test_a[2]).",
     ini_type="bool",
     ini_default=True,
 )
@@ -121,6 +346,10 @@ class _Option:
     _GRPC_URI,
     _REST_URI,
     _AUTOUSE,
+    _PACKAGE_STEP,
+    _MODULE_STEP,
+    _CLASS_STEP,
+    _PARAMETRIZE_NESTING,
 )
 
 
@@ -163,6 +392,44 @@ def pytest_configure(config: pytest.Config) -> None:
     )
 
 
+def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
+    """Stash each item's hierarchy chain + parametrize path and cluster siblings.
+
+    Sorts by ``(file_path, hierarchy_chain, parametrize_path)`` so sibling
+    items under a shared parent (package, module, class, or parametrize axis)
+    stay contiguous — otherwise a free function sorting between two class
+    methods would tear down + re-open the class step, producing duplicate
+    parents in the report tree.
+    """
+    for item in items:
+        item.stash[_HIERARCHY_KEY] = _build_hierarchy_chain(item, config)
+        item.stash[_PARAMETRIZE_PATH_KEY] = _build_parametrize_path(item)
+    # ``.get(...)`` is purely defensive: every item was stashed by the loop
+    # above, so the ``()`` default (which sorts first within a file) should
+    # never be hit; it just keeps a missing stash from raising KeyError.
+    items.sort(
+        key=lambda i: (
+            str(i.path),
+            tuple(identity for identity, _, _, _ in i.stash.get(_HIERARCHY_KEY, ())),
+            i.stash.get(_PARAMETRIZE_PATH_KEY, ()),
+        )
+    )
+
+
+def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
+    """Drain any parent steps still open at session end (innermost first).
+
+    Wrapped so a failure in the inner drain does not prevent the outer one
+    from running. Parent steps are also closed by the per-item diff in
+    ``_hierarchy_parents`` / ``_parametrize_parents`` and by the drain in
+    ``_report_context_impl``; this hook closes whatever is still open.
+    """
+    try:
+        _drain_parametrize_stack()
+    finally:
+        _drain_hierarchy_stack()
+
+
 def _is_offline(pytestconfig: pytest.Config | None) -> bool:
     return bool(_option_or_ini(pytestconfig, _OFFLINE))
 
@@ -186,22 +453,6 @@ def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bo
     return default
 
 
-def _module_has_included_tests(request: pytest.FixtureRequest, default: bool) -> bool:
-    """True when at least one test in `request`'s module is gated on.
-
-    Used by the module-scoped `module_substep` fixture to decide whether to
-    activate without triggering `report_context` creation for modules where
-    every test is excluded.
-    """
-    module_path = request.path
-    for item in request.session.items:
-        if item.path != module_path:
-            continue
-        if _sift_enabled_for(item, default):
-            return True
-    return False
-
-
 def _option_or_ini(pytestconfig: pytest.Config | None, opt: _Option) -> Any:
     """Resolve a Sift plugin setting from CLI > ini > None.
 
@@ -302,7 +553,19 @@ def _report_context_impl(
     ) as context:
         global REPORT_CONTEXT
         REPORT_CONTEXT = context
-        yield context
+        try:
+            yield context
+        finally:
+            # Drain the hierarchy + parametrize stacks INSIDE the
+            # ReportContext's ``with`` block, so the final ``__exit__``
+            # update calls for those parent steps are written to the log
+            # file BEFORE the import worker drains. Without this, the
+            # worker exits with a partial backlog and the parent steps
+            # are stuck IN_PROGRESS in the Sift report.
+            try:
+                _drain_parametrize_stack()
+            finally:
+                _drain_hierarchy_stack()
 
 
 _CREDENTIAL_KEYS: tuple[tuple[str, _Option | None], ...] = (
@@ -411,9 +674,10 @@ def report_context(
 
     The fixture is no longer autouse; it's instantiated on the first call
     to ``request.getfixturevalue("report_context")``, which today happens
-    inside the gated ``step`` and ``module_substep`` fixtures. If every
-    test in the session is excluded via the marker gate, this fixture is
-    never resolved and no ReportContext (or teardown subprocess) is created.
+    inside the gated ``step``, ``_hierarchy_parents``, and
+    ``_parametrize_parents`` fixtures. If every test in the session is
+    excluded via the marker gate, this fixture is never resolved and no
+    ReportContext (or teardown subprocess) is created.
 
     What gets yielded depends on the mode:
 
@@ -460,24 +724,183 @@ def report_context(
 def _step_impl(
     report_context: ReportContext, request: pytest.FixtureRequest
 ) -> Generator[NewStep, None, None]:
-    name = str(request.node.name)
-    existing_docstring = request.node.obj.__doc__ or None
+    node = request.node
+    # Items get a parametrize path stashed in ``pytest_collection_modifyitems``;
+    # modules/other nodes fall back to their node name. The leaf frame
+    # (``path[-1]``) is the test-specific display name — parents are opened
+    # by ``_parametrize_parents``. When parametrize-nesting is disabled, fall
+    # back to the bracket-mangled pytest name (e.g. ``test_a[1]``) so the leaf
+    # remains uniquely identifiable.
+    if _option_or_ini(request.config, _PARAMETRIZE_NESTING):
+        path = node.stash.get(_PARAMETRIZE_PATH_KEY, ())
+        name = path[-1] if path else str(node.name)
+    else:
+        name = str(node.name)
+    # ``node.obj`` may not exist (e.g., ``pytest.DoctestItem``) or may raise
+    # when accessed — fall back to no description in those cases rather than
+    # erroring out a perfectly valid test. ``getattr``'s default only
+    # suppresses ``AttributeError``; the try/except catches everything else
+    # (RuntimeError from a misbehaving ``__doc__`` descriptor, etc.).
+    try:
+        existing_docstring = getattr(getattr(node, "obj", None), "__doc__", None) or None
+    except Exception:
+        existing_docstring = None
     with report_context.new_step(
         name=name, description=existing_docstring, assertion_as_fail_not_error=False
     ) as new_step:
         yield new_step
-        if hasattr(request.node, "rep_call") and request.node.rep_call.excinfo:
+        if hasattr(node, "rep_call") and node.rep_call.excinfo:
             new_step.update_step_from_result(
-                request.node.rep_call.excinfo,
-                request.node.rep_call.excinfo.value,
-                request.node.rep_call.excinfo.tb,
+                node.rep_call.excinfo,
+                node.rep_call.excinfo.value,
+                node.rep_call.excinfo.tb,
             )
 
 
+@pytest.fixture(autouse=True)
+def _hierarchy_parents(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+) -> None:
+    """Open/close hierarchy parent steps (packages, modules, classes) for the current item.
+
+    Same diff-stack pattern as ``_parametrize_parents`` but operates on
+    ``_HIERARCHY_KEY``. The chain is built outer-to-inner from the item's
+    collection-tree ancestors. Every package/module/class ancestor is in the
+    chain; ``sift_package_step`` / ``sift_module_step`` / ``sift_class_step``
+    only decide which frames render as Sift steps. When the chain changes (pop
+    or push), the parametrize stack is drained first since it nests INSIDE.
+
+    Gated off when the item is excluded (avoids eager ``report_context`` setup).
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _sift_enabled_for(request.node, default):
+        return None
+    # Fall back to computing the chain on-demand for items that bypassed
+    # ``pytest_collection_modifyitems`` (e.g., dynamically inserted by another
+    # plugin's later hook). Defaulting to ``()`` would incorrectly drain the
+    # entire open hierarchy stack for those items.
+    desired = request.node.stash.get(_HIERARCHY_KEY, _STASH_MISSING)
+    if desired is _STASH_MISSING:
+        desired = _build_hierarchy_chain(request.node, pytestconfig)
+    common = 0
+    # Compare on identity (nodeid) — same-named ancestors at different paths
+    # MUST stay distinct.
+    while (
+        common < len(_HIERARCHY_STACK)
+        and common < len(desired)
+        and _HIERARCHY_STACK[common][0] == desired[common][0]
+    ):
+        common += 1
+    # Any change to the hierarchy chain orphans parametrize parents from the
+    # previous test — drain them before mutating the hierarchy stack so
+    # ReportContext's top-of-stack invariant holds. Strict mode: a per-frame
+    # ``__exit__`` failure here signals a real upstream drift between the
+    # plugin stacks and ReportContext; raise it as a test error instead of a
+    # silenceable warning.
+    if common < len(_HIERARCHY_STACK) or common < len(desired):
+        _drain_parametrize_stack(swallow_errors=False)
+    # Symmetric per-frame guard for the hierarchy pop so one bad ``__exit__``
+    # doesn't leave _HIERARCHY_STACK partially drained for every subsequent test.
+    while len(_HIERARCHY_STACK) > common:
+        _identity, name, ns = _HIERARCHY_STACK.pop()
+        _close_frame(name, ns)
+    if not desired[common:]:
+        return None
+    # Fetch ``report_context`` lazily — but only when there's at least one
+    # rendered frame to push. Pure diff-only frames (e.g. a Package frame when
+    # ``sift_package_step=false``) just update _HIERARCHY_STACK with ns=None.
+    rc = None
+    # Roll back any partial push so a mid-loop exception doesn't leave half
+    # the chain orphaned on the stack. Per-frame guard inside the rollback so
+    # a failing ``__exit__`` doesn't shadow the original exception or leak
+    # the remaining opened frames.
+    opened: list[tuple[str, str, Any]] = []
+    try:
+        for identity, name, doc, rendered in desired[common:]:
+            if rendered:
+                if rc is None:
+                    rc = request.getfixturevalue("report_context")
+                ns = rc.new_step(name=name, description=doc, assertion_as_fail_not_error=False)
+                ns.__enter__()
+                opened.append((identity, name, ns))
+            else:
+                opened.append((identity, name, None))
+    except BaseException:
+        while opened:
+            _identity, name, ns = opened.pop()
+            _close_frame(name, ns)
+        raise
+    _HIERARCHY_STACK.extend(opened)
+    return None
+
+
+@pytest.fixture(autouse=True)
+def _parametrize_parents(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+    _hierarchy_parents: None,
+) -> None:
+    """Open/close shared parametrize parent steps for the current item.
+
+    Diffs the item's desired parametrize path against the open stack: pops the
+    stale tail, then opens new parents (everything except the innermost frame —
+    the ``step`` fixture creates that as the leaf). Parents persist across
+    sibling items so a tree like ``test_x[a=1]`` / ``test_x[a=2]`` shares one
+    ``test_x`` container.
+
+    Gated off when the current item is excluded so that excluded items don't
+    eagerly request ``report_context`` (which would defeat its lazy creation),
+    or when ``sift_parametrize_nesting=false``. Parents persist until the
+    diff against a subsequent test's chain pops them, or until
+    ``pytest_sessionfinish`` drains anything left at session end.
+    """
+    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    if not _sift_enabled_for(request.node, default):
+        return None
+    if not _option_or_ini(pytestconfig, _PARAMETRIZE_NESTING):
+        return None
+    # Fall back to on-demand computation for dynamically-inserted items;
+    # see _hierarchy_parents for the same rationale.
+    desired = request.node.stash.get(_PARAMETRIZE_PATH_KEY, _STASH_MISSING)
+    if desired is _STASH_MISSING:
+        desired = _build_parametrize_path(request.node)
+    parents = desired[:-1]
+    common = 0
+    while (
+        common < len(_PARAMETRIZE_STACK)
+        and common < len(parents)
+        and _PARAMETRIZE_STACK[common][0] == parents[common]
+    ):
+        common += 1
+    # Per-frame guard so one bad ``__exit__`` doesn't leave _PARAMETRIZE_STACK
+    # partially drained for every subsequent test.
+    while len(_PARAMETRIZE_STACK) > common:
+        name, ns = _PARAMETRIZE_STACK.pop()
+        _close_frame(name, ns)
+    if not parents[common:]:
+        return None
+    rc = request.getfixturevalue("report_context")
+    opened: list[tuple[str, Any]] = []
+    try:
+        for display in parents[common:]:
+            ns = rc.new_step(name=display, assertion_as_fail_not_error=False)
+            ns.__enter__()
+            opened.append((display, ns))
+    except BaseException:
+        while opened:
+            name, ns = opened.pop()
+            _close_frame(name, ns)
+        raise
+    _PARAMETRIZE_STACK.extend(opened)
+    return None
+
+
 @pytest.fixture(autouse=True)
 def step(
     request: pytest.FixtureRequest,
     pytestconfig: pytest.Config,
+    _parametrize_parents: None,
 ) -> Generator[NewStep | None, None, None]:
     """Create an outer step for the function when the Sift gate is on.
 
@@ -498,27 +921,6 @@ def step(
     yield from _step_impl(rc, request)
 
 
-@pytest.fixture(scope="module", autouse=True)
-def module_substep(
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create a per-module step when at least one test in the module is gated on.
-
-    Inspects the module's collected items rather than gating on a single marker,
-    so a module with mixed inclusion/exclusion still produces the module-level
-    step (individual `step` fixtures then decide per-test). When every test in
-    the module is excluded, the substep is skipped without requesting
-    `report_context`.
-    """
-    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
-    if not _module_has_included_tests(request, default):
-        yield None
-        return
-    rc = request.getfixturevalue("report_context")
-    yield from _step_impl(rc, request)
-
-
 @pytest.fixture(scope="session")
 def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRequest) -> bool:
     """Verify the ``SiftClient`` can reach Sift via ``/ping``.
diff --git a/python/lib/sift_client/util/test_results/__init__.py b/python/lib/sift_client/util/test_results/__init__.py
index ddce0326c..a3ac081bc 100644
--- a/python/lib/sift_client/util/test_results/__init__.py
+++ b/python/lib/sift_client/util/test_results/__init__.py
@@ -61,10 +61,13 @@ def main(self):
 
 By default, every test in the session produces a Sift report: one
 `TestReport` per session, one step per test function (`step`), and one
-parent step per test file (`module_substep`). The plugin also registers a
-default `sift_client` fixture that reads `SIFT_API_KEY`, `SIFT_GRPC_URI`,
-and `SIFT_REST_URI` from the environment. Override it by defining your own
-`sift_client` fixture in your conftest.
+parent step per Python package (directory with `__init__.py`), test file,
+and test class
+above it. Individual layers can be flattened via the `sift_package_step`,
+`sift_module_step`, `sift_class_step`, and `sift_parametrize_nesting` ini
+flags. The plugin also registers a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+Override it by defining your own `sift_client` fixture in your conftest.
 
 Note: FedRAMP users: results are buffered to a temp file and uploaded by a
 subprocess at session end (no API calls during the run). Disable the buffer
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 3d375814a..bd2ec917f 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -7,6 +7,7 @@
 import subprocess
 import tempfile
 import traceback
+import warnings
 from contextlib import AbstractContextManager, contextmanager
 from datetime import datetime, timezone
 from pathlib import Path
@@ -14,6 +15,7 @@
 
 import numpy as np
 
+from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import (
     ErrorInfo,
     NumericBounds,
@@ -42,15 +44,19 @@
 
 
 def log_replay_instructions(log_file: str | Path | None) -> None:
-    """Log instructions for manually replaying a test result log file.
+    """Surface replay instructions when an import/replay attempt fails.
 
-    Used when an import/replay attempt fails so the user can retry against the same file.
+    Emitted as a ``SiftWarning`` (not a logger.error) so pytest and other
+    runners surface it in their warning summary; logger.error is suppressed
+    by default in most CLI tools.
     """
     if log_file is None:
         return
-    logger.error(
-        f"Error replaying log file: {log_file}.\n"
-        f"  Can replay with `replay-test-result-log {log_file}`."
+    warnings.warn(
+        f"Sift log file was not fully replayed: {log_file}. "
+        f"Re-run with `import-test-result-log {log_file}` to complete the upload.",
+        SiftWarning,
+        stacklevel=2,
     )
 
 
@@ -110,6 +116,11 @@ class ReportContext(AbstractContextManager):
     open_step_results: dict[str, bool]
     any_failures: bool
     _import_proc: subprocess.Popen | None = None
+    # Seconds to wait for the import worker subprocess to finish uploading
+    # the JSONL backlog at session end before killing it. Tests substitute
+    # a smaller value (via ``_make_context`` patching) so they don't wait
+    # the full window for the timeout branch to trigger.
+    _import_proc_timeout: float = 30.0
 
     def __init__(
         self,
@@ -227,31 +238,40 @@ def __exit__(self, exc_type, exc_value, traceback):
             # them fail the session — tests already ran and their outcome
             # is independent of delivery. The local log file is the source
             # of recovery for both failure modes via
-            # `replay-test-result-log <path>`:
+            # `import-test-result-log <path>`:
             #   1. Exits cleanly (returncode 0). Silent.
-            #   2. Still running after the 1s grace window (TimeoutExpired).
+            #   2. Still running after the grace window (TimeoutExpired).
             #      Healthy worker with a large backlog; kill and surface
-            #      replay instructions.
+            #      replay instructions. The 30-second default is enough for
+            #      a normal test suite to drain; pathological backlogs should
+            #      opt into inline mode (`--sift-log-file=false`) instead.
             #   3. Exited with non-zero. Connection failures and API call
             #      errors land here — the worker's replay loop has no retry,
-            #      so the first failed RPC crashes the subprocess. Log the
-            #      captured stderr at ERROR with replay instructions.
+            #      so the first failed RPC crashes the subprocess. Surface
+            #      the captured stderr with replay instructions.
             try:
-                _, stderr_bytes = self._import_proc.communicate(timeout=1)
+                _, stderr_bytes = self._import_proc.communicate(timeout=self._import_proc_timeout)
             except subprocess.TimeoutExpired:
-                logger.error("Import process did not exit in 1s, killing it")
                 self._import_proc.kill()
                 self._import_proc.wait()
+                warnings.warn(
+                    f"Sift import worker did not exit in "
+                    f"{self._import_proc_timeout}s; killing it. "
+                    "Local log file is preserved for manual replay.",
+                    SiftWarning,
+                    stacklevel=2,
+                )
                 log_replay_instructions(self.log_file)
                 return True  # Ensures the session is marked as passed in pytest
             if self._import_proc.returncode != 0:
                 stderr_text = (
                     stderr_bytes.decode("utf-8", errors="replace").strip() if stderr_bytes else ""
                 )
-                logger.error(
-                    "Import process exited with code %d. stderr: %s",
-                    self._import_proc.returncode,
-                    stderr_text or "<empty>",
+                warnings.warn(
+                    f"Sift import worker exited with code "
+                    f"{self._import_proc.returncode}. stderr: {stderr_text or '<empty>'}",
+                    SiftWarning,
+                    stacklevel=2,
                 )
                 log_replay_instructions(self.log_file)
 
@@ -409,6 +429,11 @@ def __init__(
         self.client = report_context.client
         self.current_step = self.report_context.create_step(name, description, metadata=metadata)
         self.assertion_as_fail_not_error = assertion_as_fail_not_error
+        # Per-step measurement-failure count for ``measurements_passed``.
+        # Tracks only direct ``measure*`` calls on this NewStep instance;
+        # substep / ``report_outcome`` failures are intentionally not folded
+        # in here (see ``measurements_passed`` vs ``passed``).
+        self._failed_measurement_count = 0
 
     def __enter__(self):
         """Enter the context manager to create a new step.
@@ -417,6 +442,20 @@ def __enter__(self):
         """
         return self
 
+    @property
+    def measurements_passed(self) -> bool:
+        """True if every measurement recorded directly on this step has passed.
+
+        Counts only ``step.measure``, ``step.measure_avg``, and
+        ``step.measure_all`` calls on this ``NewStep`` instance. Useful for
+        the ``assert step.measurements_passed`` pattern at the end of a test
+        when you want to fail pytest on any out-of-bounds measurement
+        without short-circuiting on the first failure (asserting on
+        individual ``measure(...)`` return values skips every measurement
+        after the failing one).
+        """
+        return self._failed_measurement_count == 0
+
     def update_step_from_result(
         self,
         exc: type[Exception] | None,
@@ -529,6 +568,8 @@ def measure(
             create, log_file=self.report_context.log_file
         )
         self.report_context.record_step_outcome(measurement.passed, self.current_step)
+        if not measurement.passed:
+            self._failed_measurement_count += 1
 
         return measurement.passed
 
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index 90bfd10ed..5108b7e4a 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -59,7 +59,9 @@ nav:
       - examples/index.md
       - Basic Usage: examples/basic.ipynb
       - Data Ingestion: examples/ingestion.ipynb
+        # Will migrate to Guides in the future
       - Pytest Plugin: examples/pytest_plugin.md
+      - Pytest Plugin Quickstart: examples/pytest_plugin_quickstart.md
 #  - Guides:
 #      - Logging
 #      - Error Handling
diff --git a/python/pyproject.toml b/python/pyproject.toml
index a2cd6a410..0bb07e84a 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -26,7 +26,11 @@ keywords = ["sift", "sift-stack", "siftstack", "sift_py"]
 dependencies = [
     "grpcio~=1.13",
     "PyYAML~=6.0",
-    "rapidyaml~=0.11",
+    # TODO: rapidyaml 0.13.0 ships C++ source that fails to compile against
+    # the GCC version on current GitHub Actions runners (csubstr operator=
+    # and SFINAE errors in the bundled c4core). Cap below 0.13 until either
+    # rapidyaml ships fixed sdists or we move to binary wheels.
+    "rapidyaml>=0.11,<0.13",
     "pandas>=2.0,<3.1",
     "protobuf>=5.0",
     "pydantic~=2.10",

From d3a444bce127d75e813fd883e1b979d16a57519a Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 11:14:50 -0700
Subject: [PATCH 05/19] Python(feat): pytest pass fail behavior improvements
 (#568)

---
 python/docs/examples/pytest_plugin.md         |   4 +
 .../pytest_plugin/pass_fail_behavior.md       | 164 +++++
 .../_tests/pytest_plugin/_fakes.py            | 132 ----
 .../pytest_plugin/_step_status_capture.py     | 139 +++++
 .../pytest_plugin/step_status_states.md       | 105 ++++
 .../_tests/pytest_plugin/test_hierarchy.py    | 179 +++---
 .../_tests/pytest_plugin/test_pass_fail.py    | 562 ++++++++++++++++++
 python/lib/sift_client/pytest_plugin.py       | 169 +++++-
 .../util/test_results/context_manager.py      | 145 +++--
 python/mkdocs.yml                             |   3 +
 10 files changed, 1313 insertions(+), 289 deletions(-)
 create mode 100644 python/docs/guides/pytest_plugin/pass_fail_behavior.md
 delete mode 100644 python/lib/sift_client/_tests/pytest_plugin/_fakes.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py

diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index c464e564e..5a40d450d 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -306,6 +306,10 @@ outcomes into `TestStatus`:
 | Non-`AssertionError` exception escapes the test (e.g. `ValueError`, `TimeoutError`) | `ERROR`, with the formatted traceback (last 10 frames plus the first frame) on `step.error_info.error_message` |
 | Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
 
+For the full contract, including skips, xfail/xpass, hard exits (`SystemExit`,
+`KeyboardInterrupt`), setup/teardown phase failures, and propagation rules,
+see the [Pass/Fail Behavior guide](../guides/pytest_plugin/pass_fail_behavior.md).
+
 A failure or error at any depth propagates upward: the parent substep, the
 function step, the class/module/package steps above it, and the session
 report all get marked failed.
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
new file mode 100644
index 000000000..6e9b1d6e3
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -0,0 +1,164 @@
+# Pass/Fail Behavior
+
+The pytest plugin maps every pytest outcome to a `TestStatus` on the
+corresponding Sift step. Use this page to look up what a given test will
+produce, and how that result rolls up to the parent steps and the report.
+
+## `TestStatus` values
+
+The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
+
+| Status        | Meaning                                                                                                                |
+| ------------- |------------------------------------------------------------------------------------------------------------------------|
+| `PASSED`      | The step completed and every check it owns succeeded.                                                                  |
+| `FAILED`      | An assertion, a `pytest.fail(...)`, a failed `report_outcome`, or a failing measurement marked it.                     |
+| `ERROR`       | An unexpected exception escaped the test body or a fixture (setup or teardown).                                        |
+| `ABORTED`     | A hard exit (`SystemExit`, observed `KeyboardInterrupt`) interrupted the test.                                         |
+| `SKIPPED`     | The test was skipped at collection time, at runtime, or from a fixture.                                                |
+| `IN_PROGRESS` | Test in progress or the plugin never observed a final outcome (e.g. a session-aborting interrupt killed pytest first). |
+
+## Normal test outcomes
+
+| Scenario                                  | Trigger                              | Outcome  |
+| ----------------------------------------- | ------------------------------------ | -------- |
+| Test passes                               | function body returns cleanly        | `PASSED` |
+| Assertion failure                         | `assert 1 == 2`                      | `FAILED` |
+| `pytest.fail("...")` from the body        | `pytest.fail("intentional failure")` | `FAILED` |
+| Uncaught non-assertion exception          | `raise ValueError("boom")`           | `ERROR`  |
+
+A non-assertion exception gets its formatted traceback recorded on
+`step.error_info.error_message`.
+
+## Hard exits
+
+Hard exits that the plugin can observe map to `ABORTED`. If pytest tears
+the session down before the plugin sees the exit, the step stays at
+`IN_PROGRESS` instead of resolving.
+
+| Scenario                                       | Trigger                   | Outcome                                                              |
+| ---------------------------------------------- | ------------------------- | -------------------------------------------------------------------- |
+| `SystemExit` from the test body                | `sys.exit(1)`             | `ABORTED`                                                            |
+| `KeyboardInterrupt` the plugin observes        | `raise KeyboardInterrupt` | `ABORTED`                                                            |
+| Session-aborting `KeyboardInterrupt`           | Ctrl-C terminates pytest  | `IN_PROGRESS` (session ends before the plugin's hooks fire)          |
+
+### Abort propagation through nested substeps
+
+Every step that was open when the abort fired records
+`ABORTED`.
+
+```python title="test_abort.py"
+import sys
+
+
+def test_x(step):
+    with step.substep(name="completed_sub"):
+        pass  # closes as PASSED before the abort
+    with step.substep(name="outer_sub") as outer_sub:
+        with outer_sub.substep(name="inner_sub"):
+            sys.exit(1)  # ABORTED applied to inner_sub, outer_sub, and the test step
+```
+
+The Sift report shows `completed_sub` as `PASSED` and the three steps
+still open at the abort (`inner_sub`, `outer_sub`, and the test step
+itself) as `ABORTED`.
+
+## Skips
+
+| Scenario                              | Trigger                                       | Outcome   |
+| ------------------------------------- | --------------------------------------------- | --------- |
+| Collection-time skip                  | `@pytest.mark.skip(reason=...)`               | `SKIPPED` |
+| Conditional collection-time skip      | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` |
+| Runtime skip from the test body       | `pytest.skip("...")`                          | `SKIPPED` |
+| Skip raised inside a fixture          | `@pytest.fixture` calls `pytest.skip("...")`  | `SKIPPED` |
+
+`SKIPPED` does not propagate as a failure. A skipped substep or test does
+not block its parent from resolving to `PASSED`.
+
+## Expected failures (xfail / xpass)
+
+xfail marks declare that a test is expected to fail. The plugin follows
+the same semantics pytest does.
+
+| Scenario                                  | Trigger                                                    | Outcome                                                       |
+| ----------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------- |
+| xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                     | `PASSED` (the test fulfilled the xfail expectation)           |
+| Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`          | `FAILED` (the mark no longer matches reality)                 |
+| Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                     | `PASSED` (`strict=False` does not insist on the failure)      |
+| `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)      |
+| `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)`                            | `SKIPPED` (the body never ran)                                |
+
+## Influencing outcomes from test code
+
+A test can also set the step's outcome directly via the helpers below.
+Substeps your test opens follow the same propagation rules as the ones
+the plugin opens for you.
+
+### Manual status override
+
+`step.current_step.update({...})` sets the status directly. The step's
+exit handler does not overwrite it.
+
+```python
+from sift_client.sift_types.test_report import TestStatus
+
+
+def test_manual(step):
+    step.current_step.update({"status": TestStatus.FAILED})
+```
+
+### `report_outcome` for externally computed checks
+
+`report_outcome(name, result, reason)` records a named check whose
+pass/fail was computed elsewhere (a subprocess, a remote system, your own
+comparison logic). A failing outcome marks the step `FAILED`.
+
+```python
+def test_external_check(step):
+    result, reason = run_external_validator()
+    step.report_outcome("ext-validator", result, reason)
+```
+
+### Measurements with bounds
+
+`step.measure(name=..., value=..., bounds=...)` records a measurement and
+resolves the step to `FAILED` if the value is out of bounds. The call
+returns the pass/fail boolean and does not raise, so multiple measurements
+can run without short-circuiting.
+
+```python
+def test_battery(step):
+    step.measure(name="voltage", value=12.1, bounds={"min": 11.5, "max": 13.0}, unit="V")
+    step.measure(name="current", value=0.42, bounds={"max": 1.0}, unit="A")
+```
+
+### Substep failures
+
+A failed substep propagates failure to its parent step. A manually-set
+`SKIPPED` on a substep does not.
+
+```python
+def test_with_substep(step):
+    with step.substep(name="check") as inner:
+        inner.measure(name="value", value=99.0, bounds={"min": 0.0, "max": 5.0})
+    # The outer step resolves to FAILED because the substep failed.
+```
+
+## Propagation rules
+
+Every step that resolves to anything other than `PASSED` or `SKIPPED`
+marks its parent as failed. What the parent records depends on whether
+its own scope had an abort and whether a child already failed:
+
+- A hard exit (`SystemExit` or an observed `KeyboardInterrupt`) in the
+  step's own scope records `ABORTED`. `ABORTED` propagates through every
+  step the abort passes through on its way up.
+- A child that already recorded an outcome other than `PASSED` or
+  `SKIPPED` marks the parent as `FAILED`. This holds whether or not an
+  exception is still propagating through the parent's scope: only the
+  originating substep records `ERROR`; ancestors inherit `FAILED`. The
+  traceback stays on the originating step's `error_info`.
+- A step records `ERROR` only when its own scope raised a
+  non-`AssertionError` exception AND no child has failed.
+
+`SKIPPED` does not propagate. A status set explicitly via
+`current_step.update` is kept.
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_fakes.py b/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
deleted file mode 100644
index 460100daa..000000000
--- a/python/lib/sift_client/_tests/pytest_plugin/_fakes.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""Test doubles for the pytester-driven pytest-plugin tests.
-
-The fake ``ReportContext`` is a drop-in for the real one that records every
-step creation to a JSON file at session exit. Used by ``test_parametrize.py``
-to assert the step tree produced by an inner pytester pytest run.
-"""
-
-from __future__ import annotations
-
-import itertools
-import json
-from typing import TYPE_CHECKING, Any
-from unittest.mock import MagicMock
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-
-class FakeStep:
-    def __init__(self, id_: str, name: str, parent_step_id: str | None, step_path: str) -> None:
-        self.id_ = id_
-        self.name = name
-        self.parent_step_id = parent_step_id
-        self.step_path = step_path
-        self.status: Any = None
-        self.description: Any = None
-        self.error_info: Any = None
-
-    def update(self, fields: dict[str, Any]) -> None:
-        for k, v in fields.items():
-            setattr(self, k, v)
-
-
-class FakeReport:
-    def __init__(self) -> None:
-        self.id_ = "report-id"
-
-    def update(self, fields: dict[str, Any]) -> None:
-        pass
-
-
-class FakeReportContext:
-    def __init__(self, steps_file: Path) -> None:
-        self.steps_file = steps_file
-        self.report = FakeReport()
-        self.client = MagicMock()
-        self.step_stack: list[FakeStep] = []
-        self.step_number_at_depth: dict[int, int] = {}
-        self.open_step_results: dict[str, bool] = {}
-        self.any_failures = False
-        self.log_file: Path | None = None
-        self.steps: list[dict[str, Any]] = []
-        self._ids = itertools.count(1)
-
-    def __enter__(self) -> FakeReportContext:
-        return self
-
-    def __exit__(self, *_: Any) -> None:
-        self.steps_file.write_text(json.dumps(self.steps))
-
-    def new_step(
-        self,
-        name: str,
-        description: str | None = None,
-        assertion_as_fail_not_error: bool = True,
-        metadata: dict[str, Any] | None = None,
-    ) -> Any:
-        # Reuse the real NewStep machinery — it talks to this fake via the
-        # methods below.
-        from sift_client.util.test_results.context_manager import NewStep
-
-        return NewStep(
-            self,  # type: ignore[arg-type]
-            name=name,
-            description=description,
-            assertion_as_fail_not_error=assertion_as_fail_not_error,
-            metadata=metadata,
-        )
-
-    def get_next_step_path(self) -> str:
-        top = self.step_stack[-1] if self.step_stack else None
-        path = top.step_path if top else ""
-        next_n = self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        prefix = f"{path}." if path else ""
-        return f"{prefix}{next_n}"
-
-    def create_step(
-        self,
-        name: str,
-        description: str | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> FakeStep:
-        step_path = self.get_next_step_path()
-        parent = self.step_stack[-1] if self.step_stack else None
-        step = FakeStep(
-            id_=f"step-{next(self._ids)}",
-            name=name,
-            parent_step_id=parent.id_ if parent else None,
-            step_path=step_path,
-        )
-        self.step_number_at_depth[len(self.step_stack)] = (
-            self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        )
-        self.step_stack.append(step)
-        self.open_step_results[step.step_path] = True
-        self.steps.append(
-            {
-                "id": step.id_,
-                "name": name,
-                "parent_step_id": step.parent_step_id,
-                "step_path": step_path,
-            }
-        )
-        return step
-
-    def record_step_outcome(self, outcome: bool, step: FakeStep) -> None:
-        if not outcome:
-            self.open_step_results[step.step_path] = False
-            self.any_failures = True
-
-    def resolve_and_propagate_step_result(self, step: FakeStep, error_info: Any = None) -> bool:
-        result = self.open_step_results.get(step.step_path, True)
-        if error_info:
-            result = False
-        return result
-
-    def exit_step(self, step: FakeStep) -> None:
-        self.step_number_at_depth[len(self.step_stack)] = 0
-        stack_top = self.step_stack.pop()
-        self.open_step_results.pop(step.step_path)
-        if stack_top.id_ != step.id_:
-            raise ValueError("popped step was not the top of the stack")
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
new file mode 100644
index 000000000..e92d1726e
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -0,0 +1,139 @@
+"""Read step status sequences from a Sift offline-mode log file.
+
+The contract suite drives each scenario through an inner pytester session
+run with ``--sift-offline``, which causes the real plugin + ``ReportContext``
+to write every test-result API call to a JSONL log. This module parses
+that log into a per-step status timeline that ``test_pass_fail.py`` asserts
+against, with no test-only ``ReportContext`` fake required.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from sift_client._internal.low_level_wrappers._test_results_log import iter_log_data_lines
+from sift_client.sift_types.test_report import TestStatus
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@dataclass
+class CapturedStep:
+    step_id: str
+    name: str
+    step_path: str
+    parent_step_id: str | None
+    statuses: list[TestStatus] = field(default_factory=list)
+
+
+_PROTO_STATUS_NAMES = {
+    "TEST_STATUS_UNSPECIFIED": TestStatus.UNSPECIFIED,
+    "TEST_STATUS_DRAFT": TestStatus.DRAFT,
+    "TEST_STATUS_PASSED": TestStatus.PASSED,
+    "TEST_STATUS_FAILED": TestStatus.FAILED,
+    "TEST_STATUS_ABORTED": TestStatus.ABORTED,
+    "TEST_STATUS_ERROR": TestStatus.ERROR,
+    "TEST_STATUS_IN_PROGRESS": TestStatus.IN_PROGRESS,
+    "TEST_STATUS_SKIPPED": TestStatus.SKIPPED,
+}
+
+
+def _status(name: str | None) -> TestStatus:
+    if name is None:
+        return TestStatus.UNSPECIFIED
+    return _PROTO_STATUS_NAMES.get(name, TestStatus.UNSPECIFIED)
+
+
+def parse_log(log_path: Path) -> dict[str, CapturedStep]:
+    """Parse the offline log into ``{step_id: CapturedStep}``.
+
+    Walks the JSONL file in order, building a ``CapturedStep`` for each
+    ``CreateTestStep`` entry and appending the new status from each
+    ``UpdateTestStep`` entry.
+    """
+    steps: dict[str, CapturedStep] = {}
+    for request_type, response_id, json_str in iter_log_data_lines(log_path):
+        payload = json.loads(json_str)
+        test_step = payload.get("testStep", {})
+        if request_type == "CreateTestStep" and response_id:
+            steps[response_id] = CapturedStep(
+                step_id=response_id,
+                name=test_step.get("name", ""),
+                step_path=test_step.get("stepPath", ""),
+                parent_step_id=test_step.get("parentStepId") or None,
+                statuses=[_status(test_step.get("status"))],
+            )
+        elif request_type == "UpdateTestStep":
+            step_id = test_step.get("testStepId")
+            new_status = test_step.get("status")
+            if step_id and step_id in steps and new_status is not None:
+                steps[step_id].statuses.append(_status(new_status))
+    return steps
+
+
+_active_log: Path | None = None
+_cached: dict[str, CapturedStep] | None = None
+
+
+def set_log(path: Path) -> None:
+    """Point subsequent queries at a new log file. Clears the parse cache."""
+    global _active_log, _cached
+    _active_log = path
+    _cached = None
+
+
+def _steps() -> dict[str, CapturedStep]:
+    global _cached
+    if _cached is None:
+        if _active_log is None or not _active_log.exists():
+            _cached = {}
+        else:
+            _cached = parse_log(_active_log)
+    return _cached
+
+
+def steps_by_name(name: str) -> list[CapturedStep]:
+    return [s for s in _steps().values() if s.name == name]
+
+
+def test_step(name: str) -> CapturedStep | None:
+    """The step the autouse ``step`` fixture creates for the test function.
+
+    Multiple steps can share a name (e.g. when the makereport hook records an
+    inline step for a collection-time skip on top of the autouse step). The
+    autouse step is the shallowest by path depth.
+    """
+    matches = steps_by_name(name)
+    if not matches:
+        return None
+    return min(matches, key=lambda s: s.step_path.count("."))
+
+
+def final_status(name: str) -> TestStatus | None:
+    step = test_step(name)
+    return step.statuses[-1] if step and step.statuses else None
+
+
+def load_steps(log_path: Path) -> list[dict]:
+    """Load the offline log as a list of step records keyed by hierarchy fields.
+
+    Each record has ``id``, ``name``, ``parent_step_id``, ``step_path``, the
+    shape ``test_hierarchy.py`` expects for its ``_by_name`` and
+    ``_ancestor_names`` walkers. Returns an empty list if the log was never
+    created (e.g. every item in the inner session was ``sift_exclude``-d, so
+    the plugin's ``report_context`` fixture never fired).
+    """
+    if not log_path.exists():
+        return []
+    return [
+        {
+            "id": s.step_id,
+            "name": s.name,
+            "parent_step_id": s.parent_step_id,
+            "step_path": s.step_path,
+        }
+        for s in parse_log(log_path).values()
+    ]
diff --git a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
new file mode 100644
index 000000000..7e366a512
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
@@ -0,0 +1,105 @@
+# Pytest-plugin step-status: test scenarios
+
+Reference for the pass/fail scenarios covered by
+[`test_pass_fail.py`](test_pass_fail.py). Each row pairs a scenario with the
+`TestStatus` the plugin records, and maps to the user-facing contract in
+[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
+
+`TestStatus` values come from `sift_client.sift_types.test_report.TestStatus`:
+`PASSED`, `FAILED`, `ERROR`, `SKIPPED`, `ABORTED`, `IN_PROGRESS`. Hard process
+exits the plugin can observe (`SystemExit`, `KeyboardInterrupt` when pytest
+delivers a call-phase report) map to `ABORTED`. A session-aborting interrupt
+that fires before the plugin sees it leaves the step in `IN_PROGRESS`.
+
+## Case ID scheme
+
+Each scenario has a stable case ID of the form `PREFIX-NN`. Tests in
+`test_pass_fail.py` reference their case ID in a leading comment so a test can
+be traced back to its row here without rereading the scenario:
+
+| Prefix  | Section                                  |
+| ------- | ---------------------------------------- |
+| `CALL`  | Call-phase exit paths                    |
+| `SKIP`  | Skip paths                               |
+| `XFAIL` | xfail / xpass                            |
+| `PHASE` | Setup / teardown phases                  |
+| `COLL`  | Collection / fixture-resolution failures |
+| `API`   | Plugin-API exit paths                    |
+
+
+## Call-phase exit paths
+
+| Case      | Scenario                        | Trigger                              | Outcome                                                                                                  |
+| --------- | ------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------- |
+| `CALL-01` | Test passes                     | function body returns cleanly        | `PASSED`                                                                                                 |
+| `CALL-02` | Assert failure in call phase    | `assert 1 == 2`                      | `FAILED`                                                                                                 |
+| `CALL-03` | Generic exception in call phase | `raise ValueError("boom")`           | `ERROR`                                                                                                  |
+| `CALL-04` | `pytest.fail("...")` from body  | `pytest.fail("intentional failure")` | `FAILED`                                                                                                 |
+| `CALL-05` | `SystemExit` from the test body | `sys.exit(1)`                        | `ABORTED`                                                                                                |
+| `CALL-06` | `KeyboardInterrupt` in body     | `raise KeyboardInterrupt`            | `IN_PROGRESS` — session aborts before the plugin sees the interrupt; `ABORTED` if the plugin does see it |
+| `CALL-07` | Substep raises non-Assertion exception | `with step.substep(...): raise ValueError("boom")` | Substep `ERROR`, test step `FAILED` (child-failed signal outranks the propagating exception) |
+
+## Skip paths
+
+| Case      | Scenario                         | Trigger                                      | Outcome                                                                  |
+| --------- | -------------------------------- | -------------------------------------------- | ------------------------------------------------------------------------ |
+| `SKIP-01` | Collection-time skip             | `@pytest.mark.skip(reason=...)`              | `SKIPPED` — only the makereport hook records a step; no autouse step ran |
+| `SKIP-02` | Conditional collection-time skip | `@pytest.mark.skipif(True, reason=...)`      | `SKIPPED` — same route as `@pytest.mark.skip`                            |
+| `SKIP-03` | Runtime skip in body             | `pytest.skip("...")`                         | Outer step `SKIPPED`; no duplicate nested step                           |
+| `SKIP-04` | Skip raised inside a fixture     | `@pytest.fixture` calls `pytest.skip("...")` | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step        |
+
+## xfail / xpass
+
+| Case       | Scenario                                  | Trigger                                                    | Outcome                                                  |
+| ---------- | ----------------------------------------- | ---------------------------------------------------------- | -------------------------------------------------------- |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                     | `PASSED` — test fulfilled the xfail expectation          |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`          | `FAILED` — mark no longer matches reality                |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                     | `PASSED` — `strict=False` doesn't insist on the failure  |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` — `raises=` mismatch is a real test failure     |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)      | `SKIPPED` — the test never ran                           |
+
+## Setup / teardown phases
+
+| Case       | Scenario                                     | Trigger                                                            | Outcome                                                                                                                          |
+| ---------- | -------------------------------------------- | ------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError)   | `@pytest.fixture` raises before `yield`; test body never runs      | `ERROR` — plugin reads the setup-phase report and maps `failed` → `ERROR` (a `phase=setup` annotation is a planned follow-up)    |
+| `PHASE-02` | Teardown-phase fixture failure               | `@pytest.fixture` raises after `yield`; test body passed           | `FAILED` — plugin upgrades a passed step when the teardown report shows `failed` (a `phase=teardown` annotation is a planned follow-up) |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | `FAILED` — call-phase failure dominates; surfacing the teardown error alongside is a planned follow-up                           |
+
+## Collection / fixture-resolution failures
+
+| Case      | Scenario        | Trigger                            | Outcome                                                                                                            |
+| --------- | --------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `COLL-01` | Missing fixture | `def test_x(nonexistent_fixture):` | `ERROR` — missing fixture surfaces as a setup-phase failure (a `phase=setup` annotation is a planned follow-up)    |
+
+## Plugin-API exit paths (in-test mutations)
+
+| Case     | Scenario                          | Trigger                                                                   | Outcome                                                                                                                     |
+| -------- | --------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
+| `API-01` | Manual status override            | `step.current_step.update({"status": TestStatus.FAILED})`                 | `FAILED`                                                                                                                    |
+| `API-02` | `report_outcome(result=False)`    | `step.report_outcome("the_check", False, "did not match")`                | `FAILED`                                                                                                                    |
+| `API-03` | `measure(...)` out-of-bounds      | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`     | `FAILED`                                                                                                                    |
+| `API-04` | Failed measurement on a substep   | `with step.substep(...) as s: s.measure(... out-of-bounds)`               | `FAILED` — propagates from substep to parent                                                                                |
+| `API-05` | Manually-skipped substep          | `with step.substep(...) as s: s.current_step.update({"status": SKIPPED})` | Parent step `PASSED` — skip does not propagate as a failure                                                                 |
+| `API-06` | Hard exit inside a nested substep | `with step.substep(...) as s: with s.substep(...): sys.exit(1)`           | Every open step on the unwind path records `ABORTED`; a sibling substep that closed before the abort keeps its prior status |
+
+## Out of scope
+
+Scenarios deliberately not covered by this suite:
+
+- **Timeout** — needs `pytest-timeout` or a manual signal harness.
+- **Signal (SIGKILL / SIGTERM)** — cannot be caught from inside the process;
+  needs a subprocess-level harness.
+- **`pytest.exit("...")`** — niche; the "aborts subsequent tests" behavior
+  is hard to characterize cleanly because each `pytester` invocation is
+  its own session.
+- **`os._exit()`** — bypasses Python cleanup entirely; can't be tested
+  in-process because it would kill the outer pytest run. Guaranteed
+  data-loss case alongside `SIGKILL` (unlike `SystemExit`, which the plugin records as `ABORTED`).
+- **Parametrize-level marks** (`pytest.param(..., marks=pytest.mark.xfail / skip)`)
+  — routes through a different selection path but produces the same
+  `report.outcome`, so behavior matches the function-level marks already
+  covered above.
+- **Import error / syntax error / `conftest.py` error** — these fail
+  collection entirely; no `item` is produced and no plugin hook fires, so
+  no Sift step is recorded.
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
index cecad2df8..1efd4e817 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -4,65 +4,43 @@
 classes (including nested), parametrize axes — plus the ini opt-out flags,
 failure-cleanup semantics, and the drain helper.
 
-Each test spins up an inner pytest run via ``pytester`` whose conftest swaps
-in a ``FakeReportContext`` (from ``_fakes.py``) that records every step
-creation to a JSON file. The outer test reads that file and asserts the
-resulting step tree.
+Each test spins up an inner pytest run via ``pytester`` configured with
+``--sift-offline`` and a known log path. The plugin writes every test-result
+API call to that JSONL log, and the outer test parses it via
+``_step_status_capture.load_steps`` to reconstruct the step tree.
 """
 
 from __future__ import annotations
 
-import json
-from pathlib import Path as _Path
 from textwrap import dedent
 from typing import TYPE_CHECKING
 
 import pytest
 
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
+
 if TYPE_CHECKING:
     from pathlib import Path
 
-_STEPS_FILE_ENV = "SIFT_FAKE_STEPS_FILE"
-
-# ``_fakes.py`` is excluded from the wheel by ``pyproject.toml``'s
-# ``packages.find`` rule that strips ``sift_client._tests``. The inner
-# pytester subprocess uses the installed package and cannot import from
-# ``sift_client._tests``. Embed the fake source directly into the inner
-# conftest so the subprocess gets a fully self-contained module to load.
-_FAKES_SOURCE = (_Path(__file__).parent / "_fakes.py").read_text()
-
-_INNER_CONFTEST = f"""
-{_FAKES_SOURCE}
-
-import os
-from pathlib import Path
-from unittest.mock import MagicMock
-
-import pytest
-
-pytest_plugins = ["sift_client.pytest_plugin"]
 
+_INNER_CONFTEST = 'pytest_plugins = ["sift_client.pytest_plugin"]\n'
 
-@pytest.fixture(scope="session")
-def sift_client():
-    return MagicMock()
 
-
-@pytest.fixture(scope="session", autouse=True)
-def report_context(sift_client):
-    import sift_client.pytest_plugin as plugin_module
-    steps_file = Path(os.environ[{_STEPS_FILE_ENV!r}])
-    with FakeReportContext(steps_file) as ctx:
-        plugin_module.REPORT_CONTEXT = ctx
-        yield ctx
-"""
+def _base_ini_lines(log_path: Path) -> list[str]:
+    """Default ini settings every inner pytester run needs."""
+    return [
+        "[pytest]",
+        "sift_offline = true",
+        f"sift_log_file = {log_path}",
+        "sift_git_metadata = false",
+    ]
 
 
 @pytest.fixture
-def steps_file(pytester: pytest.Pytester, monkeypatch: pytest.MonkeyPatch) -> Path:
-    path = pytester.path / "captured_steps.json"
+def log_file(pytester: pytest.Pytester) -> Path:
+    path = pytester.path / "sift.log"
     pytester.makeconftest(_INNER_CONFTEST)
-    monkeypatch.setenv(_STEPS_FILE_ENV, str(path))
+    pytester.makefile(".ini", pytest="\n".join(_base_ini_lines(path)) + "\n")
     return path
 
 
@@ -85,9 +63,7 @@ def _ancestor_names(steps: list[dict], leaf: dict) -> list[str]:
     return chain
 
 
-def test_class_methods_cluster_under_class_step(
-    pytester: pytest.Pytester, steps_file: Path
-) -> None:
+def test_class_methods_cluster_under_class_step(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_klass=dedent(
             """
@@ -102,7 +78,7 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["TestFoo"]) == 1
     class_id = by_name["TestFoo"][0]["id"]
@@ -110,7 +86,7 @@ def test_b(self):
     assert by_name["test_b"][0]["parent_step_id"] == class_id
 
 
-def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_nested=dedent(
             """
@@ -123,7 +99,7 @@ def test_a(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["TestOuter"]) == 1
     assert len(by_name["TestInner"]) == 1
@@ -136,7 +112,7 @@ def test_a(self):
     ]
 
 
-def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_cp=dedent(
             """
@@ -151,7 +127,7 @@ def test_a(self, v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     class_id = by_name["TestFoo"][0]["id"]
     test_a_id = by_name["test_a"][0]["id"]
@@ -160,7 +136,7 @@ def test_a(self, v):
     assert by_name["v=2"][0]["parent_step_id"] == test_a_id
 
 
-def test_two_sibling_classes_in_module(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_two_sibling_classes_in_module(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_sib=dedent(
             """
@@ -176,7 +152,7 @@ def test_y(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     mod_id = by_name["test_sib.py"][0]["id"]
     assert by_name["TestA"][0]["parent_step_id"] == mod_id
@@ -186,7 +162,7 @@ def test_y(self):
     assert len(by_name["TestB"]) == 1
 
 
-def test_mixed_class_and_free_function(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_mixed_class_and_free_function(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_mix=dedent(
             """
@@ -201,7 +177,7 @@ def test_free():
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     mod_id = by_name["test_mix.py"][0]["id"]
     # Class method parents to TestA; free function parents directly to module.
@@ -211,7 +187,7 @@ def test_free():
 
 
 def test_class_with_all_excluded_methods_no_class_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_excl=dedent(
@@ -231,14 +207,14 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "TestFoo" not in by_name
     assert "test_a" not in by_name
     assert "test_b" not in by_name
 
 
-def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_clsexcl=dedent(
             """
@@ -256,14 +232,14 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "TestFoo" not in by_name
     assert "test_a" not in by_name
 
 
 def test_class_docstring_becomes_step_description(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_doc=dedent(
@@ -278,7 +254,7 @@ def test_a(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # The fake records step creation but not all fields — check the class
     # step was recorded, then read the description via the FakeStep's
@@ -289,7 +265,7 @@ def test_a(self):
 
 
 def test_transition_between_class_chains_drains_parametrize(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_trans=dedent(
@@ -310,7 +286,7 @@ def test_y(self, w):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Each class opens exactly once; parametrize parents under the right class.
     assert len(by_name["TestA"]) == 1
@@ -396,7 +372,7 @@ def __exit__(self, *_: object) -> None:
 
 
 def test_failing_test_in_class_does_not_orphan_class_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """A failing class method must not block the class step from cleaning up.
 
@@ -422,7 +398,7 @@ def test_c(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2, failed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["TestFoo"]) == 1
     assert len(by_name["TestBar"]) == 1
@@ -439,7 +415,7 @@ def test_c(self):
 
 
 def test_failing_parametrized_method_in_class_closes_full_chain(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """A failing parametrized class method must not orphan its parametrize parents."""
     pytester.makepyfile(
@@ -460,7 +436,7 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2, failed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     foo_id = by_name["TestFoo"][0]["id"]
     test_a_id = by_name["test_a"][0]["id"]
@@ -476,18 +452,18 @@ def test_b(self):
 # ---------------------------------------------------------------------------
 
 
-def _write_ini(pytester: pytest.Pytester, **overrides: object) -> None:
-    """Write a pytest.ini with the given sift_* overrides set under [pytest]."""
-    lines = ["[pytest]"]
+def _write_ini(pytester: pytest.Pytester, log_file: Path, **overrides: object) -> None:
+    """Write a pytest.ini with the given sift_* overrides, preserving the
+    offline/log/git-metadata defaults the ``log_file`` fixture installs.
+    """
+    lines = _base_ini_lines(log_file)
     for key, value in overrides.items():
         lines.append(f"{key} = {value}")
     pytester.makefile(".ini", pytest="\n".join(lines) + "\n")
 
 
-def test_sift_class_step_false_skips_class_steps(
-    pytester: pytest.Pytester, steps_file: Path
-) -> None:
-    _write_ini(pytester, sift_class_step="false")
+def test_sift_class_step_false_skips_class_steps(pytester: pytest.Pytester, log_file: Path) -> None:
+    _write_ini(pytester, log_file, sift_class_step="false")
     pytester.makepyfile(
         test_noclass=dedent(
             """
@@ -502,7 +478,7 @@ def test_b(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "TestFoo" not in by_name
     mod_id = by_name["test_noclass.py"][0]["id"]
@@ -511,9 +487,9 @@ def test_b(self):
 
 
 def test_sift_module_step_false_skips_module_step(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
-    _write_ini(pytester, sift_module_step="false")
+    _write_ini(pytester, log_file, sift_module_step="false")
     pytester.makepyfile(
         test_nomod=dedent(
             """
@@ -525,7 +501,7 @@ def test_a(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "test_nomod.py" not in by_name
     # TestFoo attaches to the report root (no parent recorded by the fake).
@@ -534,9 +510,9 @@ def test_a(self):
 
 
 def test_sift_parametrize_nesting_false_keeps_flat_leaves(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
-    _write_ini(pytester, sift_parametrize_nesting="false")
+    _write_ini(pytester, log_file, sift_parametrize_nesting="false")
     pytester.makepyfile(
         test_flat=dedent(
             """
@@ -550,7 +526,7 @@ def test_a(v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # No parametrize parent step.
     assert "test_a" not in by_name
@@ -564,7 +540,7 @@ def test_a(v):
 
 
 def test_sift_module_step_false_still_drains_across_modules(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """sift_module_step=false must not merge same-named classes across modules.
 
@@ -572,7 +548,7 @@ def test_sift_module_step_false_still_drains_across_modules(
     (even when it's not rendered as a step), so two modules each declaring
     ``class TestFoo`` produce two distinct ``TestFoo`` frames in the diff.
     """
-    _write_ini(pytester, sift_module_step="false")
+    _write_ini(pytester, log_file, sift_module_step="false")
     pytester.makepyfile(
         test_a=dedent(
             """
@@ -591,7 +567,7 @@ def test_y(self):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Two distinct TestFoo class steps — one per module — not a shared frame.
     assert len(by_name["TestFoo"]) == 2
@@ -605,7 +581,7 @@ def test_y(self):
 
 
 def test_package_step_default_opens_for_init_dirs(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """Default: a directory with ``__init__.py`` produces a parent package step."""
     pytester.mkpydir("pkg_a")
@@ -619,7 +595,7 @@ def test_one():
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "pkg_a" in by_name
     pkg_id = by_name["pkg_a"][0]["id"]
@@ -628,7 +604,7 @@ def test_one():
 
 
 def test_same_named_packages_in_different_dirs_do_not_merge(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """Two packages with the same display name but different paths must stay distinct.
 
@@ -663,7 +639,7 @@ def test_two():
     # name on disk don't collide during sys.path-based import.
     result = pytester.runpytest_subprocess("-v", "--import-mode=importlib")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Two distinct ``utils`` package steps — one per project.
     assert len(by_name["utils"]) == 2
@@ -677,10 +653,10 @@ def test_two():
 
 
 def test_sift_package_step_false_skips_package_steps(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     """With ``sift_package_step=false`` the directory step is suppressed."""
-    _write_ini(pytester, sift_package_step="false")
+    _write_ini(pytester, log_file, sift_package_step="false")
     pytester.mkpydir("pkg_a")
     (pytester.path / "pkg_a" / "test_x.py").write_text(
         dedent(
@@ -692,7 +668,7 @@ def test_one():
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert "pkg_a" not in by_name
     # The module step still opens and is now the top-level frame.
@@ -700,10 +676,11 @@ def test_one():
 
 
 def test_all_three_flags_false_matches_legacy_behavior(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     _write_ini(
         pytester,
+        log_file,
         sift_module_step="false",
         sift_class_step="false",
         sift_parametrize_nesting="false",
@@ -722,7 +699,7 @@ def test_a(self, v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # No module, class, or parametrize parents — just bracket-mangled leaves.
     assert "test_legacy.py" not in by_name
@@ -740,7 +717,7 @@ def test_a(self, v):
 
 
 def test_single_parametrize_clusters_under_originalname(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_rail=dedent(
@@ -755,7 +732,7 @@ def test_rail(v):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Module step + one shared `test_rail` parent + two leaves.
     assert len(by_name["test_rail.py"]) == 1
@@ -768,7 +745,7 @@ def test_rail(v):
 
 
 def test_stacked_parametrize_nests_outer_to_inner(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_iso=dedent(
@@ -784,7 +761,7 @@ def test_iso(voltage, component):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=4)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # One `test_iso` parent, two `voltage='…'` parents, four `component='…'` leaves.
     assert len(by_name["test_iso"]) == 1
@@ -806,7 +783,7 @@ def test_iso(voltage, component):
         assert leaf["parent_step_id"] in voltage_ids
 
 
-def test_fixture_parametrization_participates(pytester: pytest.Pytester, steps_file: Path) -> None:
+def test_fixture_parametrization_participates(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_widget=dedent(
             """
@@ -823,7 +800,7 @@ def test_widget(widget):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=2)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     assert len(by_name["test_widget"]) == 1
     parent_id = by_name["test_widget"][0]["id"]
@@ -832,7 +809,7 @@ def test_widget(widget):
 
 
 def test_module_boundary_isolates_parametrize_stack(
-    pytester: pytest.Pytester, steps_file: Path
+    pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
         test_a=dedent(
@@ -856,7 +833,7 @@ def test_two(w):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=4)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
     # Each module step contains its own `test_one`/`test_two` parametrize subtree.
     mod_a = by_name["test_a.py"][0]
@@ -865,9 +842,7 @@ def test_two(w):
     assert by_name["test_two"][0]["parent_step_id"] == mod_b["id"]
 
 
-def test_leaf_parent_chain_terminates_at_report(
-    pytester: pytest.Pytester, steps_file: Path
-) -> None:
+def test_leaf_parent_chain_terminates_at_report(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_chain=dedent(
             """
@@ -882,7 +857,7 @@ def test_chain(a, b):
     )
     result = pytester.runpytest_subprocess("-v")
     result.assert_outcomes(passed=1)
-    steps = json.loads(steps_file.read_text())
+    steps = capture.load_steps(log_file)
     leaf = next(s for s in steps if s["name"].startswith("b="))
     chain = _ancestor_names(steps, leaf)
     # leaf b=… → a=… → test_chain → test_chain.py (module step) → root
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
new file mode 100644
index 000000000..0e1540ce7
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -0,0 +1,562 @@
+"""Contract suite: maps each pytest exit path to the ``TestStatus`` the
+Sift pytest plugin is required to record on the outer step.
+
+Each scenario writes a tiny inner test file and runs it through pytester
+with ``--sift-offline`` and the Sift plugin loaded by a generated conftest.
+The plugin writes every API call to a JSONL log, which
+``_step_status_capture`` reads back so this outer test can assert on it.
+
+Assertions encode the contract from
+``docs/guides/pytest_plugin/pass_fail_behavior.md``. Tests for scenarios the
+plugin does not yet handle correctly are expected to **fail today** — they
+are the punch list. ``lib/sift_client/_tests/pytest_plugin/step_status_states.md``
+tracks each scenario's observed-today behavior next to the target so the
+remaining gaps are visible without running the suite.
+"""
+
+from __future__ import annotations
+
+import textwrap
+
+import pytest
+
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
+from sift_client.sift_types.test_report import TestStatus
+
+pytest_plugins = ["pytester"]
+
+
+_INNER_CONFTEST_SRC = '''
+"""Auto-generated conftest. Loading the Sift plugin is the only thing the
+inner session needs. ``--sift-offline`` on the CLI causes the plugin's
+default ``sift_client`` fixture to construct a placeholder client and the
+real ``ReportContext`` writes every API call to the JSONL log without
+contacting Sift.
+"""
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+'''
+
+
+@pytest.fixture
+def inner(pytester):
+    """Install the inner conftest. Returns ``pytester``."""
+    pytester.makeconftest(_INNER_CONFTEST_SRC)
+    return pytester
+
+
+# Prepended to every inner test file. Pytest skips marker-based ``skip`` items
+# before any autouse fixture runs, which would leave ``REPORT_CONTEXT`` unset
+# and the plugin's inline-skip recording inert. A single passing item up-front
+# forces ``report_context`` to initialize so the makereport hook can record
+# the skip into the same session's JSONL.
+_WARMUP = "def test_sift_warmup(): pass\n\n"
+
+
+def _run(pytester, body: str) -> None:
+    pytester.makepyfile(_WARMUP + textwrap.dedent(body))
+    log_path = pytester.path / "sift.log"
+    capture.set_log(log_path)
+    pytester.runpytest_inprocess(
+        "--sift-offline",
+        f"--sift-log-file={log_path}",
+        "--no-sift-git-metadata",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Call-phase exit paths
+# ---------------------------------------------------------------------------
+
+
+def test_pass_maps_to_passed(inner):
+    # Case: CALL-01
+    _run(
+        inner,
+        """
+        def test_x():
+            assert True
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.PASSED
+
+
+def test_assert_failure_maps_to_failed(inner):
+    # Case: CALL-02
+    _run(
+        inner,
+        """
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_generic_exception_maps_to_error(inner):
+    # Case: CALL-03
+    _run(
+        inner,
+        """
+        def test_x():
+            raise ValueError("boom")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_system_exit_maps_to_aborted(inner):
+    # Case: CALL-05
+    _run(
+        inner,
+        """
+        import sys
+        def test_x():
+            sys.exit(1)
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ABORTED
+
+
+def test_pytest_fail_maps_to_failed(inner):
+    # Case: CALL-04
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.fail("intentional failure")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_keyboard_interrupt_leaves_step_in_progress(inner):
+    # Case: CALL-06
+    # KeyboardInterrupt aborts the session before the call-phase makereport
+    # fires; the plugin can't observe the interrupt. The contract is that
+    # the step is left in IN_PROGRESS rather than being silently resolved
+    # to PASSED — a session-aborting interrupt should not look like a clean
+    # pass in the report.
+    try:
+        _run(
+            inner,
+            """
+            def test_x():
+                raise KeyboardInterrupt
+            """,
+        )
+    except KeyboardInterrupt:
+        pass
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.IN_PROGRESS
+
+
+def test_substep_exception_records_error_with_failed_parent(inner):
+    # Case: CALL-07
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner"):
+                raise ValueError("boom")
+        """,
+    )
+    # Only the originating substep records ERROR. The test step inherits the
+    # child-failed signal and resolves to FAILED, even though the same
+    # ValueError propagated through its scope.
+    inner_sub = next(iter(capture.steps_by_name("inner")), None)
+    test_x = capture.test_step("test_x")
+    assert inner_sub is not None
+    assert test_x is not None
+    assert inner_sub.statuses[-1] == TestStatus.ERROR
+    assert test_x.statuses[-1] == TestStatus.FAILED
+
+
+# ---------------------------------------------------------------------------
+# Skip paths
+# ---------------------------------------------------------------------------
+
+
+def test_pytest_skip_in_body_maps_to_skipped(inner):
+    # Case: SKIP-03
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.skip("not today")
+        """,
+    )
+    # Runtime skip in the body resolves the outer step to SKIPPED. The
+    # makereport hook must not create a duplicate nested step.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.SKIPPED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
+
+
+def test_pytest_mark_skip_records_skipped(inner):
+    # Case: SKIP-01
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skip(reason="collection-time skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # Collection-time skip: the autouse step fixture never runs. Only the
+    # makereport hook creates a step, with status SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_pytest_mark_skipif_records_skipped(inner):
+    # Case: SKIP-02
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skipif(True, reason="conditional skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # `skipif` with a truthy condition follows the same path as
+    # `@pytest.mark.skip`; only the makereport hook records a step.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_skip_inside_fixture_setup(inner):
+    # Case: SKIP-04
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def skipping_fixture():
+            pytest.skip("environment not ready")
+
+        def test_x(skipping_fixture):
+            assert True
+        """,
+    )
+    # A setup-phase skip resolves the outer step to SKIPPED. The makereport
+    # hook must not create a duplicate nested step.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.SKIPPED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
+
+
+# ---------------------------------------------------------------------------
+# xfail / xpass
+# ---------------------------------------------------------------------------
+
+
+def test_xfail_marked_test_that_fails(inner):
+    # Case: XFAIL-01
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="known issue")
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    # xfail + expected failure fulfills the contract; outer step resolves to
+    # PASSED. No duplicate nested step from the makereport hook.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+    duplicates = [s for s in capture.steps_by_name("test_x") if s is not outer]
+    assert not duplicates, f"expected no duplicate nested step; got {len(duplicates)}"
+
+
+def test_xfail_strict_unexpected_pass(inner):
+    # Case: XFAIL-02
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(strict=True, reason="should fail")
+        def test_x():
+            assert True
+        """,
+    )
+    # strict xfail that passes must surface as FAILED: either the bug was
+    # fixed (remove the mark) or the test stopped exercising what it claimed.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_xfail_non_strict_unexpected_pass(inner):
+    # Case: XFAIL-03
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="might pass sometimes")
+        def test_x():
+            assert True
+        """,
+    )
+    # Non-strict xfail does not insist on the failure, so a passing run is
+    # PASSED.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+def test_xfail_raises_mismatch(inner):
+    # Case: XFAIL-04
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(raises=ValueError, reason="expected ValueError")
+        def test_x():
+            raise KeyError("wrong exception")
+        """,
+    )
+    # `raises=` mismatch is a real test failure — the contract required a
+    # specific exception type and a different one was thrown.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_xfail_run_false(inner):
+    # Case: XFAIL-05
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(run=False, reason="never run")
+        def test_x():
+            assert False
+        """,
+    )
+    # The test never ran; outer step is SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+# ---------------------------------------------------------------------------
+# Setup-phase / teardown-phase fixture failures
+# ---------------------------------------------------------------------------
+
+
+def test_setup_phase_fixture_failure(inner):
+    # Case: PHASE-01
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_setup():
+            raise RuntimeError("setup boom")
+
+        def test_x(bad_setup):
+            assert True
+        """,
+    )
+    # A fixture that raises before `yield` fails the setup phase. The outer
+    # step must surface this as ERROR; the test body never executed and a
+    # silently green step would hide the failure.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
+
+
+def test_teardown_phase_fixture_failure(inner):
+    # Case: PHASE-02
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert True
+        """,
+    )
+    # A fixture that raises after `yield` fails the teardown phase. The
+    # outer step's status reflects the teardown failure as FAILED rather
+    # than the call-phase pass.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_call_fail_plus_teardown_fail(inner):
+    # Case: PHASE-03
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert 1 == 2
+        """,
+    )
+    # Call-phase failure dominates the outer step status; the contract also
+    # requires the teardown error to be surfaced somewhere on the step
+    # (mechanism TBD — see pass_fail_behavior.md). This test asserts the
+    # status today; tighten once a surfacing mechanism is chosen.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+# ---------------------------------------------------------------------------
+# Collection-phase failures
+# ---------------------------------------------------------------------------
+
+
+def test_missing_fixture_maps_to_error(inner):
+    # Case: COLL-01
+    _run(
+        inner,
+        """
+        def test_x(nonexistent_fixture):
+            assert True
+        """,
+    )
+    # An unresolved fixture is a setup-phase failure. The outer step
+    # surfaces as ERROR rather than a misleading green pass for a test
+    # that never executed.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ERROR
+
+
+# ---------------------------------------------------------------------------
+# Plugin-API exit paths (in-test mutations)
+# ---------------------------------------------------------------------------
+
+
+def test_manual_status_update_to_failed(inner):
+    # Case: API-01
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            step.current_step.update({"status": TestStatus.FAILED})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_report_outcome_false_maps_to_failed(inner):
+    # Case: API-02
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.report_outcome("the_check", False, "did not match")
+        """,
+    )
+    # Outer step sees a failed substep and rolls up to FAILED.
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_measure_out_of_bounds_maps_to_failed(inner):
+    # Case: API-03
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_substep_failure_propagates_to_parent(inner):
+    # Case: API-04
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner") as inner_step:
+                inner_step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    # `test_measure_out_of_bounds_maps_to_failed` exercises a failed
+    # measurement on the function step itself; this one verifies the same
+    # failure on a nested substep propagates up to the parent.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.FAILED
+
+
+def test_skipped_substep_does_not_fail_parent(inner):
+    # Case: API-05
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            with step.substep(name="optional_check") as cal:
+                cal.current_step.update(
+                    {"status": TestStatus.SKIPPED},
+                    log_file=step.report_context.log_file,
+                )
+        """,
+    )
+    # A manually-resolved SKIPPED on a substep must not propagate as a failure
+    # to the parent. The outer step has no measurements of its own and resolves
+    # to PASSED.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.PASSED
+
+
+def test_abort_inside_substep_marks_every_open_step_aborted(inner):
+    # Case: API-06
+    _run(
+        inner,
+        """
+        import sys
+        def test_x(step):
+            with step.substep(name="completed_sub"):
+                pass
+            with step.substep(name="outer_sub") as outer_sub:
+                with outer_sub.substep(name="inner_sub"):
+                    sys.exit(1)
+        """,
+    )
+    # SystemExit unwinds the substep stack on the way out. Every step that was
+    # open when the abort fired (inner substep, outer substep, test step)
+    # must record ABORTED. The sibling substep that closed cleanly before the
+    # abort must retain its PASSED status.
+    outer = capture.test_step("test_x")
+    assert outer is not None
+    assert outer.statuses[-1] == TestStatus.ABORTED
+    outer_sub = next(iter(capture.steps_by_name("outer_sub")), None)
+    inner_sub = next(iter(capture.steps_by_name("inner_sub")), None)
+    completed_sub = next(iter(capture.steps_by_name("completed_sub")), None)
+    assert outer_sub is not None
+    assert inner_sub is not None
+    assert completed_sub is not None
+    assert outer_sub.statuses[-1] == TestStatus.ABORTED
+    assert inner_sub.statuses[-1] == TestStatus.ABORTED
+    assert completed_sub.statuses[-1] == TestStatus.PASSED
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 7c4c1c2f5..c3b303ac8 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -5,14 +5,16 @@
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
+from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, Generator, Tuple
 
 import pytest
 
 from sift_client import SiftClient, SiftConnectionConfig
 from sift_client.errors import SiftWarning
-from sift_client.sift_types.test_report import TestStatus
+from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import format_truncated_traceback
 
 
 class SiftPytestPluginWarning(SiftWarning):
@@ -508,17 +510,162 @@ def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool |
     return Path(raw)
 
 
+def _error_info_from_longrepr(longrepr: Any) -> ErrorInfo:
+    """Fall back to the report's longrepr when no Python exception is available."""
+    return ErrorInfo(error_code=1, error_message=str(longrepr) if longrepr is not None else "")
+
+
+def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
+    """Resolve the function step's status from pytest's per-phase reports.
+
+    Reads ``_sift_phase_setup`` / ``_sift_phase_call`` and the test's xfail marker,
+    then mutates ``new_step.current_step`` in place and flips
+    ``new_step._sift_managed_externally`` so ``NewStep.__exit__`` emits the
+    resolved status without re-classifying.
+
+    When the call phase reports ``passed`` and no override is needed (i.e. the
+    test's own status or substep failures should drive the result), this leaves
+    the step alone so the default ``__exit__`` resolution stays in charge.
+    """
+    current_step = new_step.current_step
+    if current_step is None:
+        # The step never opened (the autouse fixture short-circuited or was
+        # disabled). Nothing to resolve.
+        return
+    setup_phase = getattr(item, "_sift_phase_setup", None)
+    call_phase = getattr(item, "_sift_phase_call", None)
+    xfail_marker = item.get_closest_marker("xfail")
+    xfail_runs = xfail_marker.kwargs.get("run", True) if xfail_marker is not None else True
+
+    status: TestStatus | None = None
+    error_info: ErrorInfo | None = None
+    keep_managed = False
+
+    if setup_phase is not None and setup_phase.report.outcome == "failed":
+        status = TestStatus.ERROR
+        excinfo = setup_phase.call.excinfo
+        if excinfo is not None:
+            error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+        else:
+            error_info = _error_info_from_longrepr(setup_phase.report.longrepr)
+    elif setup_phase is not None and setup_phase.report.outcome == "skipped":
+        status = TestStatus.SKIPPED
+    elif call_phase is None:
+        # Setup completed but the call-phase report never fired — the test
+        # session was aborted (e.g. by KeyboardInterrupt) before the
+        # plugin could observe the outcome. Leave the step at IN_PROGRESS so
+        # the report does not lie about a clean pass.
+        keep_managed = True
+    else:
+        wasxfail = getattr(call_phase.report, "wasxfail", None)
+        if wasxfail is not None:
+            if call_phase.report.outcome == "failed":
+                # A failed report that still carries ``wasxfail``. NOTE(review): pytest
+                # reports strict xpass without ``wasxfail``; verify this branch is reachable.
+                status = TestStatus.FAILED
+            elif call_phase.report.outcome == "skipped":
+                if xfail_marker is not None and xfail_runs is False:
+                    # xfail(run=False): the test body never executed.
+                    status = TestStatus.SKIPPED
+                else:
+                    # xfail + expected failure: the test fulfilled its xfail expectation.
+                    status = TestStatus.PASSED
+            else:
+                # Non-strict xpass: passes that weren't required to fail.
+                status = TestStatus.PASSED
+        elif call_phase.report.outcome == "passed":
+            # Default __exit__ resolves PASSED/FAILED from open_step_results and any
+            # status the test code may have set. Don't override it here.
+            return
+        elif call_phase.report.outcome == "skipped":
+            status = TestStatus.SKIPPED
+        elif call_phase.report.outcome == "failed":
+            excinfo = call_phase.call.excinfo
+            children_passed = new_step.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
+            if excinfo is None:
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, AssertionError):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, pytest.fail.Exception):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
+                # Hard exits the plugin can observe: pytest converted the
+                # raise into a call-phase report. The session-aborting variant
+                # (call_phase is None) lands earlier and stays IN_PROGRESS.
+                status = TestStatus.ABORTED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif xfail_marker is not None:
+                # xfail(raises=X) with a non-matching exception: the contract failed.
+                status = TestStatus.FAILED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif not children_passed:
+                # A substep already recorded the error and carries the traceback;
+                # the test step only inherits the child-failed signal.
+                status = TestStatus.FAILED
+            else:
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+
+    if status is None and not keep_managed:
+        return
+
+    if status is not None:
+        # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
+        current_step.__dict__["status"] = status
+        if error_info is not None:
+            current_step.__dict__["error_info"] = error_info
+    new_step._sift_managed_externally = True
+
+
+def _finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestReport) -> None:
+    """Upgrade a closed step to FAILED when the teardown phase failed.
+
+    The autouse step fixture has already exited by the time the teardown
+    makereport hook fires, so call ``step.update`` again to override the status
+    server-side and propagate the failure to the still-open parent step.
+    """
+    step: NewStep | None = getattr(item, "_sift_step", None)
+    if step is None:
+        return
+    current_step = step.current_step
+    if current_step is None:
+        return
+    if teardown_report.outcome == "failed" and current_step.status == TestStatus.PASSED:
+        current_step.update({"status": TestStatus.FAILED})
+        step.report_context.mark_step_failed_after_close(current_step)
+
+
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
 def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """Capture pytest outcomes so assertion failures and skips land on the Sift step."""
+    """Capture per-phase reports and finalize step status after teardown.
+
+    Stashes both ``rep_<when>`` (the ``CallInfo``, kept for pytest plugins that
+    expect that conventional attribute) and ``_sift_phase_<when>`` (a
+    ``SimpleNamespace(call, report)`` used by ``_resolve_initial_status``). The
+    collection-time skip path is strictly gated on ``_sift_step`` being unset
+    so it does not duplicate steps the fixture already created.
+    """
     outcome = yield
     report = outcome.get_result()
-    if report.outcome == "skipped":
-        # Skipped tests bypass the autouse `step` fixture, so we record the step manually here.
-        if REPORT_CONTEXT:
-            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
-                new_step.current_step.update({"status": TestStatus.SKIPPED})
     setattr(item, "rep_" + report.when, call)
+    setattr(item, "_sift_phase_" + report.when, SimpleNamespace(call=call, report=report))
+
+    # Collection-time skip (``@pytest.mark.skip`` / ``skipif``): the autouse
+    # ``step`` fixture never runs, so the hook is the only place that can
+    # record a step. Presence of ``_sift_step`` is the "fixture ran" signal.
+    if (
+        REPORT_CONTEXT
+        and report.when == "setup"
+        and report.outcome == "skipped"
+        and getattr(item, "_sift_step", None) is None
+    ):
+        with REPORT_CONTEXT.new_step(name=item.name) as inline_step:
+            inline_step.current_step.update({"status": TestStatus.SKIPPED})
+
+    if report.when == "teardown":
+        _finalize_after_teardown(item, report)
 
 
 def _report_context_impl(
@@ -748,13 +895,9 @@ def _step_impl(
     with report_context.new_step(
         name=name, description=existing_docstring, assertion_as_fail_not_error=False
     ) as new_step:
+        node._sift_step = new_step
         yield new_step
-        if hasattr(node, "rep_call") and node.rep_call.excinfo:
-            new_step.update_step_from_result(
-                node.rep_call.excinfo,
-                node.rep_call.excinfo.value,
-                node.rep_call.excinfo.tb,
-            )
+        _resolve_initial_status(new_step, node)
 
 
 @pytest.fixture(autouse=True)
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index bd2ec917f..3454ef5e2 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -43,6 +43,17 @@
 logger = logging.getLogger(__name__)
 
 
+def format_truncated_traceback(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+    tb: object | None,
+) -> ErrorInfo:
+    """Format an ErrorInfo from a traceback, keeping the first entry and the last 10."""
+    stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore[arg-type]
+    stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
+    return ErrorInfo(error_code=1, error_message="".join(stack))
+
+
 def log_replay_instructions(log_file: str | Path | None) -> None:
     """Surface replay instructions when an import/replay attempt fails.
 
@@ -363,30 +374,33 @@ def record_step_outcome(self, outcome: bool, step: TestStep):
             self.open_step_results[step.step_path] = False
             self.any_failures = True
 
-    def resolve_and_propagate_step_result(
-        self,
-        step: TestStep,
-        error_info: ErrorInfo | None = None,
-    ) -> bool:
-        """Resolve the result of a step and propagate the result to the parent step if it failed."""
-        result = self.open_step_results.get(step.step_path, True)
-        if error_info:
-            result = False
-        if step.status != TestStatus.IN_PROGRESS:
-            # The step was manually completed so use that result.
-            # Skipped steps are considered passed.
-            result = step.status in (TestStatus.PASSED, TestStatus.SKIPPED)
-
-        # Update the parent step results if this step failed (true by default so no need to do anything if we didn't fail).
-        if not result:
+    def mark_step_failed_after_close(self, step: TestStep):
+        """Mark a step's parent as failed after the step has already been popped from the stack.
+
+        Used by the pytest plugin when a teardown-phase report fires after the
+        fixture's ``__exit__`` has already resolved and exited the step.
+        """
+        self.any_failures = True
+        path_parts = step.step_path.split(".")
+        if len(path_parts) > 1:
+            self.open_step_results[".".join(path_parts[:-1])] = False
+
+    def propagate_step_result(self, step: TestStep, status: TestStatus) -> bool:
+        """Propagate this step's final status to the parent step.
+
+        Status is the governor: anything outside ``{PASSED, SKIPPED}`` counts
+        as a failure for the parent. ``error_info`` is intentionally not
+        consulted here; it is free-form diagnostic data that may sit on a
+        step regardless of status.
+        """
+        succeeded = status in (TestStatus.PASSED, TestStatus.SKIPPED)
+        if not succeeded:
             self.any_failures = True
             self.open_step_results[step.step_path] = False
             path_parts = step.step_path.split(".")
             if len(path_parts) > 1:
-                parent_step_path = ".".join(path_parts[:-1])
-                self.open_step_results[parent_step_path] = False
-
-        return result
+                self.open_step_results[".".join(path_parts[:-1])] = False
+        return succeeded
 
     def exit_step(self, step: TestStep):
         """Exit a step and update the report context."""
@@ -407,6 +421,10 @@ class NewStep(AbstractContextManager):
     client: SiftClient
     assertion_as_fail_not_error: bool = True
     current_step: TestStep | None = None
+    # Set by the pytest plugin's ``_resolve_initial_status`` to signal that
+    # status was already resolved upstream and ``__exit__`` should skip
+    # re-classifying. Read via ``getattr`` so unset is treated as False.
+    _sift_managed_externally: bool = False
 
     def __init__(
         self,
@@ -471,34 +489,55 @@ def update_step_from_result(
 
         returns: The false if step failed or errored, true otherwise.
         """
+        current_step = self.current_step
+        if current_step is None:
+            # The step was never opened; nothing to resolve. Treat as a pass
+            # so callers that branch on the return value don't see a spurious
+            # failure.
+            return True
+
         error_info = None
-        assert self.current_step is not None
+        aborted = False
+        errored = False
         if exc:
             if isinstance(exc_value, AssertionError) and not self.assertion_as_fail_not_error:
                 # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
-                self.report_context.record_step_outcome(False, self.current_step)
+                self.report_context.record_step_outcome(False, current_step)
+            elif isinstance(exc_value, (KeyboardInterrupt, SystemExit)):
+                # Hard exit propagating through the substep stack: record as
+                # ABORTED so every in-progress step on the way out reflects
+                # the abort rather than coercing to ERROR.
+                aborted = True
+                error_info = format_truncated_traceback(exc, exc_value, tb)
             else:
-                stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore
-                stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
-                trace = "".join(stack)
-                error_info = ErrorInfo(
-                    error_code=1,
-                    error_message=trace,
-                )
-
-        # Resolve the status of this step (i.e. fail if children failed) and propagate the result to the parent step.
-        result = self.report_context.resolve_and_propagate_step_result(
-            self.current_step, error_info
-        )
-
-        # Mark the step as completed
-        status = self.current_step.status
+                errored = True
+                error_info = format_truncated_traceback(exc, exc_value, tb)
+
+        # Status is the governor: anything other than IN_PROGRESS was set
+        # deliberately (manual override, plugin pre-resolution, etc.) and must
+        # not be silently overwritten by side-channel signals. When the step is
+        # still IN_PROGRESS, resolve from independent state: aborts first, then
+        # a child-failed signal (parents inherit FAILED, not the originating
+        # ERROR), then the step's own captured exception, then the children-pass
+        # default. error_info is diagnostic and never drives status.
+        status = current_step.status
         if status == TestStatus.IN_PROGRESS:
-            # Update the status only if the step was in progress i.e. not updated elsewhere.
-            status = TestStatus.PASSED if result else TestStatus.FAILED
-        if error_info:
-            status = TestStatus.ERROR
-        self.current_step.update(
+            children_passed = self.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
+            if aborted:
+                status = TestStatus.ABORTED
+            elif not children_passed:
+                status = TestStatus.FAILED
+            elif errored:
+                status = TestStatus.ERROR
+            else:
+                status = TestStatus.PASSED
+
+        # Propagate based on the resolved status; error_info rides along as
+        # pure diagnostics and does not affect propagation.
+        result = self.report_context.propagate_step_result(current_step, status)
+        current_step.update(
             {
                 "status": status,
                 "end_time": datetime.now(timezone.utc),
@@ -509,6 +548,28 @@ def update_step_from_result(
         return result
 
     def __exit__(self, exc, exc_value, tb):
+        if getattr(self, "_sift_managed_externally", False):
+            # The pytest fixture already resolved status from phase reports.
+            # Propagate based on that resolved status, emit one update_step
+            # with the resolved values, and pop from the stack without
+            # re-classifying.
+            current_step = self.current_step
+            if current_step is None:
+                # The step was never opened; nothing to propagate.
+                return True
+            result = self.report_context.propagate_step_result(current_step, current_step.status)
+            current_step.update(
+                {
+                    "status": current_step.status,
+                    "end_time": datetime.now(timezone.utc),
+                    "error_info": current_step.error_info,
+                },
+            )
+            self.report_context.exit_step(current_step)
+            if hasattr(self, "force_result"):
+                result = self.force_result
+            return result
+
         result = self.update_step_from_result(exc, exc_value, tb)
 
         # Now that the step is updated. Let the report context handle removing it from the stack and updating the report context.
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index 5108b7e4a..af174aa4f 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -62,6 +62,9 @@ nav:
         # Will migrate to Guides in the future
       - Pytest Plugin: examples/pytest_plugin.md
       - Pytest Plugin Quickstart: examples/pytest_plugin_quickstart.md
+  - Guides:
+      - Pytest Plugin:
+          - Pass/Fail Behavior: guides/pytest_plugin/pass_fail_behavior.md
 #  - Guides:
 #      - Logging
 #      - Error Handling

From 45a5f8d78c15d8b907e7a53ee7f7938c5ac65755 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 12:38:40 -0700
Subject: [PATCH 06/19] Python(feat): report assertion message in report as
 error info for pytest plugin (#587)

---
 python/docs/examples/pytest_plugin.md         | 12 ++--
 .../docs/examples/pytest_plugin_quickstart.md | 13 +++--
 .../pytest_plugin/pass_fail_behavior.md       |  6 +-
 python/examples/pytest_plugin/README.md       | 15 ++---
 .../tests/with_sift/test_with_sift_demo.py    | 17 +++---
 .../pytest_plugin/_step_status_capture.py     | 10 ++++
 .../_tests/pytest_plugin/test_pass_fail.py    | 55 +++++++++++++++++++
 .../_tests/util/test_test_results_utils.py    |  6 +-
 python/lib/sift_client/pytest_plugin.py       |  6 +-
 .../util/test_results/context_manager.py      | 47 +++++++++++++---
 10 files changed, 150 insertions(+), 37 deletions(-)

diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index 5a40d450d..69dde25ae 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -86,7 +86,7 @@ def sift_client() -> SiftClient:
 | Name | Kind | Scope | Purpose |
 |---|---|---|---|
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
-| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
+| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `fail_if_measurements_failed`, and `current_step`. |
 | `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently — see [ini options](#ini-options). |
 | `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
 | `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
@@ -263,13 +263,15 @@ def test_no_fixtures_still_creates_a_step():
 def test_measure_a_single_value(step):
     """Take `step` explicitly when you want to record a measurement."""
     voltage = 4.97
-    passed = step.measure(
+    step.measure(
         name="battery_voltage",
         value=voltage,
         bounds={"min": 4.8, "max": 5.2},
         unit="V",
     )
-    assert passed, f"voltage {voltage}V out of bounds"
+    # An out-of-bounds measurement already marks the step FAILED. Call this at
+    # the end to also fail pytest, without an assertion message in error_info.
+    step.fail_if_measurements_failed()
 
 
 def test_measure_strings_and_booleans(step):
@@ -612,8 +614,8 @@ def test_only_outliers_recorded(step):
         unit="psi",
     )
     # Returns False because 99.9 is out of bounds. The step is already
-    # marked failed; raise here only if you also want pytest to fail.
-    assert all_in_bounds
+    # marked failed; call this only if you also want pytest to fail.
+    step.fail_if_measurements_failed()
 ```
 
 !!! note "`measure_all` requires at least one bound"
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
index 54328c707..bd8414aa7 100644
--- a/python/docs/examples/pytest_plugin_quickstart.md
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -136,7 +136,7 @@ TestReport (FAILED, since failures propagate up from leaves)
         │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
         ├── test_measure_series                                      PASSED
         ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
-        ├── test_assert_measurements_passed_at_end                                FAILED  (pytest FAILED)
+        ├── test_fail_if_measurements_failed_at_end                  FAILED  (pytest FAILED)
         ├── test_report_level_metadata                               PASSED
         └── TestClassStep
             ├── test_parametrize
@@ -158,12 +158,13 @@ The `with_sift` module shows two patterns for handling measurement results:
 `test_failed_measurement_marks_sift_step_failed` lets the test keep passing
 in pytest while the Sift step is `FAILED` (useful when measurements are
 diagnostic data you want to collect regardless of outcome); and
-`test_assert_measurements_passed_at_end` takes every measurement first and
-then asserts `step.measurements_passed` once at the end, so every
+`test_fail_if_measurements_failed_at_end` takes every measurement first and
+then calls `step.fail_if_measurements_failed()` once at the end, so every
 measurement still lands in the report even when one fails. The end-of-test
-assertion is the recommended pattern: asserting on an individual
-`step.measure(...)` call short-circuits on the first failure and skips
-every measurement that follows. Expected
+call is the recommended pattern: it fails via `pytest.fail` (no assertion
+noise in `error_info`), and, unlike asserting on an individual
+`step.measure(...)` call, it does not short-circuit on the first failure and
+skip every measurement that follows. Expected
 pytest output is `16 passed, 3 failed, 1 skipped`.
 
 Flip any of the `sift_*_step` / `sift_parametrize_nesting` flags in
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
index 6e9b1d6e3..5c0f178c2 100644
--- a/python/docs/guides/pytest_plugin/pass_fail_behavior.md
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -26,8 +26,10 @@ The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
 | `pytest.fail("...")` from the body        | `pytest.fail("intentional failure")` | `FAILED` |
 | Uncaught non-assertion exception          | `raise ValueError("boom")`           | `ERROR`  |
 
-A non-assertion exception gets its formatted traceback recorded on
-`step.error_info.error_message`.
+An assertion failure records the concise assertion message (the exception
+line(s), no traceback frames) on `step.error_info.error_message` while still
+mapping to `FAILED`. A non-assertion exception gets its formatted traceback
+recorded on `step.error_info.error_message`.
 
 ## Hard exits
 
diff --git a/python/examples/pytest_plugin/README.md b/python/examples/pytest_plugin/README.md
index c74a9c939..6eeaf9a34 100644
--- a/python/examples/pytest_plugin/README.md
+++ b/python/examples/pytest_plugin/README.md
@@ -75,7 +75,7 @@ TestReport (FAILED, since failures propagate up from leaves)
         │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
         ├── test_measure_series                                      PASSED
         ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
-        ├── test_assert_measurements_passed_at_end                                FAILED  (pytest FAILED)
+        ├── test_fail_if_measurements_failed_at_end                  FAILED  (pytest FAILED)
         ├── test_report_level_metadata                               PASSED
         └── TestClassStep
             ├── test_parametrize
@@ -97,12 +97,13 @@ The `with_sift` module shows two patterns for handling measurement results:
 `test_failed_measurement_marks_sift_step_failed` lets the test keep passing
 in pytest while the Sift step is `FAILED` (useful when measurements are
 diagnostic data you want to collect regardless of outcome); and
-`test_assert_measurements_passed_at_end` takes every measurement first and
-then asserts `step.measurements_passed` once at the end, so every
+`test_fail_if_measurements_failed_at_end` takes every measurement first and
+then calls `step.fail_if_measurements_failed()` once at the end, so every
 measurement still lands in the report even when one fails. The end-of-test
-assertion is the recommended pattern: asserting on an individual
-`step.measure(...)` call short-circuits on the first failure and skips
-every measurement that follows. Expected
+call is the recommended pattern: it fails via `pytest.fail` (no assertion
+noise in `error_info`), and, unlike asserting on an individual
+`step.measure(...)` call, it does not short-circuit on the first failure and
+skip every measurement that follows. Expected
 pytest output is `16 passed, 3 failed, 1 skipped`.
 
 Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
@@ -115,5 +116,5 @@ Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
 | `conftest.py` | Plugin registration via `pytest_plugins`; optional `load_dotenv()` |
 | `pytest.ini` | The four nesting flags + git metadata flag at their defaults |
 | `tests/pytest_only/test_pytest_only_demo.py` | Plain pytest tests with no Sift APIs. The plugin captures pass/fail automatically; covers functions, fixtures, parametrize, classes, plus one each of `AssertionError` (FAILED), `pytest.skip` (SKIPPED), and a raised `ValueError` (ERROR) |
-| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `assert step.measurements_passed` end-of-test pattern that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
+| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `step.fail_if_measurements_failed()` end-of-test call that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
 | `tests/{pytest_only,with_sift}/__init__.py` | Each Python package (directory with `__init__.py`) becomes a parent step in the report tree |
diff --git a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
index 34bf602b7..ee3eef513 100644
--- a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
+++ b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
@@ -94,24 +94,25 @@ def test_failed_measurement_marks_sift_step_failed(step) -> None:
     )
 
 
-def test_assert_measurements_passed_at_end(step) -> None:
-    """Recommended pattern: take every measurement first, then assert
-    ``step.measurements_passed`` once at the end.
+def test_fail_if_measurements_failed_at_end(step) -> None:
+    """Recommended pattern: take every measurement first, then call
+    ``step.fail_if_measurements_failed()`` once at the end.
 
     Asserting on individual ``step.measure(...)`` calls raises
     ``AssertionError`` on the first failure, so any measurements after the
     failing one never run and never land in the Sift report. The end-of-test
-    assertion is strictly better for diagnostic completeness: every
-    measurement is recorded, including the failures, and the aggregate
-    result is then folded into the pytest outcome.
+    call is strictly better for diagnostic completeness: every measurement is
+    recorded, including the failures, and the aggregate result is then folded
+    into the pytest outcome. It fails via ``pytest.fail`` rather than an
+    assertion, so the failed step carries no assertion noise in ``error_info``.
 
     The ``b`` measurement below is deliberately out of bounds. ``c`` still
-    runs and is recorded; only the final ``assert`` fires.
+    runs and is recorded; only the final call fails the test.
     """
     step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
     step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})  # out of bounds
     step.measure(name="c", value=1.5, bounds={"min": 0.0, "max": 2.0})  # still recorded
-    assert step.measurements_passed, "one or more measurements out of bounds"
+    step.fail_if_measurements_failed()
 
 
 def test_report_level_metadata(step, report_context) -> None:
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
index e92d1726e..77e09bdf5 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -27,6 +27,7 @@ class CapturedStep:
     step_path: str
     parent_step_id: str | None
     statuses: list[TestStatus] = field(default_factory=list)
+    error_messages: list[str] = field(default_factory=list)
 
 
 _PROTO_STATUS_NAMES = {
@@ -58,6 +59,7 @@ def parse_log(log_path: Path) -> dict[str, CapturedStep]:
     for request_type, response_id, json_str in iter_log_data_lines(log_path):
         payload = json.loads(json_str)
         test_step = payload.get("testStep", {})
+        error_message = test_step.get("errorInfo", {}).get("errorMessage")
         if request_type == "CreateTestStep" and response_id:
             steps[response_id] = CapturedStep(
                 step_id=response_id,
@@ -65,12 +67,15 @@ def parse_log(log_path: Path) -> dict[str, CapturedStep]:
                 step_path=test_step.get("stepPath", ""),
                 parent_step_id=test_step.get("parentStepId") or None,
                 statuses=[_status(test_step.get("status"))],
+                error_messages=[error_message] if error_message else [],
             )
         elif request_type == "UpdateTestStep":
             step_id = test_step.get("testStepId")
             new_status = test_step.get("status")
             if step_id and step_id in steps and new_status is not None:
                 steps[step_id].statuses.append(_status(new_status))
+                if error_message:
+                    steps[step_id].error_messages.append(error_message)
     return steps
 
 
@@ -117,6 +122,11 @@ def final_status(name: str) -> TestStatus | None:
     return step.statuses[-1] if step and step.statuses else None
 
 
+def final_error_message(name: str) -> str | None:
+    step = test_step(name)
+    return step.error_messages[-1] if step and step.error_messages else None
+
+
 def load_steps(log_path: Path) -> list[dict]:
     """Load the offline log as a list of step records keyed by hierarchy fields.
 
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index 0e1540ce7..d5f9674ce 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -91,6 +91,12 @@ def test_x():
         """,
     )
     assert capture.final_status("test_x") == TestStatus.FAILED
+    # The concise assertion message is recorded on error_info for the UI, but
+    # without the full traceback frames.
+    message = capture.final_error_message("test_x")
+    assert message is not None
+    assert "assert 1 == 2" in message
+    assert "Traceback (most recent call last)" not in message
 
 
 def test_generic_exception_maps_to_error(inner):
@@ -131,6 +137,34 @@ def test_x():
     assert capture.final_status("test_x") == TestStatus.FAILED
 
 
+def test_fail_if_measurements_failed_fails_without_error_info(inner):
+    # An out-of-bounds measurement plus step.fail_if_measurements_failed()
+    # fails the test via pytest.fail, so the step is FAILED with no assertion
+    # message in error_info (the reason this helper exists over `assert`).
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})
+            step.fail_if_measurements_failed()
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+    assert capture.final_error_message("test_x") is None
+
+
+def test_fail_if_measurements_failed_passes_when_in_bounds(inner):
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
+            step.fail_if_measurements_failed()
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.PASSED
+
+
 def test_keyboard_interrupt_leaves_step_in_progress(inner):
     # Case: CALL-06
     # KeyboardInterrupt aborts the session before the call-phase makereport
@@ -174,6 +208,27 @@ def test_x(step):
     assert test_x.statuses[-1] == TestStatus.FAILED
 
 
+def test_substep_assert_failure_records_message_with_failed(inner):
+    # Case: CALL-02 (substep). A substep inherits assertion_as_fail_not_error
+    # from the autouse step (False under pytest), so a failed assertion in a
+    # substep resolves to FAILED and records the concise assertion message.
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner"):
+                assert 1 == 2
+        """,
+    )
+    inner_sub = next(iter(capture.steps_by_name("inner")), None)
+    assert inner_sub is not None
+    assert inner_sub.statuses[-1] == TestStatus.FAILED
+    assert inner_sub.error_messages
+    message = inner_sub.error_messages[-1]
+    assert "assert 1 == 2" in message
+    assert "Traceback (most recent call last)" not in message
+
+
 # ---------------------------------------------------------------------------
 # Skip paths
 # ---------------------------------------------------------------------------
diff --git a/python/lib/sift_client/_tests/util/test_test_results_utils.py b/python/lib/sift_client/_tests/util/test_test_results_utils.py
index 4fd6ab112..c41587314 100644
--- a/python/lib/sift_client/_tests/util/test_test_results_utils.py
+++ b/python/lib/sift_client/_tests/util/test_test_results_utils.py
@@ -463,7 +463,11 @@ def test_bad_assert(self, report_context, step):
         assert parent_step.status == TestStatus.FAILED
         assert substep.status == TestStatus.FAILED
         assert nested_substep.status == TestStatus.FAILED
-        assert nested_substep.error_info is None
+        # The assertion-as-fail path records the concise assertion message (no
+        # traceback frames) on error_info while keeping the FAILED status.
+        assert nested_substep.error_info is not None
+        assert "AssertionError" in nested_substep.error_info.error_message
+        assert "Traceback (most recent call last)" not in nested_substep.error_info.error_message
         assert nested_substep_2.status == TestStatus.ERROR
         assert "AssertionError" in nested_substep_2.error_info.error_message
         assert sibling_substep.status == TestStatus.PASSED
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index c3b303ac8..09aca5e33 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -14,7 +14,10 @@
 from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
-from sift_client.util.test_results.context_manager import format_truncated_traceback
+from sift_client.util.test_results.context_manager import (
+    format_assertion_message,
+    format_truncated_traceback,
+)
 
 
 class SiftPytestPluginWarning(SiftWarning):
@@ -588,6 +591,7 @@ def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
                 status = TestStatus.FAILED
             elif isinstance(excinfo.value, AssertionError):
                 status = TestStatus.FAILED
+                error_info = format_assertion_message(excinfo.type, excinfo.value)
             elif isinstance(excinfo.value, pytest.fail.Exception):
                 status = TestStatus.FAILED
             elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 3454ef5e2..48a89b2d9 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -54,6 +54,20 @@ def format_truncated_traceback(
     return ErrorInfo(error_code=1, error_message="".join(stack))
 
 
+def format_assertion_message(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+) -> ErrorInfo:
+    """Format an ErrorInfo from just the exception line(s), no traceback frames.
+
+    For assertion failures the rewritten ``assert`` explanation lives on the
+    exception itself, so stack frames add noise without information. Equivalent
+    to pytest's ``excinfo.exconly()``.
+    """
+    lines = traceback.format_exception_only(exc, exc_value)  # type: ignore[arg-type]
+    return ErrorInfo(error_code=1, error_message="".join(lines))
+
+
 def log_replay_instructions(log_file: str | Path | None) -> None:
     """Surface replay instructions when an import/replay attempt fails.
 
@@ -465,15 +479,31 @@ def measurements_passed(self) -> bool:
         """True if every measurement recorded directly on this step has passed.
 
         Counts only ``step.measure``, ``step.measure_avg``, and
-        ``step.measure_all`` calls on this ``NewStep`` instance. Useful for
-        the ``assert step.measurements_passed`` pattern at the end of a test
-        when you want to fail pytest on any out-of-bounds measurement
-        without short-circuiting on the first failure (asserting on
-        individual ``measure(...)`` return values skips every measurement
-        after the failing one).
+        ``step.measure_all`` calls on this ``NewStep`` instance. Pair it with
+        ``fail_if_measurements_failed()`` at the end of a test to fail pytest on
+        any out-of-bounds measurement without short-circuiting on the first
+        failure (asserting on individual ``measure(...)`` return values skips
+        every measurement after the failing one).
         """
         return self._failed_measurement_count == 0
 
+    def fail_if_measurements_failed(
+        self, message: str = "one or more measurements out of bounds"
+    ) -> None:
+        """Fail the pytest test if any measurement on this step was out of bounds.
+
+        Use instead of ``assert step.measurements_passed``: it fails via
+        ``pytest.fail`` so the step resolves to FAILED without attaching an
+        assertion message to ``error_info``. No-op when every measurement
+        passed. Call once at the end of the test so every measurement is still
+        recorded before the failure fires.
+        """
+        if self.measurements_passed:
+            return
+        import pytest
+
+        pytest.fail(message, pytrace=False)
+
     def update_step_from_result(
         self,
         exc: type[Exception] | None,
@@ -501,8 +531,11 @@ def update_step_from_result(
         errored = False
         if exc:
             if isinstance(exc_value, AssertionError) and not self.assertion_as_fail_not_error:
-                # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
+                # pytest-style: an assertion is a plain failure, not an error. Record the
+                # failure and attach the concise assertion message (no traceback) so the
+                # UI can show what was asserted.
                 self.report_context.record_step_outcome(False, current_step)
+                error_info = format_assertion_message(exc, exc_value)
             elif isinstance(exc_value, (KeyboardInterrupt, SystemExit)):
                 # Hard exit propagating through the substep stack: record as
                 # ABORTED so every in-progress step on the way out reflects

From 9f8f09dcf75f06774c995e28edd51ec83c86f6a9 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 13:30:38 -0700
Subject: [PATCH 07/19] Python(chore): pytest docs reorganization (#589)

---
 python/docs/examples/index.md                 |   5 +-
 python/docs/examples/pytest_plugin.md         | 828 +-----------------
 .../docs/examples/pytest_plugin_quickstart.md |   8 +-
 python/docs/guides/index.md                   |  11 +
 .../guides/pytest_plugin/configuration.md     | 220 +++++
 python/docs/guides/pytest_plugin/index.md     | 122 +++
 .../pytest_plugin/pass_fail_behavior.md       |  19 +-
 .../guides/pytest_plugin/report_structure.md  | 421 +++++++++
 .../guides/pytest_plugin/running_modes.md     | 138 +++
 python/mkdocs.yml                             |  11 +-
 10 files changed, 957 insertions(+), 826 deletions(-)
 create mode 100644 python/docs/guides/index.md
 create mode 100644 python/docs/guides/pytest_plugin/configuration.md
 create mode 100644 python/docs/guides/pytest_plugin/index.md
 create mode 100644 python/docs/guides/pytest_plugin/report_structure.md
 create mode 100644 python/docs/guides/pytest_plugin/running_modes.md

diff --git a/python/docs/examples/index.md b/python/docs/examples/index.md
index 936a35cfd..baf2601e5 100644
--- a/python/docs/examples/index.md
+++ b/python/docs/examples/index.md
@@ -6,9 +6,12 @@ This section contains interactive Jupyter notebook examples demonstrating how to
 
 - **[Basic Usage](basic.ipynb)** - Introduction to the Sift Python client, covering basic operations and API usage
 - **[Data Ingestion](ingestion.ipynb)** - Learn how to ingest telemetry data into Sift using various methods
-- **[Pytest Plugin](pytest_plugin.md)** - Turn a pytest run into a Sift TestReport with measurements, nested steps, and pass/fail outcomes
 - **[Pytest Plugin Quickstart](pytest_plugin_quickstart.md)** - Guided tour of the runnable demo project under `python/examples/pytest_plugin/`
 
+For the conceptual reference on the pytest plugin (fixtures, configuration,
+report structure, and pass/fail behavior), see the
+[Pytest Plugin guide](../guides/pytest_plugin/index.md).
+
 ## Running Examples Locally
 
 To run these examples on your local machine:
diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index 69dde25ae..986e05e1e 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -1,822 +1,14 @@
-# Pytest Plugin
-
-The Sift Python client ships a pytest plugin that turns a pytest run into a
-`TestReport` in Sift. Each test function becomes a `TestStep`, measurements
-land as rows under that step, and failures propagate up through nested
-substeps to the report itself.
-
-This page walks through wiring the plugin into a project, the fixtures and
-hooks it provides, and the patterns you'll use day-to-day.
-
-!!! info "Where the plugin lives"
-    The plugin lives at `sift_client.pytest_plugin`. It is
-    **not** registered as a `pytest11` entry point. Projects opt in with a
-    `pytest_plugins` declaration in their top-level `conftest.py`. Pytest
-    then loads the module as a real plugin: the fixtures, CLI options, and
-    `pytest_runtest_makereport` hook all register through standard pytest
-    machinery, so `pytest --trace-config` lists it and
-    `pytest -p no:sift_client.pytest_plugin` disables it.
-
-## Install
-
-```bash
-pip install sift-stack-py pytest python-dotenv
-```
-
-Set the connection details in a `.env` next to your tests:
-
-```bash
-SIFT_API_KEY="your-api-key"
-SIFT_GRPC_URI="..."
-SIFT_REST_URI="..."
-```
-
-The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your Sift organization. You can find these on the Sift Manage page as well as generate an API key.
-
-## Wire the plugin into `conftest.py`
-
-A single `pytest_plugins` declaration in your top-level `conftest.py` is all
-that's required. The plugin ships a default `sift_client` fixture that reads
-`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
-
-```python title="conftest.py"
-from dotenv import load_dotenv
-
-load_dotenv()
-
-pytest_plugins = ["sift_client.pytest_plugin"]
-```
-
-That's the whole setup. Every test in the session will now create a step on a
-single shared `TestReport`.
-
-### Customizing the `SiftClient`
-
-To construct the client differently (custom TLS, timeouts, alternate
-credentials, etc.), override the `sift_client` fixture in your conftest. The
-plugin's default falls away in favor of your definition.
-
-```python title="conftest.py"
-import os
-
-import pytest
-from dotenv import load_dotenv
-
-from sift_client import SiftClient, SiftConnectionConfig
-
-load_dotenv()
-
-pytest_plugins = ["sift_client.pytest_plugin"]
-
-
-@pytest.fixture(scope="session")
-def sift_client() -> SiftClient:
-    return SiftClient(
-        connection_config=SiftConnectionConfig(
-            api_key=os.getenv("SIFT_API_KEY"),
-            grpc_url=os.getenv("SIFT_GRPC_URI"),
-            rest_url=os.getenv("SIFT_REST_URI"),
-            use_ssl=False,
-        )
-    )
-```
-
-## Plugin provided fixtures
-
-| Name | Kind | Scope | Purpose |
-|---|---|---|---|
-| `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
-| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `fail_if_measurements_failed`, and `current_step`. |
-| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently — see [ini options](#ini-options). |
-| `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
-| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
-
-### CLI options
-
-| Flag | Default | Effect |
-|---|---|---|
-| `--sift-offline` | off (online) | Skip the session-start ping and don't contact Sift. All create/update calls go to the JSONL log file for later replay via `import-test-result-log`. Missing `SIFT_*` env vars are tolerated; placeholders are filled. |
-| `--sift-disabled` | off | Skip Sift entirely. Nothing contacts the API and no log file is written; `step.measure(...)` still evaluates bounds and returns a real pass/fail boolean. Also honored via `SIFT_DISABLED=1`. Supersedes every other flag (disabled wins over offline). |
-| `--sift-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. Incompatible with `--sift-offline` since offline mode needs the log file as its sole sink. |
-| `--no-sift-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
-
-These can be passed permanently via `addopts`:
-
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-offline
-```
-
-Or set the matching ini key directly (recommended for stable per-project
-configuration). Each CLI flag has a corresponding key under
-`[tool.pytest.ini_options]` in `pyproject.toml` or `[pytest]` in `pytest.ini`.
-CLI flags, when passed, override the ini values.
-
-| Ini key | Type | Equivalent CLI flag |
-|---|---|---|
-| `sift_log_file` | string (`true` / `false` / `none` / path) | `--sift-log-file=<value>` |
-| `sift_git_metadata` | bool (default `true`) | `--no-sift-git-metadata` (sets to `false`) |
-| `sift_offline` | bool (default `false`) | `--sift-offline` |
-| `sift_disabled` | bool (default `false`) | `--sift-disabled` (also honors `SIFT_DISABLED` env var) |
-| `sift_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
-| `sift_package_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each Python package (directory with `__init__.py`) in the test path. |
-| `sift_module_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each test module (file). |
-| `sift_class_step` | bool (default `true`) | _(ini-only)_ — open a parent step for each test class, including nested classes. |
-| `sift_parametrize_nesting` | bool (default `true`) | _(ini-only)_ — cluster parametrized tests under shared parents (`test_x → axis=value`) instead of flat leaves (`test_x[value]`). |
-
-The default `sift_client` fixture reads its two URIs from environment first
-and falls back to ini keys when the env vars are unset. `SIFT_API_KEY` is
-intentionally env-only — keep it out of source control and supply it through
-`pytest-dotenv` (see [API key handling](#api-key-handling) below). The env
-var wins when both are set, so secrets injected into a CI environment
-continue to override values committed to `pyproject.toml`. There are no CLI
-flags for credentials.
-
-| Ini key | Environment variable | Notes |
-|---|---|---|
-| _(none)_ | `SIFT_API_KEY` | Env-only. Use `.env` + `pytest-dotenv` locally; inject from your secret store in CI. |
-| `sift_grpc_uri` | `SIFT_GRPC_URI` | Stable per-org gRPC endpoint; safe to commit. |
-| `sift_rest_uri` | `SIFT_REST_URI` | Stable per-org REST endpoint; safe to commit. |
-
-```toml title="pyproject.toml"
-[tool.pytest.ini_options]
-sift_offline = true
-sift_git_metadata = false
-sift_grpc_uri = "your-org.sift.example:443"
-sift_rest_uri = "https://your-org.sift.example"
-```
-
-```ini title="pytest.ini"
-[pytest]
-sift_offline = true
-sift_git_metadata = false
-sift_grpc_uri = your-org.sift.example:443
-sift_rest_uri = https://your-org.sift.example
-```
-
-#### API key handling
-
-`SIFT_API_KEY` is deliberately read from the process environment only. The
-recommended workflow uses the
-[`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) plugin (already a
-dependency of `sift-stack-py`), which loads variables from a `.env` file
-into `os.environ` before tests run.
-
-1. Add `.env` to `.gitignore`.
-2. Drop your key into `.env` at the project root:
-
-    ```bash title=".env"
-    SIFT_API_KEY=sk-...your-key...
-    ```
-
-3. In CI, set `SIFT_API_KEY` directly via your provider's secret manager
-   instead of committing a `.env` file.
-
-`pytest-dotenv` picks the file up automatically; no `pytest_configure`
-glue is needed.
-
-!!! warning "FedRAMP / shared environments"
-    Pass `--sift-log-file=false` (or set the ini key to `"false"`)
-    to skip the temp file + worker pipeline. Create/update calls then run
-    inline against the API instead of being deferred through a subprocess.
-
-### Report metadata captured automatically
-
-Every report the plugin creates includes:
-
-- `name` and `test_case`: derived from the first positional argument to `pytest`. When it resolves to an existing path the plugin uses the basename for `name` and the full path string for `test_case`; otherwise both fall back to `pytest <args>`. `name` always has a UTC ISO timestamp appended. See examples below.
-- `test_system_name`: `socket.gethostname()`.
-- `system_operator`: `getpass.getuser()`.
-- `start_time` / `end_time`: set on session enter/exit.
-- `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
-- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-git-metadata` or when not in a git repo.
-
-Example invocations:
-
-| Pytest invocation | Report `name` | Report `test_case` |
-|---|---|---|
-| `pytest tests/test_battery.py` | `test_battery.py 2026-05-04T12:00:00.123456+00:00` | `tests/test_battery.py` |
-| `pytest tests/` | `tests 2026-05-04T12:00:00.123456+00:00` | `tests` |
-| `pytest -k voltage` | `pytest -k voltage 2026-05-04T12:00:00.123456+00:00` | `pytest -k voltage` |
-
-To override defaults (e.g. set a serial number, system operator, or extra
-metadata), call `report_context.report.update({...})` from any test or
-fixture. See [Linking a Run](#linking-a-run-to-the-report) for the same
-pattern applied to `run_id`.
-
-## Controlling which tests produce reports
-
-By default every test in the session produces a Sift step. Two markers
-and one ini key let you narrow that to a specific set of tests, which is
-useful when a repo holds tests that you don't want included in the Sift test report.
-
-| Setting                                                 | Effect                                                                                       |
-|---------------------------------------------------------|----------------------------------------------------------------------------------------------|
-| `sift_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
-| `@pytest.mark.sift_include` on a test, class, or module | Force reporting on for that scope, regardless of the project default.                        |
-| `@pytest.mark.sift_exclude` on a test, class, or module | Force reporting off for that scope, regardless of the project default.                       |
-
-Closest marker determines setting. `sift_exclude` beats `sift_include` when both apply.
-`pytestmark` at the class or module level inherits to every test in scope.
-
-### Bulk-applying a marker to a directory
-
-To opt an entire directory in (or out) without editing each file, hook
-`pytest_collection_modifyitems` in the directory's `conftest.py`:
-
-```python title="tests/example/conftest.py"
-from pathlib import Path
-
-import pytest
-
-_HERE = Path(__file__).parent
-
-
-def pytest_collection_modifyitems(config, items):
-    for item in items:
-        try:
-            item.path.relative_to(_HERE)
-        except ValueError:
-            continue
-        item.add_marker(pytest.mark.sift_include)
-```
-
-This applies `sift_include` to every test collected under `tests/example/`.
-Combine with `sift_autouse = false` in `pyproject.toml` for
-opting in to specific directories. 
-
-`pytest_collection_modifyitems` receives every item in the session, not just
-this directory's, so the `relative_to` filter is what scopes the marker.
-
-## Basic usage
-
-With the conftest in place, the simplest test needs nothing extra. The `step`
-fixture is `autouse=True` and pytest test failures and skips are mapped to
-step statuses automatically.
-
-```python title="test_basic.py"
-def test_no_fixtures_still_creates_a_step():
-    """Autouse `step` records this function as a step on the session report."""
-    assert 1 + 1 == 2
-
-
-def test_measure_a_single_value(step):
-    """Take `step` explicitly when you want to record a measurement."""
-    voltage = 4.97
-    step.measure(
-        name="battery_voltage",
-        value=voltage,
-        bounds={"min": 4.8, "max": 5.2},
-        unit="V",
-    )
-    # An out-of-bounds measurement already marks the step FAILED. Call this at
-    # the end to also fail pytest, without an assertion message in error_info.
-    step.fail_if_measurements_failed()
-
-
-def test_measure_strings_and_booleans(step):
-    """`bounds` accepts a string or `True`/`False` for non-numeric values."""
-    step.measure(name="firmware_version", value="1.4.2", bounds="1.4.2")
-    step.measure(name="self_test_passed", value=True, bounds=True)
-
-
-def test_docstring_becomes_step_description(step):
-    """This docstring is the step's description in Sift.
-
-    The plugin pulls `request.node.obj.__doc__` when it creates the step.
-    Helper functions called from within the test do not get this treatment;
-    pass `description="..."` explicitly on `substep(...)` instead.
-    """
-    assert step.current_step.description is not None
-```
-
-!!! tip "Measurements never raise"
-    `step.measure(...)` returns `True` if the value is in bounds and `False`
-    otherwise. A `False` result marks the enclosing step as failed but does
-    not raise. Chain measurements freely and inspect the boolean if you need
-    custom flow control.
-
-### Status semantics for failures
-
-The plugin uses the step exit handler in `NewStep.__exit__` to translate test
-outcomes into `TestStatus`:
-
-| Outcome | Resulting `TestStatus` |
-|---|---|
-| In-bounds measurements only | `PASSED` |
-| Failed measurement, failed `report_outcome`, failed substep, or `AssertionError` raised by the test | `FAILED` (no traceback is attached, since pytest already prints it in the runner output) |
-| Non-`AssertionError` exception escapes the test (e.g. `ValueError`, `TimeoutError`) | `ERROR`, with the formatted traceback (last 10 frames plus the first frame) on `step.error_info.error_message` |
-| Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
-
-For the full contract, including skips, xfail/xpass, hard exits (`SystemExit`,
-`KeyboardInterrupt`), setup/teardown phase failures, and propagation rules,
-see the [Pass/Fail Behavior guide](../guides/pytest_plugin/pass_fail_behavior.md).
-
-A failure or error at any depth propagates upward: the parent substep, the
-function step, the class/module/package steps above it, and the session
-report all get marked failed.
-
-## Nested steps
-
-Use `step.substep(name=...)` to open a child step. Substeps nest arbitrarily
-deep, and a failure at any depth propagates up to fail the parent and the
-report.
-
-```python title="test_nested_steps.py"
-import time
-
-
-def test_phased_check(step):
-    """Phase a single test into setup/exercise/verify substeps."""
-    with step.substep(name="setup", description="Power on and wait for boot") as setup:
-        setup.measure(name="boot_time_s", value=2.1, bounds={"max": 5.0}, unit="s")
-
-    with step.substep(name="exercise", description="Drive the test sequence"):
-        time.sleep(0.01)
-
-    with step.substep(name="verify", description="Read final state") as verify:
-        verify.measure(name="final_state", value="IDLE", bounds="IDLE")
-
-
-def test_deeply_nested(step):
-    """A failure at the bottom fails everyone above it."""
-    with step.substep(name="level_1") as l1:
-        with l1.substep(name="level_2") as l2:
-            with l2.substep(name="level_3") as l3:
-                l3.measure(name="leaf_value", value=42, bounds={"min": 0, "max": 100})
-```
-
-Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …)
-assigned by `ReportContext`. Sibling substeps within the same parent
-auto-increment; opening a new top-level step starts a new branch.
-
-### Mirroring the test layout
-
-The plugin opens a parent step for each Python package (`__init__.py`
-directory), test file, and test class above every test, plus a parent step
-for each `@pytest.mark.parametrize` axis. Every layer is on by default and
-individually opt-out via ini flags (`sift_package_step`, `sift_module_step`,
-`sift_class_step`, `sift_parametrize_nesting`). Class/module/package
-docstrings become the matching step's description.
-
-### Linking a Run to the report
-
-`report_context` is the session-scoped fixture; mutating it in one test
-affects the whole report.
-
-```python
-def test_link_run_to_report(report_context, sift_client):
-    run = sift_client.runs.create(...)  # however you create your run
-    report_context.report.update({"run_id": run.id_})
-```
-
-The same `update({...})` pattern works for any field on `TestReportUpdate`,
-including `serial_number`, `part_number`, `system_operator`, and `metadata`.
-
-## How pytest layout maps to a Sift report
-
-The plugin builds the report tree by hooking pytest's collection: every test
-node it sees becomes a step. What you control is which constructs create
-nodes and where you nest substeps inside them. Common layouts and the
-resulting report trees:
-
-### Flat module of test functions
-
-The default. Each function is one step directly under the report.
-
-```python title="test_battery.py"
-def test_voltage(step): ...
-def test_current(step): ...
-def test_temperature(step): ...
-```
+---
+hide:
+  - navigation
+  - toc
+---
 
-```text title="Sift report"
-TestReport
-├── test_voltage
-├── test_current
-└── test_temperature
-```
+<meta http-equiv="refresh" content="0; url=../../guides/pytest_plugin/">
 
-### Modules nested under a package
-
-Two test files under the same Python package (directory with `__init__.py`)
-share that package step as their parent.
-
-```python title="suites/__init__.py"
-```
-
-```python title="suites/test_battery.py"
-def test_voltage(step): ...
-def test_current(step): ...
-```
-
-```python title="suites/test_thermal.py"
-def test_idle_temp(step): ...
-def test_load_temp(step): ...
-```
-
-```text title="Sift report"
-TestReport
-└── suites
-    ├── test_battery.py
-    │   ├── test_voltage
-    │   └── test_current
-    └── test_thermal.py
-        ├── test_idle_temp
-        └── test_load_temp
-```
-
-### Test classes (and nested classes)
-
-`class TestFoo:` and `class TestOuter: class TestInner:` produce class and
-nested class steps automatically — no manual fixture needed.
-
-```python title="test_charging.py"
-class TestCharging:
-    """Charging subsystem."""
-
-    def test_starts_at_zero(self, step): ...
-    def test_reaches_full(self, step): ...
-    def test_thermal_throttle(self, step): ...
-```
-
-```text title="Sift report"
-TestReport
-└── test_charging.py
-    └── TestCharging
-        ├── test_starts_at_zero
-        ├── test_reaches_full
-        └── test_thermal_throttle
-```
-
-The class's docstring becomes the step description.
-
-### Parametrized tests
-
-Parametrized tests cluster under a parent step named after the test function,
-with one inner parent per parametrize axis (outer-to-inner in
-decorator-on-page order). Stacked parametrize produces nested step levels.
-
-```python
-@pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
-def test_rail(step, voltage):
-    step.measure(name="rail_v", value=voltage, bounds={"min": 0.0})
-```
-
-```text title="Sift report"
-TestReport
-└── test_module.py
-    └── test_rail
-        ├── voltage=3.3
-        ├── voltage=5.0
-        └── voltage=12.0
-```
-
-Stacked parametrize:
-
-```python
-@pytest.mark.parametrize("voltage", ["high", "low"])
-@pytest.mark.parametrize("component", ["motor", "valve"])
-def test_iso(step, voltage, component): ...
-```
-
-```text title="Sift report"
-TestReport
-└── test_module.py
-    └── test_iso
-        ├── voltage='high'
-        │   ├── component='motor'
-        │   └── component='valve'
-        └── voltage='low'
-            ├── component='motor'
-            └── component='valve'
-```
-
-Set `sift_parametrize_nesting = false` in `pytest.ini` to fall back to flat
-leaf names (`test_rail[3.3]`).
-
-### Helper functions
-
-Helpers called from a test do not auto-create a step. The plugin only sees
-pytest-collected nodes. To represent helper work in the report, open a
-substep at the call site and pass it into the helper:
-
-```python
-def measure_rail(step, name, value, bounds):
-    return step.measure(name=name, value=value, bounds=bounds, unit="V")
-
-
-def test_power_rails(step):
-    with step.substep(name="3.3V rail") as rail_3v3:
-        measure_rail(rail_3v3, "rail_v", 3.31, {"min": 3.2, "max": 3.4})
-
-    with step.substep(name="5V rail") as rail_5v:
-        measure_rail(rail_5v, "rail_v", 5.02, {"min": 4.9, "max": 5.1})
-```
-
-```text title="Sift report"
-TestReport
-└── test_power_rails
-    ├── 3.3V rail
-    │   └── rail_v        (measurement)
-    └── 5V rail
-        └── rail_v        (measurement)
-```
-
-!!! tip "Docstring-as-description is top-level only"
-    The plugin reads the test function's docstring and uses it as the step
-    description. Docstrings on helper functions are not picked up. Pass
-    `description="..."` explicitly on `substep(...)` if you want one.
-
-### Fixtures that contribute steps
-
-A fixture can open its own substep around setup/teardown by using `step` (for
-function-scope) or `report_context.new_step(...)` (for any scope). The substep
-ends when the fixture's `yield` returns, which makes the report tree mirror
-the lifecycle.
-
-```python
-@pytest.fixture
-def warmed_up_dut(step):
-    with step.substep(name="warmup", description="Bring DUT to operating temp"):
-        # ... do warmup work ...
-        yield "dut-handle"
-
-
-def test_steady_state(step, warmed_up_dut):
-    step.measure(name="temp_c", value=37.2, bounds={"min": 35.0, "max": 40.0})
-```
-
-```text title="Sift report"
-TestReport
-└── test_steady_state
-    ├── warmup        (from fixture)
-    └── temp_c        (measurement)
-```
-
-## Measurement variants
-
-`step.measure(...)` records exactly one measurement. For datasets coming off a
-sensor or calculated channel, use one of the bulk variants.
-
-### `measure_avg`: one row, the mean
-
-`measure_avg` accepts a Python list, a NumPy array, or a pandas `Series`,
-takes the mean, and evaluates it against bounds.
-
-```python
-import numpy as np
-import pandas as pd
-
-
-def test_avg_with_list(step):
-    samples = [4.97, 5.01, 5.03, 4.99, 5.02]
-    step.measure_avg(
-        name="bus_voltage_avg",
-        values=samples,
-        bounds={"min": 4.9, "max": 5.1},
-        unit="V",
-    )
-
-
-def test_avg_with_numpy(step):
-    samples = np.linspace(99.5, 100.5, num=50)
-    step.measure_avg(
-        name="cpu_temp_avg",
-        values=samples,
-        bounds={"min": 95.0, "max": 105.0},
-        unit="C",
-    )
-
-
-def test_avg_with_pandas(step):
-    series = pd.Series([0.998, 1.001, 0.999, 1.002, 1.000])
-    step.measure_avg(
-        name="reference_clock_ratio",
-        values=series,
-        bounds={"min": 0.99, "max": 1.01},
-    )
-```
-
-### `measure_all`: only out-of-bounds rows
-
-Records measurements only for samples that fail bounds, so an all-pass
-dataset of N samples doesn't add N rows to the report. Returns `True` when
-every sample is in bounds.
-
-```python
-def test_only_outliers_recorded(step):
-    samples = [10.1, 10.2, 10.3, 99.9, 10.0, 10.1]  # 99.9 is the outlier
-    all_in_bounds = step.measure_all(
-        name="pressure_psi",
-        values=samples,
-        bounds={"min": 9.0, "max": 11.0},
-        unit="psi",
-    )
-    # Returns False because 99.9 is out of bounds. The step is already
-    # marked failed; call this only if you also want pytest to fail.
-    step.fail_if_measurements_failed()
-```
-
-!!! note "`measure_all` requires at least one bound"
-    Passing `bounds={}` raises `ValueError("No bounds provided")`. At
-    least one of `min` or `max` must be set.
-
-### `report_outcome`: externally computed pass/fail
-
-When the decision is computed elsewhere, drop it onto the report as a
-named substep with an optional reason. Returns the result you passed in,
-so you can use it inline.
-
-```python
-def test_external_checks(step):
-    step.report_outcome(
-        name="config_loaded",
-        result=True,
-        reason="loaded /etc/dut/config.yaml",
-    )
-
-    # Failures show up as a failed substep without raising.
-    rare_warning_seen = False
-    step.report_outcome(
-        name="no_rare_warning",
-        result=not rare_warning_seen,
-        reason="grep'd dmesg for the known-flaky warning",
-    )
-```
-
-### Bounds reference
-
-| Pass to `bounds=` | Value type | Effect |
-|---|---|---|
-| `{"min": x, "max": y}` (either key optional) | `int` / `float` | Numeric window. One-sided is fine. |
-| `NumericBounds(min=x, max=y)` | `int` / `float` | Same as the dict form, explicit. |
-| `"expected-string"` | `str` (or `bool`) | Exact equality. For `bool` values, compares lowercased string (`"true"`/`"false"`). |
-| `True` or `False` | `bool` (or `str`) | Exact equality. For `str` values, compares lowercased strings. |
-| `None` | any | Records the value but does not evaluate it; measurement is recorded as `passed=True`. |
-
-The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
-
-## Skip handling
-
-- `@pytest.mark.skip` and `@pytest.mark.skipif`: the plugin's
-  `pytest_runtest_makereport` hook sees the skipped outcome and creates a
-  step with `TestStatus.SKIPPED`.
-- Inside a test function, you can mark just one substep as skipped without
-  aborting the whole test:
-
-  ```python
-  from sift_client.sift_types.test_report import TestStatus
-
-
-  def test_runtime_skip(step):
-      with step.substep(name="optional_calibration") as cal:
-          if not precondition_met():
-              cal.current_step.update({"status": TestStatus.SKIPPED})
-  ```
-
-  A manually-resolved status is honored by the step's exit handler. No
-  further bookkeeping required. `SKIPPED` does not propagate as a failure.
-
-## Running the suite
-
-```bash
-# Full run against your Sift tenant
-pytest
-
-# Pin the log file so you can replay it later if the import worker dies
-pytest --sift-log-file=./sift-results.jsonl
-```
-
-See [Running modes](#running-modes) for the offline and disabled flags
-that let the same suite run without (or without contacting) Sift.
-
-## Running modes
-
-The plugin runs in one of three modes, picked at invocation:
-
-| Mode | Flag | Network | Log file | `step.measure(...)` | When to use |
-|---|---|---|---|---|---|
-| Online (default) | _(none)_ | yes (pings at session start, aborts if it fails) | optional write-through backup | real measurement against Sift | CI with Sift credentials, local dev hitting your tenant |
-| Offline | `--sift-offline` | none | required (the sole sink) | real measurement queued to log | field tests, air-gapped labs, CI without network |
-| Disabled | `--sift-disabled` | none | none | bounds eval; returns a real bool | local dev or CI that doesn't have (or want) Sift |
-
-Pass both flags? Disabled wins. It's the "skip Sift entirely" hammer and
-supersedes everything else.
-
-### Online mode (default)
-
-`report_context` resolves `client_has_connection` at session start. The
-default implementation calls `sift_client.ping.ping()`. A failed ping
-aborts the whole session with `pytest.UsageError` and points at
-`--sift-offline` and `--sift-disabled` as escape hatches.
-
-This is loud on purpose. A CI run that silently no-ops on a flaky network
-won't get noticed until somebody goes looking for the report, which is
-usually weeks later, which is usually too late.
-
-With the default `--sift-log-file` setting on, create/update calls are
-written to a JSONL log file during the run and an
-`import-test-result-log --incremental` worker replays them against Sift
-in the background. If the worker crashes mid-session (connection failure,
-API error) or is still draining its backlog at session end, the failure
-is logged at session end with a `replay-test-result-log` command for
-manual recovery — test outcomes are unaffected and the local log file is
-preserved. Pass `--sift-log-file=false` to make every create/update
-synchronous against the API instead.
-
-#### Overriding the connection check
-
-Override `client_has_connection` when ping isn't the right signal, for
-example a token cache that's only warm when authenticated:
-
-```python title="conftest.py"
-from pathlib import Path
-
-import pytest
-
-
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client) -> bool:
-    return Path("~/.sift-token-cache").expanduser().is_file()
-```
-
-The override is ignored under `--sift-offline` and `--sift-disabled`.
-
-### Offline mode (`--sift-offline`)
-
-Same fixtures, same `step.measure(...)` semantics as online. The
-difference is where the writes go: every create/update lands in a JSONL
-log file instead of hitting the Sift API. The session-start ping is
-skipped, missing `SIFT_*` env vars are tolerated (placeholders are
-filled), and the replay worker (`import-test-result-log --incremental`)
-does not get spawned at session end.
-
-```bash
-pytest --sift-offline --sift-log-file=./run.jsonl
-```
-
-Once you have connectivity, replay it:
-
-```bash
-import-test-result-log ./run.jsonl
-```
-
-That replay creates the report, steps, and measurements against Sift.
-See [Replaying a saved log file](#replaying-a-saved-log-file) for cleanup
-and the incremental flag.
-
-`--sift-log-file=none` is rejected when offline is set. The
-log file is the only sink in offline mode, so without it the results are
-gone.
-
-!!! warning "Pin the log path"
-    Without `--sift-log-file=<path>`, offline mode writes to
-    a `tempfile.NamedTemporaryFile` and only surfaces the path via a
-    `logger.info` line. Pin a known path when you intend to replay later.
-
-### Disabled mode (`--sift-disabled`)
-
-The plugin stays loaded with the same fixtures and markers as the other
-modes. Nothing contacts Sift, no log file is written, and no `SIFT_*`
-env vars are required. `step.measure(...)`, `step.measure_avg(...)`,
-`step.measure_all(...)`, `step.substep(...)`, and
-`report_context.report.update({...})` all behave normally — bounds
-evaluate and you get a real pass/fail boolean back.
-
-Entities returned in disabled mode report `is_simulated == True` (on
-`TestReport`, `TestStep`, `TestMeasurement`, and `ReportContext`) so
-consumers and tests can branch on provenance. Offline-mode entities
-also report `is_simulated == True`.
-
-How to turn it on, in the order most projects pick:
-
-```bash
-# In an .envrc, devcontainer, or CI job config
-export SIFT_DISABLED=1
-
-# Per-invocation kill-switch
-pytest --sift-disabled
-
-# Per-project default (uncommon; online is usually the right default)
-# pyproject.toml:
-#   [tool.pytest.ini_options]
-#   sift_disabled = true
-```
-
-Good fit for local dev without Sift credentials. Also for library
-consumers who don't have a Sift tenant. Also useful in CI for runs that
-shouldn't add noise to the report stream, like a PR job re-running the
-same suite five times in a row.
-
-## Replaying a saved log file
-
-When the worker doesn't finish cleanly the plugin will print a hint mentioning
-`import-test-result-log`. To import:
+# Pytest Plugin
 
-```bash
-import-test-result-log <path-to-log.jsonl>
-```
+This page has moved to the [Pytest Plugin guide](../guides/pytest_plugin/index.md).
 
-That replays the saved JSONL log as a single batch (no `--incremental`) and
-deletes the file when it lives under the system temp dir.
\ No newline at end of file
+You should be redirected automatically. If your browser does not redirect,
+follow the link above.
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
index bd8414aa7..b30f282c6 100644
--- a/python/docs/examples/pytest_plugin_quickstart.md
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -8,8 +8,8 @@ axes, manual substeps, and gate markers. It also includes a tests directory
 that uses no Sift APIs at all, to show how the autouse fixtures capture plain
 pytest tests for free.
 
-For a conceptual reference (fixtures, ini flags, status semantics), see
-[Pytest Plugin](pytest_plugin.md).
+For a conceptual reference (fixtures, ini flags, status semantics), see the
+[Pytest Plugin guide](../guides/pytest_plugin/index.md).
 
 ## Project layout
 
@@ -172,7 +172,7 @@ Flip any of the `sift_*_step` / `sift_parametrize_nesting` flags in
 
 ## Next steps
 
-- [Pytest Plugin](pytest_plugin.md): conceptual reference covering fixtures,
-  ini flags, status semantics, and layout-mapping examples.
+- [Pytest Plugin guide](../guides/pytest_plugin/index.md): conceptual reference
+  covering fixtures, configuration, report structure, and pass/fail behavior.
 - The demo's [README](https://github.com/sift-stack/sift/blob/main/python/examples/pytest_plugin/README.md)
   on GitHub mirrors this page and is the canonical source.
diff --git a/python/docs/guides/index.md b/python/docs/guides/index.md
new file mode 100644
index 000000000..105f0bb25
--- /dev/null
+++ b/python/docs/guides/index.md
@@ -0,0 +1,11 @@
+# Guides
+
+Conceptual references for the Sift Python client. Guides explain how a feature
+works and how to configure it. For runnable, end-to-end walkthroughs see the
+[Examples](../examples/index.md) section.
+
+## Available guides
+
+- [Pytest Plugin](pytest_plugin/index.md): turn a pytest run into a `TestReport`
+  in Sift. Each test becomes a `TestStep`, measurements are recorded as rows, and
+  failures propagate up through nested substeps to the report.
diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
new file mode 100644
index 000000000..6ed78f931
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -0,0 +1,220 @@
+# Configuration & Defaults
+
+This page is the full reference for everything the plugin exposes: fixtures, CLI
+flags, ini options, credential handling, and the markers that control which
+tests report.
+
+!!! info "Where the plugin lives"
+    The plugin lives at `sift_client.pytest_plugin`. It is **not** registered as
+    a `pytest11` entry point. Projects opt in with a `pytest_plugins` declaration
+    in their top-level `conftest.py`. Pytest then loads the module as a real
+    plugin: the fixtures, CLI options, and `pytest_runtest_makereport` hook all
+    register through standard pytest machinery, so `pytest --trace-config` lists
+    it and `pytest -p no:sift_client.pytest_plugin` disables it.
+
+## Credentials
+
+Set the connection details in a `.env` next to your tests:
+
+```bash
+SIFT_API_KEY="your-api-key"
+SIFT_GRPC_URI="..."
+SIFT_REST_URI="..."
+```
+
+The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your
+Sift organization. You can find both on the Sift Manage page, where you can
+also generate an API key.
+
+The default `sift_client` fixture reads its two URIs from the environment first
+and falls back to the ini keys when the env vars are unset. `SIFT_API_KEY` is
+intentionally env-only, so keep it out of source control and supply it through
+`pytest-dotenv` (see [API key handling](#api-key-handling) below). For the URIs,
+the env var wins when both it and the ini key are set, so values injected into a
+CI environment continue to override values committed to `pyproject.toml`. There
+are no CLI flags for credentials.
+
+| Ini key | Environment variable | Notes |
+|---|---|---|
+| _(none)_ | `SIFT_API_KEY` | Env-only. Use `.env` + `pytest-dotenv` locally; inject from your secret store in CI. |
+| `sift_grpc_uri` | `SIFT_GRPC_URI` | Stable per-org gRPC endpoint; safe to commit. |
+| `sift_rest_uri` | `SIFT_REST_URI` | Stable per-org REST endpoint; safe to commit. |
+
+### API key handling
+
+`SIFT_API_KEY` is deliberately read from the process environment only. The
+recommended workflow uses the
+[`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) plugin (already a
+dependency of `sift-stack-py`), which loads variables from a `.env` file into
+`os.environ` before tests run.
+
+1. Add `.env` to `.gitignore`.
+2. Drop your key into `.env` at the project root:
+
+    ```bash title=".env"
+    SIFT_API_KEY=sk-...your-key...
+    ```
+
+3. In CI, set `SIFT_API_KEY` directly via your provider's secret manager
+   instead of committing a `.env` file.
+
+`pytest-dotenv` picks the file up automatically; no `pytest_configure` glue is
+needed.
+
+!!! warning "FedRAMP / shared environments"
+    Pass `--sift-log-file=false` (or set the ini key to `"false"`) to skip the
+    temp file + worker pipeline. Create/update calls then run inline against the
+    API instead of being deferred through a subprocess.
+
+## Wire the plugin into `conftest.py`
+
+A single `pytest_plugins` declaration in your top-level `conftest.py` is all
+that's required. The plugin ships a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+
+```python title="conftest.py"
+from dotenv import load_dotenv
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
+
+That's the whole setup. Every test in the session will now create a step on a
+single shared `TestReport`.
+
+### Customizing the `SiftClient`
+
+To construct the client differently (custom TLS, timeouts, alternate
+credentials, etc.), override the `sift_client` fixture in your conftest. The
+plugin's default falls away in favor of your definition.
+
+```python title="conftest.py"
+import os
+
+import pytest
+from dotenv import load_dotenv
+
+from sift_client import SiftClient, SiftConnectionConfig
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+@pytest.fixture(scope="session")
+def sift_client() -> SiftClient:
+    return SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key=os.getenv("SIFT_API_KEY"),
+            grpc_url=os.getenv("SIFT_GRPC_URI"),
+            rest_url=os.getenv("SIFT_REST_URI"),
+            use_ssl=False,
+        )
+    )
+```
+
+## Plugin-provided fixtures
+
+| Name | Kind | Scope | Purpose |
+|---|---|---|---|
+| `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
+| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `fail_if_measurements_failed`, and `current_step`. |
+| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently; see [ini options](#ini-options). |
+| `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
+| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
+
+## CLI options
+
+| Flag | Default | Effect |
+|---|---|---|
+| `--sift-offline` | off (online) | Skip the session-start ping and don't contact Sift. All create/update calls go to the JSONL log file for later replay via `import-test-result-log`. Missing `SIFT_*` env vars are tolerated; placeholders are filled. |
+| `--sift-disabled` | off | Skip Sift entirely. Nothing contacts the API and no log file is written; `step.measure(...)` still evaluates bounds and returns a real pass/fail boolean. Also honored via `SIFT_DISABLED=1`. Supersedes every other flag (disabled wins over offline). |
+| `--sift-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. `false` is incompatible with `--sift-offline`, since offline mode needs the log file as its sole sink. |
+| `--no-sift-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
+
+These can be passed permanently via `addopts`:
+
+```ini title="pytest.ini"
+[pytest]
+addopts = --sift-offline
+```
+
+## Ini options
+
+Set the matching ini key directly (recommended for stable per-project
+configuration). Each CLI flag has a corresponding key under
+`[tool.pytest.ini_options]` in `pyproject.toml` or `[pytest]` in `pytest.ini`.
+CLI flags, when passed, override the ini values.
+
+| Ini key | Type | Equivalent CLI flag |
+|---|---|---|
+| `sift_log_file` | string (`true` / `false` / `none` / path) | `--sift-log-file=<value>` |
+| `sift_git_metadata` | bool (default `true`) | `--no-sift-git-metadata` (sets to `false`) |
+| `sift_offline` | bool (default `false`) | `--sift-offline` |
+| `sift_disabled` | bool (default `false`) | `--sift-disabled` (also honors `SIFT_DISABLED` env var) |
+| `sift_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
+| `sift_package_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each Python package (directory with `__init__.py`) in the test path. |
+| `sift_module_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each test module (file). |
+| `sift_class_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each test class, including nested classes. |
+| `sift_parametrize_nesting` | bool (default `true`) | _(ini-only)_. Clusters parametrized tests under shared parents (`test_x`, `axis=value`) instead of flat leaves (`test_x[value]`). |
+
+```toml title="pyproject.toml"
+[tool.pytest.ini_options]
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
+```
+
+```ini title="pytest.ini"
+[pytest]
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = your-org.sift.example:443
+sift_rest_uri = https://your-org.sift.example
+```
+
+## Controlling which tests produce reports
+
+By default every test in the session produces a Sift step. Two markers and one
+ini key let you narrow that to a specific set of tests, which is useful when a
+repo holds tests that you don't want included in the Sift test report.
+
+| Setting                                                 | Effect                                                                                       |
+|---------------------------------------------------------|----------------------------------------------------------------------------------------------|
+| `sift_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
+| `@pytest.mark.sift_include` on a test, class, or module | Force reporting on for that scope, regardless of the project default.                        |
+| `@pytest.mark.sift_exclude` on a test, class, or module | Force reporting off for that scope, regardless of the project default.                       |
+
+The closest marker determines the setting. `sift_exclude` beats `sift_include` when both apply.
+A `pytestmark` at the class or module level is inherited by every test in scope.
+
+### Bulk-applying a marker to a directory
+
+To opt an entire directory in (or out) without editing each file, hook
+`pytest_collection_modifyitems` in the directory's `conftest.py`:
+
+```python title="tests/example/conftest.py"
+from pathlib import Path
+
+import pytest
+
+_HERE = Path(__file__).parent
+
+
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
+```
+
+This applies `sift_include` to every test collected under `tests/example/`.
+Combine it with `sift_autouse = false` in `pyproject.toml` to opt in only
+specific directories.
+
+`pytest_collection_modifyitems` receives every item in the session, not just
+this directory's, so the `relative_to` filter is what scopes the marker.
diff --git a/python/docs/guides/pytest_plugin/index.md b/python/docs/guides/pytest_plugin/index.md
new file mode 100644
index 000000000..9344885b3
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/index.md
@@ -0,0 +1,122 @@
+# Pytest Plugin
+
+The Sift Python client ships a pytest plugin that turns a pytest run into a
+`TestReport` in Sift. Each test function becomes a `TestStep`, measurements are presented
+as rows under that step, and failures propagate up through nested substeps to
+the report itself.
+
+## Quick start
+
+Install the client and pytest:
+
+```bash
+pip install sift-stack-py pytest python-dotenv
+```
+
+Set your connection details in a `.env` next to your tests:
+
+```bash title=".env"
+SIFT_API_KEY="..."
+SIFT_GRPC_URI="..."
+SIFT_REST_URI="..."
+```
+
+Find these on the Sift Manage page, where you can also generate an API key.
+
+Register the plugin with a single `pytest_plugins` declaration in your top-level
+`conftest.py`:
+
+```python title="conftest.py"
+from dotenv import load_dotenv
+
+load_dotenv()
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
+
+Write a test. The `step` fixture is `autouse`, so any test becomes a step on the
+report. Take it as an argument when you want to record a measurement:
+
+```python title="test_battery.py"
+def test_battery_voltage(step):
+    step.measure(
+        name="battery_voltage",
+        value=4.97,
+        bounds={"min": 4.8, "max": 5.2},
+        unit="V",
+    )
+    step.fail_if_measurements_failed()
+```
+
+Run it:
+
+```bash
+pytest
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+!!! tip "Fail at the end, not per measurement"
+    `step.measure(...)` returns a pass/fail boolean and never raises, so a
+    failing measurement marks the step failed without aborting the test. Take
+    every measurement first, then call `step.fail_if_measurements_failed()` once
+    at the end, so every measurement still lands in the report even when one
+    fails. It fails the test via `pytest.fail` (no assertion noise in
+    `error_info`), and unlike asserting on an individual `step.measure(...)` call
+    it does not short-circuit on the first failure and skip every measurement
+    after it.
+
+## Sensible defaults
+
+With nothing but the `conftest.py` above, you get:
+
+- **Full step tree.** Every Python package, test module, test class, and
+  parametrize axis above a test becomes a parent step, so the report mirrors
+  your test layout.
+- **Online mode.** The plugin pings Sift at session start and streams
+  create/update calls to your tenant during the run.
+- **Git metadata.** Repo, branch, and commit are captured on the report
+  automatically.
+
+Everything is on by default and individually overridable. See
+[Configuration & Defaults](configuration.md) for the full audit of every knob,
+marker, flag, and fixture.
+
+## Running modes
+
+The plugin runs in one of three modes, picked at invocation.
+
+| Mode | How to select | Contacts Sift | When to use                                                   |
+|---|---|---|---------------------------------------------------------------|
+| **Online** | default (no flag) | Yes, during the run | Default choice                                                |
+| **Offline** | `--sift-offline` | No; records to a log file for later replay | Environments without Sift access.                             |
+| **Disabled** | `--sift-disabled` | No | Local dev. Bounds still evaluate and return a real pass/fail. |
+
+Online mode pings Sift once at session start and aborts if Sift is unreachable or the credentials are invalid,
+so a misconfigured job fails immediately instead of silently producing no report.
+During the run, every create and update is appended to a JSONL log file.
+A background worker uploads new entries to Sift incrementally.
+If the connection drops mid-test, the test keeps running and the log keeps being written locally.
+The remaining entries can be uploaded afterward by running the `import-test-result-log` command that the plugin prints on exit.
+
+See [Running Modes](running_modes.md) for the log-file and replay pipeline,
+overriding the connection check, and replaying a saved log.
+
+## Report structure
+
+The report tree mirrors your test layout: packages, modules, classes, and
+parametrize axes nest automatically, and you can open arbitrary substeps inside
+a test. See [Report Structure](report_structure.md) for the layout-to-tree
+mapping, measurement variants, and report metadata.
+
+## Pass/fail outcomes
+
+Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard
+exit) maps to a `TestStatus`, and failures roll up to the parent steps and the
+report. See [Pass/Fail Behavior](pass_fail_behavior.md).
+
+## Try the runnable demo
+
+The [Pytest Plugin Quickstart](../../examples/pytest_plugin_quickstart.md) walks
+through a self-contained demo project that exercises every layer of the step
+tree, with instructions to run it with or without a Sift tenant.
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
index 5c0f178c2..2ce3d0697 100644
--- a/python/docs/guides/pytest_plugin/pass_fail_behavior.md
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -29,7 +29,8 @@ The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
 An assertion failure records the concise assertion message (the exception
 line(s), no traceback frames) on `step.error_info.error_message` while still
 mapping to `FAILED`. A non-assertion exception gets its formatted traceback
-recorded on `step.error_info.error_message`.
+(the last 10 frames plus the first frame) recorded on
+`step.error_info.error_message`.
 
 ## Hard exits
 
@@ -76,6 +77,22 @@ itself) as `ABORTED`.
 `SKIPPED` does not propagate as a failure. A skipped substep or test does
 not block its parent from resolving to `PASSED`.
 
+Inside a test function, you can mark just one substep as skipped without
+aborting the whole test:
+
+```python
+from sift_client.sift_types.test_report import TestStatus
+
+
+def test_runtime_skip(step):
+    with step.substep(name="optional_calibration") as cal:
+        if not precondition_met():
+            cal.current_step.update({"status": TestStatus.SKIPPED})
+```
+
+A manually resolved status is honored by the step's exit handler. No further
+bookkeeping is required.
+
 ## Expected failures (xfail / xpass)
 
 xfail marks declare that a test is expected to fail. The plugin follows
diff --git a/python/docs/guides/pytest_plugin/report_structure.md b/python/docs/guides/pytest_plugin/report_structure.md
new file mode 100644
index 000000000..811fd7cf0
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/report_structure.md
@@ -0,0 +1,421 @@
+# Report Structure
+
+The report tree mirrors your test layout. Every Python package, test module,
+test class, and parametrize axis above a test becomes a parent step, and you can
+open arbitrary substeps inside a test. This page covers the layout-to-tree
+mapping, the measurement variants you record into it, and the metadata the
+plugin captures for you.
+
+## Recording measurements
+
+With the conftest in place, the simplest test needs nothing extra. The `step`
+fixture is `autouse=True`, and pytest failures and skips are mapped to step
+statuses automatically.
+
+```python title="test_basic.py"
+def test_no_fixtures_still_creates_a_step():
+    """Autouse `step` records this function as a step on the session report."""
+    assert 1 + 1 == 2
+
+
+def test_measure_a_single_value(step):
+    """Take `step` explicitly when you want to record a measurement."""
+    voltage = 4.97
+    step.measure(
+        name="battery_voltage",
+        value=voltage,
+        bounds={"min": 4.8, "max": 5.2},
+        unit="V",
+    )
+    # An out-of-bounds measurement already marks the step FAILED. Call this at
+    # the end to also fail pytest, without an assertion message in error_info.
+    step.fail_if_measurements_failed()
+
+
+def test_measure_strings_and_booleans(step):
+    """`bounds` accepts a string or `True`/`False` for non-numeric values."""
+    step.measure(name="firmware_version", value="1.4.2", bounds="1.4.2")
+    step.measure(name="self_test_passed", value=True, bounds=True)
+
+
+def test_docstring_becomes_step_description(step):
+    """This docstring is the step's description in Sift.
+
+    The plugin pulls `request.node.obj.__doc__` when it creates the step.
+    Helper functions called from within the test do not get this treatment;
+    pass `description="..."` explicitly on `substep(...)` instead.
+    """
+    assert step.current_step.description is not None
+```
+
+!!! tip "Measurements never raise"
+    `step.measure(...)` returns `True` if the value is in bounds and `False`
+    otherwise. A `False` result marks the enclosing step as failed but does not
+    raise. Chain measurements freely and inspect the boolean if you need custom
+    flow control. For how outcomes map to `TestStatus` and propagate upward, see
+    [Pass/Fail Behavior](pass_fail_behavior.md).
+
+## Nested steps
+
+Use `step.substep(name=...)` to open a child step. Substeps nest arbitrarily
+deep, and a failure at any depth propagates up to fail the parent and the
+report.
+
+```python title="test_nested_steps.py"
+import time
+
+
+def test_phased_check(step):
+    """Phase a single test into setup/exercise/verify substeps."""
+    with step.substep(name="setup", description="Power on and wait for boot") as setup:
+        setup.measure(name="boot_time_s", value=2.1, bounds={"max": 5.0}, unit="s")
+
+    with step.substep(name="exercise", description="Drive the test sequence"):
+        time.sleep(0.01)
+
+    with step.substep(name="verify", description="Read final state") as verify:
+        verify.measure(name="final_state", value="IDLE", bounds="IDLE")
+
+
+def test_deeply_nested(step):
+    """A failure at the bottom fails everyone above it."""
+    with step.substep(name="level_1") as l1:
+        with l1.substep(name="level_2") as l2:
+            with l2.substep(name="level_3") as l3:
+                l3.measure(name="leaf_value", value=42, bounds={"min": 0, "max": 100})
+```
+
+Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …) assigned
+by `ReportContext`. Sibling substeps within the same parent auto-increment;
+opening a new top-level step starts a new branch.
+
+### Mirroring the test layout
+
+The plugin opens a parent step for each Python package (`__init__.py`
+directory), test file, and test class above every test, plus a parent step for
+each `@pytest.mark.parametrize` axis. Every layer is on by default and can be
+individually disabled via ini flags (`sift_package_step`, `sift_module_step`,
+`sift_class_step`, `sift_parametrize_nesting`). Class/module/package docstrings
+become the matching step's description.
+
+### Linking a Run to the report
+
+`report_context` is the session-scoped fixture; mutating it in one test affects
+the whole report.
+
+```python
+def test_link_run_to_report(report_context, sift_client):
+    run = sift_client.runs.create(...)  # however you create your run
+    report_context.report.update({"run_id": run.id_})
+```
+
+The same `update({...})` pattern works for any field on `TestReportUpdate`,
+including `serial_number`, `part_number`, `system_operator`, and `metadata`.
+
+## How pytest layout maps to a Sift report
+
+The plugin builds the report tree by hooking pytest's collection: every test
+node it sees becomes a step. What you control is which constructs create nodes
+and where you nest substeps inside them. Common layouts and the resulting report
+trees:
+
+### Flat module of test functions
+
+The default. Each function is one step directly under the report.
+
+```python title="test_battery.py"
+def test_voltage(step): ...
+def test_current(step): ...
+def test_temperature(step): ...
+```
+
+```text title="Sift report"
+TestReport
+├── test_voltage
+├── test_current
+└── test_temperature
+```
+
+### Modules nested under a package
+
+Two test files under the same Python package (directory with `__init__.py`)
+share that package step as their parent.
+
+```python title="suites/__init__.py"
+```
+
+```python title="suites/test_battery.py"
+def test_voltage(step): ...
+def test_current(step): ...
+```
+
+```python title="suites/test_thermal.py"
+def test_idle_temp(step): ...
+def test_load_temp(step): ...
+```
+
+```text title="Sift report"
+TestReport
+└── suites
+    ├── test_battery.py
+    │   ├── test_voltage
+    │   └── test_current
+    └── test_thermal.py
+        ├── test_idle_temp
+        └── test_load_temp
+```
+
+### Test classes (and nested classes)
+
+`class TestFoo:` and `class TestOuter: class TestInner:` produce class and
+nested class steps automatically, with no manual fixture needed.
+
+```python title="test_charging.py"
+class TestCharging:
+    """Charging subsystem."""
+
+    def test_starts_at_zero(self, step): ...
+    def test_reaches_full(self, step): ...
+    def test_thermal_throttle(self, step): ...
+```
+
+```text title="Sift report"
+TestReport
+└── test_charging.py
+    └── TestCharging
+        ├── test_starts_at_zero
+        ├── test_reaches_full
+        └── test_thermal_throttle
+```
+
+The class's docstring becomes the step description.
+
+### Parametrized tests
+
+Parametrized tests cluster under a parent step named after the test function,
+with one inner parent per parametrize axis (outer-to-inner in decorator-on-page
+order). Stacked parametrize produces nested step levels.
+
+```python
+@pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
+def test_rail(step, voltage):
+    step.measure(name="rail_v", value=voltage, bounds={"min": 0.0})
+```
+
+```text title="Sift report"
+TestReport
+└── test_module.py
+    └── test_rail
+        ├── voltage=3.3
+        ├── voltage=5.0
+        └── voltage=12.0
+```
+
+Stacked parametrize:
+
+```python
+@pytest.mark.parametrize("voltage", ["high", "low"])
+@pytest.mark.parametrize("component", ["motor", "valve"])
+def test_iso(step, voltage, component): ...
+```
+
+```text title="Sift report"
+TestReport
+└── test_module.py
+    └── test_iso
+        ├── voltage='high'
+        │   ├── component='motor'
+        │   └── component='valve'
+        └── voltage='low'
+            ├── component='motor'
+            └── component='valve'
+```
+
+Set `sift_parametrize_nesting = false` in `pytest.ini` to fall back to flat leaf
+names (`test_rail[3.3]`).
+
+### Helper functions
+
+Helpers called from a test do not auto-create a step. The plugin only sees
+pytest-collected nodes. To represent helper work in the report, open a substep
+at the call site and pass it into the helper:
+
+```python
+def measure_rail(step, name, value, bounds):
+    return step.measure(name=name, value=value, bounds=bounds, unit="V")
+
+
+def test_power_rails(step):
+    with step.substep(name="3.3V rail") as rail_3v3:
+        measure_rail(rail_3v3, "rail_v", 3.31, {"min": 3.2, "max": 3.4})
+
+    with step.substep(name="5V rail") as rail_5v:
+        measure_rail(rail_5v, "rail_v", 5.02, {"min": 4.9, "max": 5.1})
+```
+
+```text title="Sift report"
+TestReport
+└── test_power_rails
+    ├── 3.3V rail
+    │   └── rail_v        (measurement)
+    └── 5V rail
+        └── rail_v        (measurement)
+```
+
+!!! tip "Docstring-as-description is top-level only"
+    The plugin reads the test function's docstring and uses it as the step
+    description. Docstrings on helper functions are not picked up. Pass
+    `description="..."` explicitly on `substep(...)` if you want one.
+
+### Fixtures that contribute steps
+
+A fixture can open its own substep around setup/teardown by using `step` (for
+function-scope) or `report_context.new_step(...)` (for any scope). The substep
+ends when the fixture's `yield` returns, which makes the report tree mirror the
+lifecycle.
+
+```python
+@pytest.fixture
+def warmed_up_dut(step):
+    with step.substep(name="warmup", description="Bring DUT to operating temp"):
+        # ... do warmup work ...
+        yield "dut-handle"
+
+
+def test_steady_state(step, warmed_up_dut):
+    step.measure(name="temp_c", value=37.2, bounds={"min": 35.0, "max": 40.0})
+```
+
+```text title="Sift report"
+TestReport
+└── test_steady_state
+    ├── warmup        (from fixture)
+    └── temp_c        (measurement)
+```
+
+## Measurement variants
+
+`step.measure(...)` records exactly one measurement. For datasets coming off a
+sensor or calculated channel, use one of the bulk variants.
+
+### `measure_avg`: one row, the mean
+
+`measure_avg` accepts a Python list, a NumPy array, or a pandas `Series`, takes
+the mean, and evaluates it against bounds.
+
+```python
+import numpy as np
+import pandas as pd
+
+
+def test_avg_with_list(step):
+    samples = [4.97, 5.01, 5.03, 4.99, 5.02]
+    step.measure_avg(
+        name="bus_voltage_avg",
+        values=samples,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+
+
+def test_avg_with_numpy(step):
+    samples = np.linspace(99.5, 100.5, num=50)
+    step.measure_avg(
+        name="cpu_temp_avg",
+        values=samples,
+        bounds={"min": 95.0, "max": 105.0},
+        unit="C",
+    )
+
+
+def test_avg_with_pandas(step):
+    series = pd.Series([0.998, 1.001, 0.999, 1.002, 1.000])
+    step.measure_avg(
+        name="reference_clock_ratio",
+        values=series,
+        bounds={"min": 0.99, "max": 1.01},
+    )
+```
+
+### `measure_all`: only out-of-bounds rows
+
+Records measurements only for samples that fail bounds, so an all-pass dataset
+of N samples doesn't add N rows to the report. Returns `True` when every sample
+is in bounds.
+
+```python
+def test_only_outliers_recorded(step):
+    samples = [10.1, 10.2, 10.3, 99.9, 10.0, 10.1]  # 99.9 is the outlier
+    all_in_bounds = step.measure_all(
+        name="pressure_psi",
+        values=samples,
+        bounds={"min": 9.0, "max": 11.0},
+        unit="psi",
+    )
+    # Returns False because 99.9 is out of bounds. The step is already
+    # marked failed; call this only if you also want pytest to fail.
+    step.fail_if_measurements_failed()
+```
+
+!!! note "`measure_all` requires at least one bound"
+    Passing `bounds={}` raises `ValueError("No bounds provided")`. At least one
+    of `min` or `max` must be set.
+
+### `report_outcome`: externally computed pass/fail
+
+When the decision is computed elsewhere, drop it onto the report as a named
+substep with an optional reason. Returns the result you passed in, so you can
+use it inline.
+
+```python
+def test_external_checks(step):
+    step.report_outcome(
+        name="config_loaded",
+        result=True,
+        reason="loaded /etc/dut/config.yaml",
+    )
+
+    # Failures show up as a failed substep without raising.
+    rare_warning_seen = False
+    step.report_outcome(
+        name="no_rare_warning",
+        result=not rare_warning_seen,
+        reason="grep'd dmesg for the known-flaky warning",
+    )
+```
+
+### Bounds reference
+
+| Pass to `bounds=` | Value type | Effect |
+|---|---|---|
+| `{"min": x, "max": y}` (either key optional) | `int` / `float` | Numeric window. One-sided is fine. |
+| `NumericBounds(min=x, max=y)` | `int` / `float` | Same as the dict form, explicit. |
+| `"expected-string"` | `str` (or `bool`) | Exact equality. For `bool` values, compares lowercased string (`"true"`/`"false"`). |
+| `True` or `False` | `bool` (or `str`) | Exact equality. For `str` values, compares lowercased strings. |
+| `None` | any | Records the value but does not evaluate it; measurement is recorded as `passed=True`. |
+
+The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
+
+## Report metadata captured automatically
+
+Every report the plugin creates includes:
+
+- `name` and `test_case`: derived from the first positional argument to `pytest`. When it resolves to an existing path the plugin uses the basename for `name` and the full path string for `test_case`; otherwise both fall back to `pytest <args>`. `name` always has a UTC ISO timestamp appended. See examples below.
+- `test_system_name`: `socket.gethostname()`.
+- `system_operator`: `getpass.getuser()`.
+- `start_time` / `end_time`: set on session enter/exit.
+- `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
+- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-git-metadata` or when not in a git repo.
+
+Example invocations:
+
+| Pytest invocation | Report `name` | Report `test_case` |
+|---|---|---|
+| `pytest tests/test_battery.py` | `test_battery.py 2026-05-04T12:00:00.123456+00:00` | `tests/test_battery.py` |
+| `pytest tests/` | `tests 2026-05-04T12:00:00.123456+00:00` | `tests` |
+| `pytest -k voltage` | `pytest -k voltage 2026-05-04T12:00:00.123456+00:00` | `pytest -k voltage` |
+
+To override defaults (e.g. set a serial number, system operator, or extra
+metadata), call `report_context.report.update({...})` from any test or fixture.
+See [Linking a Run](#linking-a-run-to-the-report) for the same pattern applied
+to `run_id`.
diff --git a/python/docs/guides/pytest_plugin/running_modes.md b/python/docs/guides/pytest_plugin/running_modes.md
new file mode 100644
index 000000000..e69688cf1
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/running_modes.md
@@ -0,0 +1,138 @@
+# Running Modes
+
+The plugin runs in one of three modes, picked at invocation. This page covers
+how each mode behaves, the log-file/replay pipeline, and how to replay a saved
+log against Sift.
+
+## Running the suite
+
+```bash
+# Full run against your Sift tenant
+pytest
+
+# Pin the log file so you can replay it later if the import worker dies
+pytest --sift-log-file=./sift-results.jsonl
+```
+
+## The three modes
+
+| Mode | Flag | Network | Log file | `step.measure(...)` | When to use |
+|---|---|---|---|---|---|
+| Online (default) | _(none)_ | yes (pings at session start, aborts if it fails) | optional write-through backup | real measurement against Sift | CI with Sift credentials, local dev hitting your tenant |
+| Offline | `--sift-offline` | none | required (the sole sink) | real measurement queued to log | field tests, air-gapped labs, CI without network |
+| Disabled | `--sift-disabled` | none | none | bounds eval; returns a real bool | local dev or CI that doesn't have (or want) Sift |
+
+Pass both flags and disabled wins: it skips Sift entirely and supersedes every
+other setting.
+
+## Online mode (default)
+
+`report_context` resolves `client_has_connection` at session start. The default
+implementation calls `sift_client.ping.ping()`. A failed ping aborts the whole
+session with `pytest.UsageError` and points at `--sift-offline` and
+`--sift-disabled` as escape hatches.
+
+This is loud on purpose. A CI run that silently no-ops on a flaky network won't
+get noticed until somebody goes looking for the report, which is usually weeks
+later, which is usually too late.
+
+With the default `--sift-log-file` setting on, create/update calls are written
+to a JSONL log file during the run and an `import-test-result-log --incremental`
+worker replays them against Sift in the background. If the worker crashes
+mid-session (connection failure, API error) or is still draining its backlog at
+session end, the failure is logged with an `import-test-result-log`
+command for manual recovery. Test outcomes are unaffected and the local log
+file is preserved. Pass `--sift-log-file=false` to make every create/update
+synchronous against the API instead.
+
+### Overriding the connection check
+
+Override `client_has_connection` when ping isn't the right signal, for example a
+token cache that's only warm when authenticated:
+
+```python title="conftest.py"
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def client_has_connection(sift_client) -> bool:
+    return Path("~/.sift-token-cache").expanduser().is_file()
+```
+
+The override is ignored under `--sift-offline` and `--sift-disabled`.
+
+## Offline mode (`--sift-offline`)
+
+Same fixtures, same `step.measure(...)` semantics as online. The difference is
+where the writes go: every create/update lands in a JSONL log file instead of
+hitting the Sift API. The session-start ping is skipped, missing `SIFT_*` env
+vars are tolerated (placeholders are filled), and the replay worker
+(`import-test-result-log --incremental`) is never spawned.
+
+```bash
+pytest --sift-offline --sift-log-file=./run.jsonl
+```
+
+Once you have connectivity, replay it:
+
+```bash
+import-test-result-log ./run.jsonl
+```
+
+That replay creates the report, steps, and measurements against Sift. See
+[Replaying a saved log file](#replaying-a-saved-log-file) for cleanup and the
+incremental flag.
+
+`--sift-log-file=false` is rejected when offline is set. The log file is the only
+sink in offline mode, so without it the results are gone.
+
+!!! warning "Pin the log path"
+    Without `--sift-log-file=<path>`, offline mode writes to a
+    `tempfile.NamedTemporaryFile` and only surfaces the path via a `logger.info`
+    line. Pin a known path when you intend to replay later.
+
+## Disabled mode (`--sift-disabled`)
+
+The plugin stays loaded with the same fixtures and markers as the other modes.
+Nothing contacts Sift, no log file is written, and no `SIFT_*` env vars are
+required. `step.measure(...)`, `step.measure_avg(...)`, `step.measure_all(...)`,
+`step.substep(...)`, and `report_context.report.update({...})` all behave
+normally: bounds evaluate and you get a real pass/fail boolean back.
+
+Entities returned in disabled mode report `is_simulated == True` (on
+`TestReport`, `TestStep`, `TestMeasurement`, and `ReportContext`) so consumers
+and tests can branch on provenance. Offline-mode entities also report
+`is_simulated == True`.
+
+How to turn it on, in the order most projects pick:
+
+```bash
+# In an .envrc, devcontainer, or CI job config
+export SIFT_DISABLED=1
+
+# Per-invocation kill-switch
+pytest --sift-disabled
+
+# Per-project default (uncommon; online is usually the right default)
+# pyproject.toml:
+#   [tool.pytest.ini_options]
+#   sift_disabled = true
+```
+
+This is a good fit for local dev without Sift credentials and for library consumers
+who don't have a Sift tenant. It is also useful in CI for runs that shouldn't add
+noise to the report stream, like a PR job re-running the same suite five times in a row.
+
+## Replaying a saved log file
+
+When the worker doesn't finish cleanly the plugin will print a hint mentioning
+`import-test-result-log`. To import:
+
+```bash
+import-test-result-log <path-to-log.jsonl>
+```
+
+That replays the saved JSONL log as a single batch (no `--incremental`) and
+deletes the file when it lives under the system temp dir.
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index af174aa4f..5a9c73e82 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -51,6 +51,10 @@ extra:
     provider: mike
     alias: true
 
+# Kept out of the nav but still built so the old URL redirects to the guide.
+not_in_nav: |
+  /examples/pytest_plugin.md
+
 nav:
   - Home: index.md
   - Sift Client API
@@ -59,11 +63,14 @@ nav:
       - examples/index.md
       - Basic Usage: examples/basic.ipynb
       - Data Ingestion: examples/ingestion.ipynb
-        # Will migrate to Guides in the future
-      - Pytest Plugin: examples/pytest_plugin.md
       - Pytest Plugin Quickstart: examples/pytest_plugin_quickstart.md
   - Guides:
+      - guides/index.md
       - Pytest Plugin:
+          - Overview: guides/pytest_plugin/index.md
+          - Configuration & Defaults: guides/pytest_plugin/configuration.md
+          - Running Modes: guides/pytest_plugin/running_modes.md
+          - Report Structure: guides/pytest_plugin/report_structure.md
           - Pass/Fail Behavior: guides/pytest_plugin/pass_fail_behavior.md
 #  - Guides:
 #      - Logging

From 287f41a612298cf5136071582812c0438faf984b Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 13:37:39 -0700
Subject: [PATCH 08/19] Python(docs): add v0.17.0 changelog entry for pytest
 plugin

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/CHANGELOG.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index cc9fc06a8..8aa9d816a 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -3,6 +3,30 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## [v0.17.0] - Unreleased
+
+### What's New
+#### Pytest Plugin
+The client now ships a pytest plugin that turns a pytest run into a `TestReport` in Sift. Register it with a single `pytest_plugins = ["sift_client.pytest_plugin"]` line in your top-level `conftest.py`. Each test function becomes a `TestStep`, measurements appear as rows under that step, and failures roll up through nested substeps to the report. The `step` fixture is autouse, so every test is recorded; request it as an argument to call `step.measure(...)` and record values against bounds.
+
+Highlights:
+- **Hierarchical report tree.** Packages, modules, classes, and parametrize axes above a test each become a parent step, so the report mirrors your test layout. Arbitrary substeps can be opened inside a test.
+- **Three running modes.** Online (default) pings Sift at session start and streams create/update calls during the run; offline records to a JSONL log for later replay; disabled evaluates bounds locally without contacting Sift. Select with `--sift-offline` or `--sift-disabled`.
+- **Graceful connection handling.** Online mode aborts at session start if Sift is unreachable or credentials are invalid, so a misconfigured job fails fast. If the connection drops mid-run, tests keep running and the log keeps writing locally; remaining entries upload afterward via the import command the plugin prints on exit.
+- **Pass/fail mapping.** Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard exit) maps to a `TestStatus` and propagates to parent steps and the report. `step.measure(...)` returns a pass/fail boolean without raising, so all measurements land in the report even when one fails; `step.fail_if_measurements_failed()` fails the test at the end without adding assertion noise to `error_info`.
+- **Assertion messages as error info.** Assertion failure messages are reported as the step's error info.
+- **Git metadata.** Repo, branch, and commit are captured on the report automatically.
+
+See the [Pytest Plugin guide](https://github.com/sift-stack/sift/blob/main/python/docs/guides/pytest_plugin/index.md) and the runnable quickstart example for full configuration.
+
+### Full Changelog
+- [Pytest plugin improvements](https://github.com/sift-stack/sift/pull/567)
+- [Graceful handling of missing connection](https://github.com/sift-stack/sift/pull/569)
+- [Hierarchical pytest report tree](https://github.com/sift-stack/sift/pull/570)
+- [Pass/fail behavior improvements](https://github.com/sift-stack/sift/pull/568)
+- [Report assertion message as error info](https://github.com/sift-stack/sift/pull/587)
+- [Pytest docs reorganization](https://github.com/sift-stack/sift/pull/589)
+
 ## [v0.16.2] - May 21, 2026
 
 ### Bugfixes

From 47556306f87eee9b3fe7ecb4fd99b806176977df Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 13:50:03 -0700
Subject: [PATCH 09/19] revert rapidyaml version change

---
 python/pyproject.toml | 6 +-----
 python/uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 0bb07e84a..a2cd6a410 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -26,11 +26,7 @@ keywords = ["sift", "sift-stack", "siftstack", "sift_py"]
 dependencies = [
     "grpcio~=1.13",
     "PyYAML~=6.0",
-    # TODO: rapidyaml 0.13.0 ships C++ source that fails to compile against
-    # the GCC version on current GitHub Actions runners (csubstr operator=
-    # and SFINAE errors in the bundled c4core). Cap below 0.13 until either
-    # rapidyaml ships fixed sdists or we move to binary wheels.
-    "rapidyaml>=0.11,<0.13",
+    "rapidyaml~=0.11",
     "pandas>=2.0,<3.1",
     "protobuf>=5.0",
     "pydantic~=2.10",
diff --git a/python/uv.lock b/python/uv.lock
index 038a7ce09..9ed71e17b 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -4315,7 +4315,7 @@ wheels = [
 
 [[package]]
 name = "sift-stack-py"
-version = "0.16.2"
+version = "0.17.0.dev0"
 source = { editable = "." }
 dependencies = [
     { name = "alive-progress", version = "3.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },

From fe9c0a38187477e43508a39fbb171a0803b005cd Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 26 May 2026 14:19:02 -0700
Subject: [PATCH 10/19] Python(chore): use inprocess to improve test
 performance (#590)

---
 .../_tests/pytest_plugin/conftest.py          | 18 ++++---
 .../_tests/pytest_plugin/test_hierarchy.py    | 48 +++++++++----------
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/python/lib/sift_client/_tests/pytest_plugin/conftest.py b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
index 783a12bf4..7afee768d 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/conftest.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
@@ -9,13 +9,17 @@
   block inside ``pytest_configure``, useful for inspecting internal state
   without running tests against a real backend
 
-Every test in this suite invokes the inner session via
-``pytester.runpytest_subprocess(...)`` rather than ``pytester.runpytest(...)``.
-``runpytest`` runs the inner pytest in-process, which re-imports the Sift
-plugin on each test; the plugin transitively imports numpy, whose C
-extensions refuse to initialize twice in one process and raise
-``cannot load module more than once per process``. Spawning a subprocess
-gives each inner session a fresh interpreter and sidesteps that guard.
+The offline-log tests (``test_hierarchy.py``, ``test_pass_fail.py``) drive the
+inner session in-process via ``pytester.runpytest_inprocess(...)``. This is
+fast because the outer session already preloads the plugin (``pyproject.toml``
+sets ``addopts = "... -p sift_client.pytest_plugin ..."``), so the numpy C
+extensions the plugin pulls in are imported once for the whole outer process
+and reused by every inner run — no per-test interpreter spawn, and no
+``cannot load module more than once per process`` re-init guard to trip.
+
+Tests that need true process isolation (fresh env vars, credential and
+connection resolution, ini parsing) still use ``pytester.runpytest_subprocess(...)``
+so the inner session starts from a clean interpreter.
 """
 
 from __future__ import annotations
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
index 1efd4e817..9e0dd52e0 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -76,7 +76,7 @@ def test_b(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -97,7 +97,7 @@ def test_a(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -125,7 +125,7 @@ def test_a(self, v):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -150,7 +150,7 @@ def test_y(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -175,7 +175,7 @@ def test_free():
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -205,7 +205,7 @@ def test_b(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -230,7 +230,7 @@ def test_b(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -252,7 +252,7 @@ def test_a(self):
             '''
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -284,7 +284,7 @@ def test_y(self, w):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -396,7 +396,7 @@ def test_c(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2, failed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -434,7 +434,7 @@ def test_b(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2, failed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -476,7 +476,7 @@ def test_b(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -499,7 +499,7 @@ def test_a(self):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -524,7 +524,7 @@ def test_a(v):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -565,7 +565,7 @@ def test_y(self):
             """
         ),
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -593,7 +593,7 @@ def test_one():
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -637,7 +637,7 @@ def test_two():
     )
     # ``importlib`` import mode is required so two packages with the same
     # name on disk don't collide during sys.path-based import.
-    result = pytester.runpytest_subprocess("-v", "--import-mode=importlib")
+    result = pytester.runpytest_inprocess("-v", "--import-mode=importlib")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -666,7 +666,7 @@ def test_one():
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=1)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -697,7 +697,7 @@ def test_a(self, v):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -730,7 +730,7 @@ def test_rail(v):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -759,7 +759,7 @@ def test_iso(voltage, component):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=4)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -798,7 +798,7 @@ def test_widget(widget):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=2)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -831,7 +831,7 @@ def test_two(w):
             """
         ),
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=4)
     steps = capture.load_steps(log_file)
     by_name = _by_name(steps)
@@ -855,7 +855,7 @@ def test_chain(a, b):
             """
         )
     )
-    result = pytester.runpytest_subprocess("-v")
+    result = pytester.runpytest_inprocess("-v")
     result.assert_outcomes(passed=1)
     steps = capture.load_steps(log_file)
     leaf = next(s for s in steps if s["name"].startswith("b="))

From ae0babe532de7826b6dde491a2253b7a4bbfac24 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Thu, 28 May 2026 14:54:05 -0700
Subject: [PATCH 11/19] Python(feat): pytest summary output (#594)

---
 python/CHANGELOG.md                           |   1 +
 .../guides/pytest_plugin/configuration.md     |   3 +
 .../guides/pytest_plugin/running_modes.md     |  82 +++++
 .../_internal/grpc_transport/transport.py     |  25 +-
 python/lib/sift_client/_internal/rest.py      |   4 +-
 python/lib/sift_client/_internal/urls.py      |  55 +++
 .../_tests/pytest_plugin/conftest.py          |   2 +-
 .../pytest_plugin/test_terminal_output.py     | 195 ++++++++++
 python/lib/sift_client/_tests/test_urls.py    |  74 ++++
 python/lib/sift_client/client.py              |  27 ++
 python/lib/sift_client/pytest_plugin.py       | 340 ++++++++++++++++++
 .../lib/sift_client/sift_types/test_report.py |  32 ++
 .../sift_client/transport/base_connection.py  |   6 +
 .../util/test_results/context_manager.py      |  60 +++-
 14 files changed, 879 insertions(+), 27 deletions(-)
 create mode 100644 python/lib/sift_client/_internal/urls.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
 create mode 100644 python/lib/sift_client/_tests/test_urls.py

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index 8aa9d816a..1b8c43a93 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -16,6 +16,7 @@ Highlights:
 - **Pass/fail mapping.** Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard exit) maps to a `TestStatus` and propagates to parent steps and the report. `step.measure(...)` returns a pass/fail boolean without raising, so all measurements land in the report even when one fails; `step.fail_if_measurements_failed()` fails the test at the end without adding assertion noise to `error_info`.
 - **Assertion messages as error info.** Assertion failure messages are reported as the step's error info.
 - **Git metadata.** Repo, branch, and commit are captured on the report automatically.
+- **Terminal output.** The plugin prints a session header with the SDK version and active mode, and an end-of-run `Sift report` panel showing the test case, outcome, step and measurement breakdowns (color-coded), test system/operator, plus a link to the report (online), the saved log and upload command (offline), or a disabled note. Both are suppressed under `-q`. `SiftClient.app_url` exposes the web-app origin; set `sift_report_url_base` for on-prem or custom deployments. `--sift-open-report` opens the report in a browser at session end.
 
 See the [Pytest Plugin guide](https://github.com/sift-stack/sift/blob/main/python/docs/guides/pytest_plugin/index.md) and the runnable quickstart example for full configuration.
 
diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
index 6ed78f931..3b3151111 100644
--- a/python/docs/guides/pytest_plugin/configuration.md
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -132,6 +132,8 @@ def sift_client() -> SiftClient:
 | `--sift-disabled` | off | Skip Sift entirely. Nothing contacts the API and no log file is written; `step.measure(...)` still evaluates bounds and returns a real pass/fail boolean. Also honored via `SIFT_DISABLED=1`. Supersedes every other flag (disabled wins over offline). |
 | `--sift-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. Incompatible with `--sift-offline` since offline mode needs the log file as its sole sink. |
 | `--no-sift-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
+| `--sift-report-url-base=<origin>` | derived from REST URI | Web-app origin used to build the clickable report link in the terminal footer (e.g. `https://app.siftstack.com`). Set this for on-prem or custom deployments whose API host can't be mapped to a frontend automatically. Also honored via the `SIFT_APP_URL` environment variable. When unset, the link is derived from the REST URI for known Sift hosts. |
+| `--sift-open-report` | off | Open the resulting report in a browser at session end. Online mode only; a no-op when the report URL can't be resolved. Intended for local development. |
 
 These can be passed permanently via `addopts`:
 
@@ -158,6 +160,7 @@ CLI flags, when passed, override the ini values.
 | `sift_module_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each test module (file). |
 | `sift_class_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each test class, including nested classes. |
 | `sift_parametrize_nesting` | bool (default `true`) | _(ini-only)_. Clusters parametrized tests under shared parents (`test_x`, `axis=value`) instead of flat leaves (`test_x[value]`). |
+| `sift_open_report` | bool (default `false`) | `--sift-open-report` |
 
 ```toml title="pyproject.toml"
 [tool.pytest.ini_options]
diff --git a/python/docs/guides/pytest_plugin/running_modes.md b/python/docs/guides/pytest_plugin/running_modes.md
index e69688cf1..9289428e4 100644
--- a/python/docs/guides/pytest_plugin/running_modes.md
+++ b/python/docs/guides/pytest_plugin/running_modes.md
@@ -25,6 +25,88 @@ pytest --sift-log-file=./sift-results.jsonl
 Pass both flags and disabled wins: it skips Sift entirely and supersedes every
 other setting.
 
+## Terminal output
+
+Each run prints a header with the SDK version and active mode, and an end-of-run
+`Sift report` panel summarizing the outcome. Both are suppressed under `-q`. The
+panel is color-coded when the terminal supports it (green pass, red
+failure/error, yellow skip, cyan link) and plain text otherwise (`--color=no`,
+captured output, CI logs).
+
+The section title carries the report name (truncated if long). The `Steps` row
+tallies every step in the report by final status, so it counts substeps and the
+package/module/class/parametrize grouping steps too — its totals are expected to
+exceed pytest's own test count. The `Measurements` row tallies recorded
+measurements (`step.measure(...)`) and is omitted when there are none. The
+`Test case` and `System` rows echo the report's test case, test system, and
+operator.
+
+**Online** shows the report metadata, step and measurement breakdowns, and a
+clickable link. The web host is derived from the REST URI for known Sift hosts;
+for on-prem or custom deployments set `--sift-report-url-base`
+(ini: `sift_report_url_base`, env: `SIFT_APP_URL`). Add `--sift-open-report` to
+open the report in a browser at session end.
+
+```text
+============================= test session starts ==============================
+platform linux -- Python 3.11.8, pytest-8.3.2, pluggy-1.5.0
+Sift: sift-stack-py 0.17.0 — online mode
+collected 12 items
+
+tests/test_battery.py ........                                           [ 66%]
+tests/test_thermal.py ....                                               [100%]
+
+================ Sift report · pytest tests/ 2026-05-27T22:44:23Z ==============
+  Test case    pytest tests/
+  Status       PASSED       online · sift-stack-py 0.17.0
+  Steps        14 passed
+  Measurements 42 passed
+  System       ci-runner-7 · cibot
+  Log file     /tmp/sift-a1b2c3.jsonl
+  Report       https://app.siftstack.com/test-results/0193f1a2-7c44-7e5b-9b1a-2f6c0d9e84aa
+============================== 12 passed in 3.45s ==============================
+```
+
+If the background uploader doesn't finish, the panel still links the report and
+flags that it may be incomplete:
+
+```text
+================ Sift report · pytest tests/ 2026-05-27T22:44:23Z ==============
+  Test case    pytest tests/
+  Status       FAILED       online · sift-stack-py 0.17.0
+  Steps        11 passed · 2 failed · 1 error
+  Measurements 40 passed · 3 failed
+  System       ci-runner-7 · cibot
+  Log file     /tmp/sift-a1b2c3.jsonl
+  Report       https://app.siftstack.com/test-results/0193f1a2-7c44-7e5b-9b1a-2f6c0d9e84aa
+               may be incomplete — finish with: import-test-result-log /tmp/sift-a1b2c3.jsonl
+```
+
+When the web host can't be resolved and no override is set, the `Report` row
+shows the report id instead of a link.
+
+**Offline** shows the metadata and breakdowns, then the upload command under a
+small rule (the log path is part of the command):
+
+```text
+================ Sift report · pytest tests/ 2026-05-27T22:44:23Z ==============
+  Test case    pytest tests/
+  Status       PASSED       offline · not uploaded
+  Steps        14 passed
+  Measurements 42 passed
+  System       ci-runner-7 · cibot
+  Log file     ./run.jsonl
+------------------------------ to upload to Sift -------------------------------
+  >> import-test-result-log ./run.jsonl
+```
+
+**Disabled** notes that no report was created:
+
+```text
+===================================== Sift =====================================
+Sift disabled — no test report created.
+```
+
 ## Online mode (default)
 
 `report_context` resolves `client_has_connection` at session start. The default
diff --git a/python/lib/sift_client/_internal/grpc_transport/transport.py b/python/lib/sift_client/_internal/grpc_transport/transport.py
index 7e0bc5425..e088befa0 100644
--- a/python/lib/sift_client/_internal/grpc_transport/transport.py
+++ b/python/lib/sift_client/_internal/grpc_transport/transport.py
@@ -8,7 +8,6 @@
 
 from importlib.metadata import PackageNotFoundError, version
 from typing import TYPE_CHECKING, Any, TypedDict, cast
-from urllib.parse import ParseResult, urlparse
 
 import grpc
 import grpc.aio as grpc_aio
@@ -21,6 +20,7 @@
     Metadata,
     MetadataInterceptor,
 )
+from sift_client._internal.urls import parse_host
 
 if TYPE_CHECKING:
     from sift_client._internal.grpc_transport._async_interceptors.base import ClientAsyncInterceptor
@@ -78,7 +78,7 @@ def use_sift_channel(
 
     credentials = get_ssl_credentials(cert_via_openssl)
     options = _compute_channel_options(config)
-    api_uri = _clean_uri(config["uri"], use_ssl)
+    api_uri = parse_host(config["uri"])
     channel = grpc.secure_channel(api_uri, credentials, options)
     interceptors = _compute_sift_interceptors(config, metadata)
     return grpc.intercept_channel(channel, *interceptors)
@@ -98,7 +98,7 @@ def use_sift_async_channel(
         return _use_insecure_sift_async_channel(config, metadata)
 
     return grpc_aio.secure_channel(
-        target=_clean_uri(config["uri"], use_ssl),
+        target=parse_host(config["uri"]),
         credentials=get_ssl_credentials(cert_via_openssl),
         options=_compute_channel_options(config),
         interceptors=_compute_sift_async_interceptors(config, metadata),
@@ -112,7 +112,7 @@ def _use_insecure_sift_channel(
     FOR DEVELOPMENT PURPOSES ONLY
     """
     options = _compute_channel_options(config)
-    api_uri = _clean_uri(config["uri"], False)
+    api_uri = parse_host(config["uri"])
     channel = grpc.insecure_channel(api_uri, options)
     interceptors = _compute_sift_interceptors(config, metadata)
     return grpc.intercept_channel(channel, *interceptors)
@@ -125,7 +125,7 @@ def _use_insecure_sift_async_channel(
     FOR DEVELOPMENT PURPOSES ONLY
     """
     return grpc_aio.insecure_channel(
-        target=_clean_uri(config["uri"], False),
+        target=parse_host(config["uri"]),
         options=_compute_channel_options(config),
         interceptors=_compute_sift_async_interceptors(config, metadata),
     )
@@ -205,21 +205,6 @@ def _metadata_async_interceptor(
     return MetadataAsyncInterceptor(md)
 
 
-def _clean_uri(uri: str, use_ssl: bool) -> str:
-    """
-    This will automatically transform the URI to an acceptable form regardless of whether or not
-    users included the scheme in the URL or included trailing slashes.
-    """
-
-    if "http://" in uri or "https://" in uri:
-        parsed: ParseResult = urlparse(uri)
-        return parsed.netloc
-
-    full_uri = f"https://{uri}" if use_ssl else f"http://{uri}"
-    parsed_res: ParseResult = urlparse(full_uri)
-    return parsed_res.netloc
-
-
 def _compute_user_agent() -> str:
     try:
         return f"sift_stack_py/{version('sift_stack_py')}"
diff --git a/python/lib/sift_client/_internal/rest.py b/python/lib/sift_client/_internal/rest.py
index ee0239b79..6a9d1c9d1 100644
--- a/python/lib/sift_client/_internal/rest.py
+++ b/python/lib/sift_client/_internal/rest.py
@@ -6,7 +6,7 @@
 from typing_extensions import NotRequired
 from urllib3.util import Retry
 
-from sift_client._internal.grpc_transport.transport import _clean_uri
+from sift_client._internal.urls import parse_host
 
 _DEFAULT_REST_RETRY = Retry(total=3, status_forcelist=[500, 502, 503, 504], backoff_factor=1)
 
@@ -33,7 +33,7 @@ class SiftRestConfig(TypedDict):
 def compute_uri(restconf: SiftRestConfig) -> str:
     uri = restconf["uri"]
     use_ssl = restconf.get("use_ssl", True)
-    clean_uri = _clean_uri(uri, use_ssl)
+    clean_uri = parse_host(uri)
 
     if use_ssl:
         return f"https://{clean_uri}"
diff --git a/python/lib/sift_client/_internal/urls.py b/python/lib/sift_client/_internal/urls.py
new file mode 100644
index 000000000..99dd1816f
--- /dev/null
+++ b/python/lib/sift_client/_internal/urls.py
@@ -0,0 +1,55 @@
+"""Helpers for turning Sift API endpoints into web-app (frontend) URLs.
+
+The Sift frontend can be hosted on several domains and the backend exposes no
+field for its own URL, so the frontend origin is derived client-side from the
+API host. This table mirrors the canonical mapping used by the Grafana
+datasource (sift-stack/sift-grafana-datasource,
+``src/components/sharelink/getFrontendHostnameDefaults.ts``). Hosts outside the
+table (on-prem and custom deployments) require an explicit override.
+"""
+
+from __future__ import annotations
+
+from urllib.parse import urlparse
+
+# API host (host[:port], no scheme) -> frontend origin (with scheme).
+_API_HOST_TO_FRONTEND_ORIGIN: dict[str, str] = {
+    "api.siftstack.com": "https://app.siftstack.com",
+    "gov.api.siftstack.com": "https://gov.siftstack.com",
+}
+
+
+def parse_origin(url: str) -> str:
+    """Reduce a URL or scheme-less host to its ``scheme://host[:port]`` origin.
+
+    Input without ``://`` is treated as an ``https`` host; any path, query, or
+    fragment is discarded.
+    """
+    parts = urlparse(url if "://" in url else f"https://{url}")
+    return "://".join((parts.scheme, parts.netloc)).rstrip("/")
+
+
+def parse_host(url: str) -> str:
+    """Return the ``host[:port]`` (netloc) of a URL or scheme-less host string."""
+    prefix = "" if "://" in url else "https://"
+    return urlparse(f"{prefix}{url}").netloc
+
+
+def frontend_origin_for_api(api_base_url: str, override: str | None = None) -> str | None:
+    """Return the Sift web-app origin for a given API base URL.
+
+    Args:
+        api_base_url: The REST API base URL (e.g. ``https://api.siftstack.com``).
+            Its host is matched case-insensitively against the built-in mapping.
+        override: An explicit frontend origin (host or full URL) used instead of
+            the derived value; set it for on-prem or custom deployments.
+
+    Returns:
+        The frontend origin (e.g. ``https://app.siftstack.com``), or ``None``
+        when no override is given and the API host isn't recognized.
+    """
+    if override:
+        return parse_origin(override)
+    if not api_base_url:
+        return None
+    return _API_HOST_TO_FRONTEND_ORIGIN.get(parse_host(api_base_url).lower())
diff --git a/python/lib/sift_client/_tests/pytest_plugin/conftest.py b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
index 7afee768d..ba775e04b 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/conftest.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
@@ -29,7 +29,7 @@
 
 import pytest
 
-_SIFT_ENV_VARS = ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI", "SIFT_DISABLED")
+_SIFT_ENV_VARS = ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI", "SIFT_DISABLED", "SIFT_APP_URL")
 
 
 @pytest.fixture
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py b/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
new file mode 100644
index 000000000..76550cc22
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
@@ -0,0 +1,195 @@
+"""Tests for the plugin's terminal output (session header + report footer).
+
+Driven through inner pytester sessions. Online output is exercised by the
+``SiftClient.app_url`` unit tests (``_tests/test_urls.py``) since a live link
+needs a real backend; here we cover the deterministic disabled/offline footers
+and the ``-q`` suppression both share.
+"""
+
+from __future__ import annotations
+
+from collections import Counter
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Callable
+
+from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+from sift_client.pytest_plugin import (
+    _measurement_segments,
+    _resolve_real_report_id,
+    _step_count_segments,
+)
+from sift_client.sift_types.test_report import TestStatus
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestStepCountSegments:
+    def test_lists_nonzero_statuses_in_order_with_color(self) -> None:
+        counts = Counter({TestStatus.PASSED: 4, TestStatus.FAILED: 2, TestStatus.SKIPPED: 1})
+        assert _step_count_segments(counts) == [
+            ("4 passed", {"green": True}),
+            ("2 failed", {"red": True}),
+            ("1 skipped", {"yellow": True}),
+        ]
+
+    def test_error_and_aborted_are_red(self) -> None:
+        counts = Counter({TestStatus.ERROR: 1, TestStatus.ABORTED: 1})
+        assert _step_count_segments(counts) == [
+            ("1 error", {"red": True}),
+            ("1 aborted", {"red": True}),
+        ]
+
+    def test_empty_is_empty(self) -> None:
+        assert _step_count_segments(Counter()) == []
+
+
+class TestMeasurementSegments:
+    def test_passed_green_failed_red(self) -> None:
+        assert _measurement_segments(Counter({True: 2, False: 1})) == [
+            ("2 passed", {"green": True}),
+            ("1 failed", {"red": True}),
+        ]
+
+    def test_empty_is_empty(self) -> None:
+        assert _measurement_segments(Counter()) == []
+
+
+class TestResolveRealReportId:
+    """``_resolve_real_report_id`` maps the footer to the real server report id."""
+
+    def test_synchronous_online_uses_report_id_directly(self) -> None:
+        # No log file, non-simulated report (``--sift-log-file=false`` path).
+        context = SimpleNamespace(
+            report=SimpleNamespace(id_="real-123", is_simulated=False),
+            log_file=None,
+        )
+        assert _resolve_real_report_id(context) == "real-123"
+
+    def test_incremental_resolves_via_sidecar(self, tmp_path: Path) -> None:
+        log_file = tmp_path / "run.jsonl"
+        log_file.write_text("")
+        LogTracking(id_map={"sim-1": "real-1"}).save(log_file)
+        context = SimpleNamespace(
+            report=SimpleNamespace(id_="sim-1", is_simulated=True),
+            log_file=log_file,
+        )
+        assert _resolve_real_report_id(context) == "real-1"
+
+    def test_empty_report_id_returns_none(self) -> None:
+        # An unset/empty id must not produce a ``/test-results/`` link.
+        context = SimpleNamespace(
+            report=SimpleNamespace(id_="", is_simulated=False),
+            log_file=None,
+        )
+        assert _resolve_real_report_id(context) is None
+
+    def test_incremental_unmapped_returns_none(self, tmp_path: Path) -> None:
+        # Worker died before mapping the report: no sidecar entry.
+        log_file = tmp_path / "run.jsonl"
+        log_file.write_text("")
+        context = SimpleNamespace(
+            report=SimpleNamespace(id_="sim-1", is_simulated=True),
+            log_file=log_file,
+        )
+        assert _resolve_real_report_id(context) is None
+
+
+class TestHeader:
+    def test_header_shows_version_and_mode(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The session header reports the SDK version and the active mode."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
+        result.stdout.fnmatch_lines(["*sift-stack-py*disabled mode*"])
+
+    def test_header_suppressed_under_quiet(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``-q`` suppresses the header, matching pytest's own platform header."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled", "-q")
+        result.assert_outcomes(passed=1)
+        result.stdout.no_fnmatch_line("*sift-stack-py*")
+
+
+class TestDisabledFooter:
+    def test_footer_notes_no_report(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
+        result.stdout.fnmatch_lines(["*Sift disabled*no test report created*"])
+
+    def test_footer_suppressed_under_quiet(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled", "-q")
+        result.assert_outcomes(passed=1)
+        result.stdout.no_fnmatch_line("*Sift disabled*")
+
+
+class TestOfflineFooter:
+    def test_footer_shows_log_path_and_replay_command(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline footer points at the saved log file and the replay command."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        result.stdout.fnmatch_lines(
+            [
+                "*Test case*",
+                "*Status*offline*not uploaded*",
+                "*Steps*passed*",
+                "*Measurements*1 passed*",
+                "*System*",
+                f"*Log file*{log_path}",
+                "*to upload to Sift*",
+                f"*import-test-result-log {log_path}",
+            ]
+        )
+
+    def test_sift_open_report_flag_is_accepted_offline(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``--sift-open-report`` is a no-op offline (no resolvable URL) and never errors."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess(
+            "--sift-offline", f"--sift-log-file={log_path}", "--sift-open-report"
+        )
+        result.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/test_urls.py b/python/lib/sift_client/_tests/test_urls.py
new file mode 100644
index 000000000..be9febd52
--- /dev/null
+++ b/python/lib/sift_client/_tests/test_urls.py
@@ -0,0 +1,74 @@
+"""Tests for web-app URL derivation (``_internal/urls.py`` and ``SiftClient.app_url``)."""
+
+from __future__ import annotations
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.urls import frontend_origin_for_api
+
+
+class TestFrontendOriginForApi:
+    @pytest.mark.parametrize(
+        ("api_base_url", "expected"),
+        [
+            ("https://api.siftstack.com", "https://app.siftstack.com"),
+            ("https://gov.api.siftstack.com", "https://gov.siftstack.com"),
+            # Bare host (no scheme) resolves the same as the full URL.
+            ("api.siftstack.com", "https://app.siftstack.com"),
+        ],
+    )
+    def test_known_hosts(self, api_base_url: str, expected: str) -> None:
+        assert frontend_origin_for_api(api_base_url) == expected
+
+    def test_unknown_host_returns_none(self) -> None:
+        assert frontend_origin_for_api("https://api.acme.example.com") is None
+
+    def test_empty_returns_none(self) -> None:
+        assert frontend_origin_for_api("") is None
+
+    def test_override_wins_over_derivation(self) -> None:
+        # Override applies even for a known host.
+        assert (
+            frontend_origin_for_api("https://api.siftstack.com", override="https://app.acme.test")
+            == "https://app.acme.test"
+        )
+
+    def test_override_normalizes_bare_host(self) -> None:
+        assert (
+            frontend_origin_for_api("https://api.acme.example.com", override="sift.acme.test")
+            == "https://sift.acme.test"
+        )
+
+
+class TestSiftClientAppUrl:
+    def _client(self, rest_url: str, app_url: str | None = None) -> SiftClient:
+        return SiftClient(
+            connection_config=SiftConnectionConfig(
+                api_key="k",
+                grpc_url="grpc-api.siftstack.com:443",
+                rest_url=rest_url,
+            ),
+            app_url=app_url,
+        )
+
+    def test_derives_from_known_rest_host(self) -> None:
+        assert self._client("https://api.siftstack.com").app_url == "https://app.siftstack.com"
+
+    def test_unknown_host_without_override_is_none(self) -> None:
+        assert self._client("https://api.acme.example.com").app_url is None
+
+    def test_override_used_for_unknown_host(self) -> None:
+        client = self._client("https://api.acme.example.com", app_url="https://sift.acme.test")
+        assert client.app_url == "https://sift.acme.test"
+
+    def test_override_from_connection_config(self) -> None:
+        client = SiftClient(
+            connection_config=SiftConnectionConfig(
+                api_key="k",
+                grpc_url="grpc-api.siftstack.com:443",
+                rest_url="https://api.acme.example.com",
+                app_url="https://sift.acme.test",
+            )
+        )
+        assert client.app_url == "https://sift.acme.test"
diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py
index ff574adba..d77aff6c0 100644
--- a/python/lib/sift_client/client.py
+++ b/python/lib/sift_client/client.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from sift_client._internal.urls import frontend_origin_for_api
 from sift_client.resources import (
     AssetsAPI,
     AssetsAPIAsync,
@@ -124,6 +125,7 @@ def __init__(
         grpc_url: str | None = None,
         rest_url: str | None = None,
         connection_config: SiftConnectionConfig | None = None,
+        app_url: str | None = None,
     ):
         """Initialize the SiftClient with specific connection parameters or a connection_config.
 
@@ -132,6 +134,10 @@ def __init__(
             grpc_url: The Sift gRPC API URL.
             rest_url: The Sift REST API URL.
             connection_config: A SiftConnectionConfig object to configure the connection behavior of the SiftClient.
+            app_url: The Sift web-app origin (e.g. ``https://app.siftstack.com``).
+                Set this for on-prem or custom deployments whose API host can't be
+                mapped to a frontend automatically; see the ``app_url`` property.
+                A value here takes precedence over ``connection_config.app_url``.
         """
         if not (api_key and grpc_url and rest_url) and not connection_config:
             raise ValueError(
@@ -152,6 +158,12 @@ def __init__(
         WithGrpcClient.__init__(self, grpc_client=grpc_client)
         WithRestClient.__init__(self, rest_client=rest_client)
 
+        # Explicit web-app origin override; falls back to the connection config's
+        # value, then to host-based derivation in the ``app_url`` property.
+        self._app_url: str | None = app_url or (
+            connection_config.app_url if connection_config else None
+        )
+
         # When set, test-results writes return synthesized responses without
         # contacting Sift. Read by `TestResultsAPIAsync._simulate`. Used by the
         # pytest plugin's ``--sift-disabled`` mode.
@@ -198,3 +210,18 @@ def grpc_client(self) -> GrpcClient:
     def rest_client(self) -> RestClient:
         """The REST client used by the SiftClient for making REST API calls."""
         return self._rest_client
+
+    @property
+    def app_url(self) -> str | None:
+        """The Sift web-app origin for this client, or None if it can't be determined.
+
+        Uses the explicit override when set (the ``app_url`` constructor argument,
+        falling back to ``connection_config.app_url``), otherwise derives the
+        origin from the REST host for known Sift deployments (e.g.
+        ``https://api.siftstack.com`` -> ``https://app.siftstack.com``). Returns
+        None for unrecognized hosts with no override.
+        """
+        # TODO: Add a ``WithAppPage`` mixin on BaseType so resources (TestReport,
+        # Run, ...) can expose their own web-app link from ``_client.app_url`` plus
+        # a per-type path, instead of callers assembling paths by hand.
+        return frontend_origin_for_api(self.rest_client.base_url, override=self._app_url)
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 09aca5e33..cf85b3abb 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -15,6 +15,7 @@
 from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
 from sift_client.util.test_results.context_manager import (
+    _quiet_fork_stderr,
     format_assertion_message,
     format_truncated_traceback,
 )
@@ -42,6 +43,12 @@ class SiftPytestStepDrainError(RuntimeError):
 
 REPORT_CONTEXT: Any = None
 
+# Set at session end with the resolved (real) report id/URL when online and
+# uploaded. Read from a project's conftest in a later hook (e.g.
+# ``pytest_unconfigure``) to post the link, write a file, etc.
+SIFT_REPORT_ID_STASH_KEY = pytest.StashKey[str]()
+SIFT_REPORT_URL_STASH_KEY = pytest.StashKey[str]()
+
 _STASH_MISSING = object()
 
 _PARAMETRIZE_PATH_KEY = pytest.StashKey[Tuple[str, ...]]()
@@ -297,6 +304,33 @@ class _Option:
     "this ini value.",
 )
 
+_REPORT_URL_BASE = _Option(
+    cli_flag="--sift-report-url-base",
+    ini_name="sift_report_url_base",
+    cli_help="Sift web-app origin used to build the clickable report link in the "
+    "terminal footer (e.g. https://app.siftstack.com). Set this for on-prem or "
+    "custom deployments whose API host can't be mapped to a frontend "
+    "automatically. Also honored via the SIFT_APP_URL env var. When unset, the "
+    "link is derived from the REST URI for known Sift hosts.",
+    ini_help="Default for --sift-report-url-base. The Sift web-app origin used to "
+    "build the report link in the terminal footer. Also honored via the "
+    "SIFT_APP_URL env var. When unset, the link is derived from the REST URI for "
+    "known Sift hosts.",
+)
+
+_OPEN = _Option(
+    cli_flag="--sift-open-report",
+    ini_name="sift_open_report",
+    action="store_true",
+    cli_help="Open the resulting Sift test report in a browser at session end. "
+    "Online mode only; no-op when the report URL can't be resolved. Intended for "
+    "local development.",
+    ini_help="When true, open the report in a browser at session end (online only). "
+    "Defaults to false.",
+    ini_type="bool",
+    ini_default=False,
+)
+
 _AUTOUSE = _Option(
     ini_name="sift_autouse",
     ini_help="Default for the Sift autouse fixtures (report_context, step, "
@@ -350,6 +384,8 @@ class _Option:
     _DISABLED,
     _GRPC_URI,
     _REST_URI,
+    _REPORT_URL_BASE,
+    _OPEN,
     _AUTOUSE,
     _PACKAGE_STEP,
     _MODULE_STEP,
@@ -445,6 +481,305 @@ def _is_disabled(pytestconfig: pytest.Config | None) -> bool:
     return os.getenv("SIFT_DISABLED", "").lower() in ("1", "true", "yes")
 
 
+def _sdk_version() -> str:
+    """Look up the installed ``sift_stack_py`` version; ``"unknown"`` if absent."""
+    from importlib import metadata
+
+    try:
+        return metadata.version("sift_stack_py")
+    except metadata.PackageNotFoundError:
+        return "unknown"
+
+
+def _mode_label(config: pytest.Config) -> str:
+    """Name the active Sift mode; disabled outranks offline, which outranks online."""
+    if _is_disabled(config):
+        label = "disabled"
+    else:
+        label = "offline" if _is_offline(config) else "online"
+    return label
+
+
+def pytest_report_header(config: pytest.Config) -> str | None:
+    """Build the session-start header line: SDK version plus active mode.
+
+    Returns ``None`` (no header) when verbosity is negative, i.e. under ``-q``.
+    """
+    show_header = config.get_verbosity() >= 0
+    if not show_header:
+        return None
+    return f"Sift: sift-stack-py {_sdk_version()} — {_mode_label(config)} mode"
+
+
+def _resolve_real_report_id(context: Any) -> str | None:
+    """Return the server-side report id to link in the online footer, if any.
+
+    A non-simulated report (synchronous online mode, ``--sift-log-file=false``)
+    was created directly against the API, so its ``id_`` is returned as-is. A
+    simulated report carries a client-side id; the real id is looked up in the
+    ``id_map`` of the log file's tracking sidecar via ``LogTracking.load``. The
+    footer runs after the session-scoped report context has torn down, so the
+    sidecar is expected to be final by then.
+
+    Returns ``None`` when the report has no id, when a simulated report has no
+    log file, or when the sidecar holds no mapping for the simulated id (e.g.
+    the worker died before replaying the create).
+    """
+    report = context.report
+    raw_id = report.id_
+    if not raw_id:
+        return None
+    report_id = str(raw_id)
+    if not getattr(report, "is_simulated", False):
+        return report_id
+    log_file = getattr(context, "log_file", None)
+    if log_file is None:
+        return None
+    from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+
+    return LogTracking.load(log_file).id_map.get(report_id)
+
+
+_LABEL_WIDTH = 13
+
+
+def _sift_kv(terminalreporter: Any, label: str, value: str, **value_markup: bool) -> None:
+    """Emit one footer row: two-space indent, bold padded label, then the value.
+
+    ``value_markup`` (e.g. ``green=True``, ``cyan=True``) is forwarded to
+    ``write_line`` and so styles the value only.
+    """
+    padded_label = label.ljust(_LABEL_WIDTH)
+    terminalreporter.write("  ")
+    terminalreporter.write(padded_label, bold=True)
+    terminalreporter.write_line(value, **value_markup)
+
+
+# Step-count breakdown order and labels for the footer's "Steps" row.
+_STEP_COUNT_ORDER: tuple[tuple[TestStatus, str], ...] = (
+    (TestStatus.PASSED, "passed"),
+    (TestStatus.FAILED, "failed"),
+    (TestStatus.ERROR, "error"),
+    (TestStatus.ABORTED, "aborted"),
+    (TestStatus.SKIPPED, "skipped"),
+    (TestStatus.IN_PROGRESS, "in progress"),
+)
+
+
+# Per-status color for the footer's step breakdown: green pass, red
+# failure/error/abort, yellow skip; in-progress (and anything else) stays plain.
+_STEP_STATUS_MARKUP: dict[TestStatus, dict[str, bool]] = {
+    TestStatus.PASSED: {"green": True},
+    TestStatus.FAILED: {"red": True},
+    TestStatus.ERROR: {"red": True},
+    TestStatus.ABORTED: {"red": True},
+    TestStatus.SKIPPED: {"yellow": True},
+}
+
+
+def _step_count_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
+    """Return ``(text, markup)`` pairs, in display order, for statuses with a count."""
+    # Look each status up once, then keep only the non-zero tallies.
+    tallies = [(status, label, counts.get(status, 0)) for status, label in _STEP_COUNT_ORDER]
+    return [
+        (f"{n} {label}", _STEP_STATUS_MARKUP.get(status, {})) for status, label, n in tallies if n
+    ]
+
+
+def _measurement_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
+    """Return ``(text, markup)`` pairs for passed then failed measurements, non-zero only."""
+    outcomes = (
+        (True, "passed", "green"),
+        (False, "failed", "red"),
+    )
+    present = [(counts[key], word, color) for key, word, color in outcomes if counts.get(key, 0)]
+    return [(f"{tally} {word}", {color: True}) for tally, word, color in present]
+
+
+def _write_count_row(
+    terminalreporter: Any, label: str, segments: list[tuple[str, dict[str, bool]]]
+) -> None:
+    """Emit a ``label  a · b · c`` row, writing each segment with its own markup."""
+    terminalreporter.write("  ")
+    terminalreporter.write(label.ljust(_LABEL_WIDTH), bold=True)
+    for position, (text, markup) in enumerate(segments):
+        if position > 0:
+            terminalreporter.write(" · ")
+        terminalreporter.write(text, **markup)
+    terminalreporter.write_line("")
+
+
+def _report_panel_title(report: Any, terminalreporter: Any) -> str:
+    """Build the section-rule title ``Sift report · <name>``, capped to the terminal width.
+
+    Falls back to plain ``Sift report`` when the report has no name. A title
+    longer than the cap is cut and suffixed with an ellipsis so the separator
+    line does not wrap.
+    """
+    prefix = "Sift report"
+    report_name = getattr(report, "name", None)
+    if not report_name:
+        return prefix
+    writer = getattr(terminalreporter, "_tw", None)
+    # Leave 8 columns for the separator characters and spaces write_sep adds.
+    max_len = max(len(prefix), getattr(writer, "fullwidth", 80) - 8)
+    full_title = f"{prefix} · {report_name}"
+    if len(full_title) <= max_len:
+        return full_title
+    return full_title[: max_len - 1] + "…"
+
+
+def _maybe_open_report(url: str) -> None:
+    """Open ``url`` in a browser on a best-effort basis (``--sift-open-report``).
+
+    Does nothing when the ``CI`` environment variable is set or stdout is not a
+    TTY, so a committed ``sift_open_report`` setting can't spawn a browser on a
+    headless agent. Any exception raised while opening is swallowed.
+    """
+    import sys
+    import webbrowser
+
+    running_on_ci = bool(os.environ.get("CI"))
+    if running_on_ci or not sys.stdout.isatty():
+        return
+    try:
+        # webbrowser.open forks/execs the platform opener while gRPC background
+        # threads are live; fd 2 is redirected across the fork to hide gRPC's notice.
+        with _quiet_fork_stderr():
+            webbrowser.open(url)
+    except Exception:
+        # Headless / no browser available: opening is a convenience, never fatal.
+        pass
+
+
+def pytest_terminal_summary(terminalreporter: Any, exitstatus: int, config: pytest.Config) -> None:
+    """Emit a session-end Sift report summary, adapting per mode.
+
+    The printed panel is suppressed under ``-q``, but programmatic side effects
+    (stashing the report ref for ``conftest.py``, ``--sift-open-report``) still run so
+    other plugins and CI steps can consume the result. The panel shows the
+    outcome (green/red), step and measurement tallies, and a per-mode action: a
+    report link (online), the upload command (offline), or a disabled note.
+
+    Args:
+        terminalreporter: Reporter the panel is written to.
+        exitstatus: Session exit status; not used here.
+        config: Active pytest config (verbosity, options, and the stash).
+    """
+    quiet = config.get_verbosity() < 0
+
+    if _is_disabled(config):
+        if not quiet:
+            terminalreporter.write_sep("=", "Sift", cyan=True, bold=True)
+            terminalreporter.write_line("Sift disabled — no test report created.")
+        return
+
+    context = REPORT_CONTEXT
+    if context is None:
+        # No gated test ran, so no report context was created. Nothing to show.
+        return
+
+    log_file = getattr(context, "log_file", None)
+    offline = _is_offline(config)
+
+    # Resolve the report link first so stashing and --sift-open-report run even under
+    # -q (programmatic consumers don't care about verbosity). Truthiness, not
+    # ``is not None``: a resolved-but-empty id (degenerate sidecar mapping, unset
+    # proto field) must fall through to the "not uploaded" path, not produce a
+    # ``/test-results/`` link.
+    report_id = None if offline else _resolve_real_report_id(context)
+    app_url = context.client.app_url if report_id else None
+    report_url = f"{app_url}/test-results/{report_id}" if app_url else None
+    if report_id:
+        config.stash[SIFT_REPORT_ID_STASH_KEY] = report_id
+    if report_url is not None:
+        config.stash[SIFT_REPORT_URL_STASH_KEY] = report_url
+        if _option_or_ini(config, _OPEN):
+            _maybe_open_report(report_url)
+
+    if quiet:
+        return
+
+    failed = bool(getattr(context, "any_failures", False))
+    status_word, status_markup = (
+        ("FAILED", {"red": True, "bold": True})
+        if failed
+        else ("PASSED", {"green": True, "bold": True})
+    )
+    # Offline results live only in the local log until replayed, so the status
+    # row calls that out instead of repeating the version (already in the header).
+    status_context = (
+        f"{_mode_label(config)} · not uploaded"
+        if offline
+        else f"{_mode_label(config)} · sift-stack-py {_sdk_version()}"
+    )
+
+    report = context.report
+
+    terminalreporter.write_sep(
+        "=", _report_panel_title(report, terminalreporter), cyan=True, bold=True
+    )
+
+    # Identity row: the test case (test path or pytest invocation).
+    if report.test_case:
+        _sift_kv(terminalreporter, "Test case", str(report.test_case))
+
+    # Status row: colored outcome, then compact mode context.
+    terminalreporter.write("  ")
+    terminalreporter.write(f"{'Status':<{_LABEL_WIDTH}}", bold=True)
+    terminalreporter.write(status_word, **status_markup)
+    terminalreporter.write_line(f"      {status_context}")
+
+    # Step + measurement tallies (green pass, red failure, yellow skip).
+    _write_count_row(
+        terminalreporter,
+        "Steps",
+        _step_count_segments(context.step_status_counts) or [("no steps", {})],
+    )
+    measurement_segments = _measurement_segments(context.measurement_counts)
+    if measurement_segments:
+        _write_count_row(terminalreporter, "Measurements", measurement_segments)
+
+    # Provenance row: test system and operator.
+    system = " · ".join(part for part in (report.test_system_name, report.system_operator) if part)
+    if system:
+        _sift_kv(terminalreporter, "System", system)
+
+    # Local log file (write-through backup online, sole sink offline).
+    if log_file is not None:
+        _sift_kv(terminalreporter, "Log file", str(log_file))
+
+    if offline:
+        if log_file is not None:
+            terminalreporter.write_sep("-", "to upload to Sift")
+            terminalreporter.write_line(f"  >> import-test-result-log {log_file}", cyan=True)
+        return
+
+    if not report_id:
+        # No real report to link (the incremental worker never mapped it, or no id
+        # was assigned). Suggest a replay only when a local log exists to replay.
+        not_uploaded = "not uploaded"
+        if log_file is not None:
+            not_uploaded += f" — replay with: import-test-result-log {log_file}"
+        _sift_kv(terminalreporter, "Report", not_uploaded, yellow=True)
+    elif report_url is not None:
+        _sift_kv(terminalreporter, "Report", report_url, cyan=True)
+    else:
+        _sift_kv(
+            terminalreporter,
+            "Report",
+            f"id {report_id}  (set sift_report_url_base for a clickable link)",
+        )
+
+    if report_id and getattr(context, "replay_incomplete", False) and log_file is not None:
+        _sift_kv(
+            terminalreporter,
+            "",
+            f"may be incomplete — finish with: import-test-result-log {log_file}",
+            yellow=True,
+        )
+
+
 def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
     """Resolve the Sift gate for a node: sift_exclude > sift_include > default.
 
@@ -806,6 +1141,10 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
         )
     for env in missing:
         resolved[env] = _OFFLINE_DEFAULTS[env]
+    # Web-app origin for the report link: the sift_report_url_base CLI/ini option
+    # wins, then the SIFT_APP_URL env var, else host-based derivation in
+    # SiftClient.app_url.
+    report_url_base = _option_or_ini(pytestconfig, _REPORT_URL_BASE) or os.getenv("SIFT_APP_URL")
     # `or ""` is unreachable in practice since the `missing` check above guarantees
     # non-None values
     return SiftClient(
@@ -813,6 +1152,7 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
             api_key=resolved.get("SIFT_API_KEY") or "",
             grpc_url=resolved.get("SIFT_GRPC_URI") or "",
             rest_url=resolved.get("SIFT_REST_URI") or "",
+            app_url=report_url_base or None,
         )
     )
 
diff --git a/python/lib/sift_client/sift_types/test_report.py b/python/lib/sift_client/sift_types/test_report.py
index c4abfc548..dd786b02d 100644
--- a/python/lib/sift_client/sift_types/test_report.py
+++ b/python/lib/sift_client/sift_types/test_report.py
@@ -410,6 +410,38 @@ class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"], Simulat
     # Set by the low-level wrapper when this instance came from the simulate path
     _simulated: bool = False
 
+    def __str__(self) -> str:
+        """Human-readable form: ``[STATUS] name = value [unit] (bounds)``.
+
+        Used for failure messages, logs, and the REPL. The string omits whichever
+        parts aren't set (no unit, no bounds), and falls back to ``?`` if no
+        value type is populated. The status prefix reflects ``self.passed``.
+        """
+        status = "PASSED" if self.passed else "FAILED"
+        if self.numeric_value is not None:
+            value = f"{self.numeric_value}"
+            if self.unit:
+                value += f" {self.unit}"
+        elif self.string_value is not None:
+            value = repr(self.string_value)
+        elif self.boolean_value is not None:
+            value = str(self.boolean_value).lower()
+        else:
+            value = "?"
+        bounds = ""
+        nb = self.numeric_bounds
+        if nb is not None:
+            parts: list[str] = []
+            if nb.min is not None:
+                parts.append(f"min {nb.min}")
+            if nb.max is not None:
+                parts.append(f"max {nb.max}")
+            if parts:
+                bounds = f" ({', '.join(parts)})"
+        elif self.string_expected_value:
+            bounds = f" (expected {self.string_expected_value!r})"
+        return f"[{status}] {self.name} = {value}{bounds}"
+
     @classmethod
     def _from_proto(
         cls, proto: TestMeasurementProto, sift_client: SiftClient | None = None
diff --git a/python/lib/sift_client/transport/base_connection.py b/python/lib/sift_client/transport/base_connection.py
index 02f0e096e..6586412fe 100644
--- a/python/lib/sift_client/transport/base_connection.py
+++ b/python/lib/sift_client/transport/base_connection.py
@@ -24,6 +24,7 @@ def __init__(
         api_key: str,
         use_ssl: bool = True,
         cert_via_openssl: bool = False,
+        app_url: str | None = None,
     ):
         """Initialize the connection configuration.
 
@@ -33,12 +34,17 @@ def __init__(
             api_key: The API key for authentication.
             use_ssl: Whether to use SSL/TLS for secure connections.
             cert_via_openssl: Whether to use OpenSSL for certificate validation.
+            app_url: The Sift web-app origin (e.g. ``https://app.siftstack.com``).
+                Set this for on-prem or custom deployments whose API host can't be
+                mapped to a frontend automatically. When unset, the web-app URL is
+                derived from ``rest_url`` for known hosts.
         """
         self.api_key = api_key
         self.grpc_url = grpc_url
         self.rest_url = rest_url
         self.use_ssl = use_ssl
         self.cert_via_openssl = cert_via_openssl
+        self.app_url = app_url
 
     def get_grpc_config(self):
         """Create and return a GrpcConfig with the current settings.
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 48a89b2d9..41066b247 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -8,6 +8,7 @@
 import tempfile
 import traceback
 import warnings
+from collections import Counter
 from contextlib import AbstractContextManager, contextmanager
 from datetime import datetime, timezone
 from pathlib import Path
@@ -19,6 +20,7 @@
 from sift_client.sift_types.test_report import (
     ErrorInfo,
     NumericBounds,
+    TestMeasurement,
     TestMeasurementCreate,
     TestReport,
     TestReportCreate,
@@ -140,6 +142,19 @@ class ReportContext(AbstractContextManager):
     step_number_at_depth: dict[int, int]
     open_step_results: dict[str, bool]
     any_failures: bool
+    # Every step created in this report (including hierarchy/parametrize
+    # parents), retained after close so end-of-run summaries can tally final
+    # statuses. ``update`` mutates step instances in place, so these references
+    # reflect late status changes (e.g. a teardown-phase failure).
+    created_steps: list[TestStep]
+    # Every measurement recorded in this report, retained for end-of-run
+    # summaries. Appended in ``NewStep.measure``. A measurement's ``passed`` is
+    # fixed at creation, so the retained references stay accurate.
+    created_measurements: list[TestMeasurement]
+    # Set True in ``__exit__`` when the background replay worker timed out or
+    # exited non-zero, so callers (e.g. the pytest plugin footer) can flag that
+    # the uploaded report may be missing entries.
+    replay_incomplete: bool = False
     _import_proc: subprocess.Popen | None = None
     # Seconds to wait for the import worker subprocess to finish uploading
     # the JSONL backlog at session end before killing it. Tests substitute
@@ -184,6 +199,9 @@ def __init__(
         self.step_number_at_depth = {}
         self.open_step_results = {}
         self.any_failures = False
+        self.created_steps = []
+        self.created_measurements = []
+        self.replay_incomplete = False
 
         if log_file is True:
             tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False)
@@ -279,6 +297,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             except subprocess.TimeoutExpired:
                 self._import_proc.kill()
                 self._import_proc.wait()
+                self.replay_incomplete = True
                 warnings.warn(
                     f"Sift import worker did not exit in "
                     f"{self._import_proc_timeout}s; killing it. "
@@ -289,6 +308,7 @@ def __exit__(self, exc_type, exc_value, traceback):
                 log_replay_instructions(self.log_file)
                 return True  # Ensures the session is marked as passed in pytest
             if self._import_proc.returncode != 0:
+                self.replay_incomplete = True
                 stderr_text = (
                     stderr_bytes.decode("utf-8", errors="replace").strip() if stderr_bytes else ""
                 )
@@ -311,6 +331,23 @@ def is_simulated(self) -> bool:
         """
         return self.report.is_simulated
 
+    @property
+    def step_status_counts(self) -> Counter[TestStatus]:
+        """Tally of every created step by its current status.
+
+        Includes hierarchy/parametrize parent steps. Read at the end of a run for
+        summaries; reflects late status changes since steps are mutated in place.
+        """
+        return Counter(step.status for step in self.created_steps)
+
+    @property
+    def measurement_counts(self) -> Counter[bool]:
+        """Tally of recorded measurements keyed by ``passed`` (True/False).
+
+        Read at the end of a run for summaries.
+        """
+        return Counter(m.passed for m in self.created_measurements)
+
     def new_step(
         self,
         name: str,
@@ -378,6 +415,8 @@ def create_step(
         )
         self.step_stack.append(step)
         self.open_step_results[step.step_path] = True
+        # Retained for end-of-run tallies; never popped (unlike step_stack).
+        self.created_steps.append(step)
 
         return step
 
@@ -388,6 +427,10 @@ def record_step_outcome(self, outcome: bool, step: TestStep):
             self.open_step_results[step.step_path] = False
             self.any_failures = True
 
+    def record_measurement(self, measurement: TestMeasurement) -> None:
+        """Retain a recorded measurement for end-of-run summaries."""
+        self.created_measurements.append(measurement)
+
     def mark_step_failed_after_close(self, step: TestStep):
         """Mark a step's parent as failed after the step has already been popped from the stack.
 
@@ -466,6 +509,9 @@ def __init__(
         # substep / ``report_outcome`` failures are intentionally not folded
         # in here (see ``measurements_passed`` vs ``passed``).
         self._failed_measurement_count = 0
+        # Out-of-bounds measurements recorded on this step, retained so
+        # ``fail_if_measurements_failed`` can name them in the failure message.
+        self._failed_measurements: list[TestMeasurement] = []
 
     def __enter__(self):
         """Enter the context manager to create a new step.
@@ -487,9 +533,7 @@ def measurements_passed(self) -> bool:
         """
         return self._failed_measurement_count == 0
 
-    def fail_if_measurements_failed(
-        self, message: str = "one or more measurements out of bounds"
-    ) -> None:
+    def fail_if_measurements_failed(self, message: str = "measurements out of bounds") -> None:
         """Fail the pytest test if any measurement on this step was out of bounds.
 
         Use instead of ``assert step.measurements_passed``: it fails via
@@ -497,12 +541,18 @@ def fail_if_measurements_failed(
         assertion message to ``error_info``. No-op when every measurement
         passed. Call once at the end of the test so every measurement is still
         recorded before the failure fires.
+
+        The failure message names each out-of-bounds measurement with its
+        recorded value and bounds. ``message`` is used as the header line.
         """
         if self.measurements_passed:
             return
         import pytest
 
-        pytest.fail(message, pytrace=False)
+        failed = self._failed_measurements
+        header = f"{message} ({len(failed)}):" if failed else message
+        body = [f"  - {m}" for m in failed]
+        pytest.fail("\n".join([header, *body]), pytrace=False)
 
     def update_step_from_result(
         self,
@@ -662,8 +712,10 @@ def measure(
             create, log_file=self.report_context.log_file
         )
         self.report_context.record_step_outcome(measurement.passed, self.current_step)
+        self.report_context.record_measurement(measurement)
         if not measurement.passed:
             self._failed_measurement_count += 1
+            self._failed_measurements.append(measurement)
 
         return measurement.passed
 

From d5cc95201fb7ed5de1489a6090df349358f1760f Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Fri, 29 May 2026 14:08:44 -0700
Subject: [PATCH 12/19] version bump to 0.17.0.dev1

---
 python/pyproject.toml | 2 +-
 python/uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index a2cd6a410..2846fedba 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.17.0.dev0"
+version = "0.17.0.dev1"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }
diff --git a/python/uv.lock b/python/uv.lock
index 9ed71e17b..b8c439b1a 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -4315,7 +4315,7 @@ wheels = [
 
 [[package]]
 name = "sift-stack-py"
-version = "0.17.0.dev0"
+version = "0.17.0.dev1"
 source = { editable = "." }
 dependencies = [
     { name = "alive-progress", version = "3.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },

From e5f397c7ef029dbc26f20614fec4e26aad714fce Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 2 Jun 2026 12:04:30 -0700
Subject: [PATCH 13/19] Python(fix): pytest exit instead of raise on connection
 fail (#606)

---
 .../_tests/pytest_plugin/test_online.py       | 19 +++++++++++++------
 python/lib/sift_client/pytest_plugin.py       | 13 +++++++------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_online.py b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
index 876fffb0e..19a666d04 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_online.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
@@ -1,10 +1,10 @@
 """Tests for online mode (the default).
 
 Online mode requires connectivity to Sift. The plugin pings via
-``client_has_connection`` at session start and aborts with
-``pytest.UsageError`` on failure. Missing ``SIFT_API_KEY`` /
-``SIFT_GRPC_URI`` / ``SIFT_REST_URI`` env vars are reported as a usage error
-so the failure is actionable.
+``client_has_connection`` at session start and aborts via ``pytest.exit`` on
+failure, so the message prints once before any test runs. Missing
+``SIFT_API_KEY`` / ``SIFT_GRPC_URI`` / ``SIFT_REST_URI`` env vars are reported
+as a usage error so the failure is actionable.
 """
 
 from __future__ import annotations
@@ -23,7 +23,7 @@ def test_ping_failure_aborts(
         pytester: pytest.Pytester,
         clear_sift_env: None,
     ) -> None:
-        """Online mode with an unreachable ping aborts the session via UsageError."""
+        """Online mode with an unreachable ping aborts the session before any test runs."""
         pytester.makeconftest(
             """
             import pytest
@@ -46,12 +46,19 @@ def sift_client():
             @pytest.mark.sift_include
             def test_should_not_run():
                 assert True
+
+            @pytest.mark.sift_include
+            def test_should_not_run_either():
+                assert True
             """
         )
         result = pytester.runpytest_subprocess()
         assert result.ret != 0
         combined = "\n".join(result.outlines + result.errlines)
-        assert "Sift ping failed" in combined, combined
+        # ``pytest.exit`` stops on the first gated test's setup: the message
+        # appears once (not once per test) and nothing runs.
+        assert combined.count("Sift ping failed") == 1, combined
+        result.assert_outcomes()
 
     def test_missing_env_vars_named_in_error(
         self,
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index cf85b3abb..ed2d71fb6 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -1184,7 +1184,7 @@ def report_context(
       session end.
     * default (online): verify connectivity via ``client_has_connection``
       before constructing the context. A failed ping aborts the session
-      with ``pytest.UsageError`` and points at ``--sift-offline`` and
+      with ``pytest.exit`` and points at ``--sift-offline`` and
       ``--sift-disabled`` as escape hatches.
 
     The log-file destination is controlled by
@@ -1204,11 +1204,12 @@ def report_context(
         except Exception as exc:
             grpc_config = getattr(getattr(sift_client, "grpc_client", None), "_config", None)
             grpc_url = getattr(grpc_config, "uri", "<unknown>")
-            raise pytest.UsageError(
+            pytest.exit(
                 f"Sift ping failed against {grpc_url}: {exc}. "
                 "Pass --sift-offline to run without contacting Sift, or "
-                "--sift-disabled to skip Sift entirely."
-            ) from exc
+                "--sift-disabled to skip Sift entirely.",
+                returncode=4,
+            )
     yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
 
 
@@ -1413,8 +1414,8 @@ def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRe
     """Verify the ``SiftClient`` can reach Sift via ``/ping``.
 
     Consulted at session start by ``report_context`` in online mode. A failed
-    ping raises through ``report_context`` and aborts the session with
-    ``pytest.UsageError``. Override this fixture in your conftest to use a
+    ping aborts the session via ``pytest.exit``. Override this fixture in your
+    conftest to use a
     different reachability signal (e.g. a cached auth token) for environments
     where pinging is the wrong check. Returns ``False`` in ``--sift-disabled``
     mode without constructing a client.

From eb8c32bfd4e9ba397c34d4e5d0c76fbf70d0ba7f Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 2 Jun 2026 12:15:06 -0700
Subject: [PATCH 14/19] Python(feat): flexible pytest naming and cleaned up
 options (#602)

---
 python/CHANGELOG.md                           |   4 +-
 .../docs/examples/pytest_plugin_quickstart.md |  24 +-
 .../guides/pytest_plugin/configuration.md     | 262 ++++--
 python/docs/guides/pytest_plugin/index.md     |  16 +-
 .../guides/pytest_plugin/running_modes.md     |  13 +-
 python/examples/pytest_plugin/README.md       |  15 +-
 python/examples/pytest_plugin/conftest.py     |  16 +-
 python/examples/pytest_plugin/pyproject.toml  |  33 +
 python/examples/pytest_plugin/pytest.ini      |  11 -
 .../tests/with_sift/test_with_sift_demo.py    |   8 +-
 .../sift_client/_internal/pyproject_config.py |  84 ++
 .../_tests/pytest_plugin/test_disabled.py     |  14 -
 .../pytest_plugin/test_report_fields.py       | 272 ++++++
 .../_tests/pytest_plugin/test_report_name.py  | 120 +++
 .../pytest_plugin/test_settings_reference.py  |  39 +
 .../pytest_plugin/test_typo_detector.py       | 113 +++
 python/lib/sift_client/pytest_plugin.py       | 877 ++++++++++++++----
 .../sift_types/_mixins/metadata.py            |  19 +
 python/lib/sift_client/sift_types/asset.py    |   2 +
 python/lib/sift_client/sift_types/report.py   |   2 +
 python/lib/sift_client/sift_types/run.py      |   2 +
 .../lib/sift_client/sift_types/test_report.py |   6 +
 .../util/test_results/context_manager.py      |  16 +-
 python/pyproject.toml                         |   1 +
 python/uv.lock                                |   2 +
 25 files changed, 1639 insertions(+), 332 deletions(-)
 create mode 100644 python/examples/pytest_plugin/pyproject.toml
 delete mode 100644 python/examples/pytest_plugin/pytest.ini
 create mode 100644 python/lib/sift_client/_internal/pyproject_config.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_report_name.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
 create mode 100644 python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
 create mode 100644 python/lib/sift_client/sift_types/_mixins/metadata.py

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index 1b8c43a93..4905ae0d7 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -16,7 +16,8 @@ Highlights:
 - **Pass/fail mapping.** Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard exit) maps to a `TestStatus` and propagates to parent steps and the report. `step.measure(...)` returns a pass/fail boolean without raising, so all measurements land in the report even when one fails; `step.fail_if_measurements_failed()` fails the test at the end without adding assertion noise to `error_info`.
 - **Assertion messages as error info.** Assertion failure messages are reported as the step's error info.
 - **Git metadata.** Repo, branch, and commit are captured on the report automatically.
-- **Terminal output.** The plugin prints a session header with the SDK version and active mode, and an end-of-run `Sift report` panel showing the test case, outcome, step and measurement breakdowns (color-coded), test system/operator, plus a link to the report (online), the saved log and upload command (offline), or a disabled note. Both suppress under `-q`. `SiftClient.app_url` exposes the web-app origin; set `sift_report_url_base` for on-prem or custom deployments. `--sift-open-report` opens the report in a browser at session end.
+- **Terminal output.** The plugin prints a session header with the SDK version and active mode, and an end-of-run `Sift report` panel showing the test case, outcome, step and measurement breakdowns (color-coded), test system/operator, plus a link to the report (online), the saved log and upload command (offline), or a disabled note. Both suppress under `-q`. `SiftClient.app_url` exposes the web-app origin; set `sift_app_url` for on-prem or custom deployments. `--sift-open-report` opens the report in a browser at session end.
+- **Configurable report content via `[tool.sift.pytest.report]` and `SIFT_REPORT_*` env vars.** Static defaults (`name`, `test_case`, `test_system_name`, `system_operator`, `serial_number`, `part_number`, and `metadata`) live under `[tool.sift.pytest.report]` in `pyproject.toml`. `name` and `test_case` accept the `{target}`, `{command}`, `{args}`, `{rootdir}`, `{timestamp}`, `{count}`, `{git_repo}`, `{git_branch}`, `{git_commit}` placeholders. `[tool.sift.pytest.report.metadata]` is a TOML table whose typed values land on the report's metadata alongside git fields and the auto-recorded `pytest_command`. For dynamic per-run injection (CI, hardware-bench unit cycling), set `SIFT_REPORT_TEST_SYSTEM_NAME` / `_SYSTEM_OPERATOR` / `_SERIAL_NUMBER` / `_PART_NUMBER` env vars, which pytest-dotenv loads from `.env` for local dev. Env entries win over TOML.
 
 See the [Pytest Plugin guide](https://github.com/sift-stack/sift/blob/main/python/docs/guides/pytest_plugin/index.md) and the runnable quickstart example for full configuration.
 
@@ -27,6 +28,7 @@ See the [Pytest Plugin guide](https://github.com/sift-stack/sift/blob/main/pytho
 - [Pass/fail behavior improvements](https://github.com/sift-stack/sift/pull/568)
 - [Report assertion message as error info](https://github.com/sift-stack/sift/pull/587)
 - [Pytest docs reorganization](https://github.com/sift-stack/sift/pull/589)
+- [Configurable report name template and preserved pytest command](https://github.com/sift-stack/sift/pull/591)
 
 ## [v0.16.2] - May 21, 2026
 
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
index b30f282c6..30012f9b4 100644
--- a/python/docs/examples/pytest_plugin_quickstart.md
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -16,7 +16,7 @@ For a conceptual reference (fixtures, ini flags, status semantics), see the
 ```
 examples/pytest_plugin/
 ├── conftest.py                            # registers the plugin
-├── pytest.ini                             # available ini knobs (all commented at defaults)
+├── pyproject.toml                         # pytest knobs + report name/test_case/metadata
 ├── .env.example                           # credential template
 └── tests/
     ├── pytest_only/                       # subpackage step
@@ -32,21 +32,25 @@ above each test becomes its own parent step in the report tree.
 
 ## `conftest.py`
 
-A single `pytest_plugins` declaration loads the plugin; `load_dotenv()` is
-optional and just lets the default `sift_client` fixture pick up
-`SIFT_API_KEY` / `SIFT_GRPC_URI` / `SIFT_REST_URI` from a local `.env`.
+A single `pytest_plugins` declaration loads the plugin. The default
+`sift_client` fixture reads `SIFT_API_KEY` / `SIFT_GRPC_URI` / `SIFT_REST_URI`
+from the environment — set them in your shell, your CI secret store, or a
+local `.env` (`pip install pytest-dotenv` auto-loads it).
 
 ```python title="conftest.py"
 --8<-- "examples/pytest_plugin/conftest.py"
 ```
 
-## `pytest.ini`
+## `pyproject.toml`
 
-Every knob is commented at its default value. Uncomment any line to opt out of
-a layer of the step tree.
+Pytest behavior knobs sit under `[tool.pytest.ini_options]`, each commented at
+its default — uncomment any line to opt out of a layer of the step tree. The
+report's display `name`, `test_case`, and free-form `metadata` are set under
+`[tool.sift.pytest.report]`; `name` and `test_case` accept template
+placeholders.
 
-```ini title="pytest.ini"
---8<-- "examples/pytest_plugin/pytest.ini"
+```toml title="pyproject.toml"
+--8<-- "examples/pytest_plugin/pyproject.toml"
 ```
 
 ## `.env.example`
@@ -168,7 +172,7 @@ skip every measurement that follows. Expected
 pytest output is `16 passed, 3 failed, 1 skipped`.
 
 Flip any of the `sift_*_step` / `sift_parametrize_nesting` flags in
-`pytest.ini` to `false` to collapse a layer.
+`pyproject.toml` to `false` to collapse a layer.
 
 ## Next steps
 
diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
index 3b3151111..7c7114543 100644
--- a/python/docs/guides/pytest_plugin/configuration.md
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -26,40 +26,45 @@ The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your
 Sift organization. You can find these on the Sift Manage page as well as
 generate an API key.
 
-The default `sift_client` fixture reads its two URIs from environment first and
-falls back to ini keys when the env vars are unset. `SIFT_API_KEY` is
-intentionally env-only, so keep it out of source control and supply it through
-`pytest-dotenv` (see [API key handling](#api-key-handling) below). The env var
-wins when both are set, so secrets injected into a CI environment continue to
-override values committed to `pyproject.toml`. There are no CLI flags for
+The default `sift_client` fixture reads its two URIs from the environment
+first, then from the `sift_grpc_uri` / `sift_rest_uri` ini keys.
+`SIFT_API_KEY` is intentionally env-only, so keep it out of source control (see
+[API key handling](#api-key-handling) below). There are no CLI flags for
 credentials.
 
-| Ini key | Environment variable | Notes |
+| Setting | Where | Notes |
 |---|---|---|
-| _(none)_ | `SIFT_API_KEY` | Env-only. Use `.env` + `pytest-dotenv` locally; inject from your secret store in CI. |
-| `sift_grpc_uri` | `SIFT_GRPC_URI` | Stable per-org gRPC endpoint; safe to commit. |
-| `sift_rest_uri` | `SIFT_REST_URI` | Stable per-org REST endpoint; safe to commit. |
+| `SIFT_API_KEY` | env var only | Inject from your secret store in CI; for local dev use a `.env` (see below). Never read from a committed file. |
+| `SIFT_GRPC_URI` | env > `sift_grpc_uri` ini | Stable per-org gRPC endpoint; safe to commit. |
+| `SIFT_REST_URI` | env > `sift_rest_uri` ini | Stable per-org REST endpoint; safe to commit. |
 
 ### API key handling
 
-`SIFT_API_KEY` is deliberately read from the process environment only. The
-recommended workflow uses the
-[`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) plugin (already a
-dependency of `sift-stack-py`), which loads variables from a `.env` file into
-`os.environ` before tests run.
+`SIFT_API_KEY` is read from the process environment only — the plugin never
+reads it from a committed file. How you get it into the environment is up to
+you:
 
-1. Add `.env` to `.gitignore`.
-2. Drop your key into `.env` at the project root:
+- **CI:** set `SIFT_API_KEY` directly via your provider's secret manager.
+- **Local dev:** keep the values in a `.env` (gitignored) and let
+  [`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) load them — it is
+  not bundled with `sift-stack-py`, so install it explicitly:
+
+    ```bash
+    pip install pytest-dotenv
+    ```
 
     ```bash title=".env"
     SIFT_API_KEY=sk-...your-key...
+    SIFT_GRPC_URI=your-org.grpc.example.com
+    SIFT_REST_URI=https://your-org.rest.example.com
     ```
 
-3. In CI, set `SIFT_API_KEY` directly via your provider's secret manager
-   instead of committing a `.env` file.
+    Once installed, pytest-dotenv auto-loads `.env` from the rootdir before
+    tests run — no `conftest.py` glue and no `load_dotenv()` call. (Point it at
+    a different file with the `env_files` ini key if you prefer.)
 
-`pytest-dotenv` picks the file up automatically; no `pytest_configure` glue is
-needed.
+Prefer real environment variables (shell exports, CI secrets) for anything you
+can't keep in a local file.
 
 !!! warning "FedRAMP / shared environments"
     Pass `--sift-log-file=false` (or set the ini key to `"false"`) to skip the
@@ -73,10 +78,6 @@ that's required. The plugin ships a default `sift_client` fixture that reads
 `SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
 
 ```python title="conftest.py"
-from dotenv import load_dotenv
-
-load_dotenv()
-
 pytest_plugins = ["sift_client.pytest_plugin"]
 ```
 
@@ -93,12 +94,9 @@ plugin's default falls away in favor of your definition.
 import os
 
 import pytest
-from dotenv import load_dotenv
 
 from sift_client import SiftClient, SiftConnectionConfig
 
-load_dotenv()
-
 pytest_plugins = ["sift_client.pytest_plugin"]
 
 
@@ -120,47 +118,68 @@ def sift_client() -> SiftClient:
 |---|---|---|---|
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
 | `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `fail_if_measurements_failed`, and `current_step`. |
-| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently; see [ini options](#ini-options). |
+| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently; see [settings reference](#settings-reference). |
 | `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
 | `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
 
-## CLI options
+## Settings reference
 
-| Flag | Default | Effect |
-|---|---|---|
-| `--sift-offline` | off (online) | Skip the session-start ping and don't contact Sift. All create/update calls go to the JSONL log file for later replay via `import-test-result-log`. Missing `SIFT_*` env vars are tolerated; placeholders are filled. |
-| `--sift-disabled` | off | Skip Sift entirely. Nothing contacts the API and no log file is written; `step.measure(...)` still evaluates bounds and returns a real pass/fail boolean. Also honored via `SIFT_DISABLED=1`. Supersedes every other flag (disabled wins over offline). |
-| `--sift-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. Incompatible with `--sift-offline` since offline mode needs the log file as its sole sink. |
-| `--no-sift-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
-| `--sift-report-url-base=<origin>` | derived from REST URI | Web-app origin used to build the clickable report link in the terminal footer (e.g. `https://app.siftstack.com`). Set this for on-prem or custom deployments whose API host can't be mapped to a frontend automatically. Also honored via the `SIFT_APP_URL` environment variable. When unset, the link is derived from the REST URI for known Sift hosts. |
-| `--sift-open-report` | off | Open the resulting report in a browser at session end. Online mode only; a no-op when the report URL can't be resolved. Intended for local development. |
+Every setting the plugin reads, grouped by the three config kinds. Within a
+group, a `—` means the setting can't be set from that surface.
 
-These can be passed permanently via `addopts`:
+Each kind has a home chosen for a specific workflow:
 
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-offline
-```
+- **Pytest behavior** lives in `[tool.pytest.ini_options]` (log/offline/disabled/git/`*_step`/autouse/parametrize). A CLI flag exists for the ones with a real ad-hoc override workflow.
+- **Connection** comes from the environment first, falling back to the ini keys; the API key is env-only so secrets stay out of committed files.
+- **Report content** takes static defaults from `[tool.sift.pytest.report]` and per-run dynamic values from `SIFT_REPORT_*` env vars (CI builds, hardware cycling, anything `.env`-driven; pytest-dotenv loads `.env` for local dev).
+
+**Precedence within a setting:** env > CLI flag > ini key > TOML > built-in
+default. No setting exposes both env and CLI, so the chain isn't ambiguous in
+practice.
+
+The plugin scans `SIFT_*` env vars and `[tool.sift.pytest.*]` keys at session
+start; anything outside these tables fires a warning with a closest-match
+suggestion, so typos like `SIFT_REPORT_SERIALNUM` surface immediately.
 
-## Ini options
+<!-- BEGIN settings-reference (auto-generated from _OPTIONS in pytest_plugin.py; regenerate via test_settings_reference_docs_in_sync) -->
+### Pytest behavior
 
-Set the matching ini key directly (recommended for stable per-project
-configuration). Each CLI flag has a corresponding key under
-`[tool.pytest.ini_options]` in `pyproject.toml` or `[pytest]` in `pytest.ini`.
-CLI flags, when passed, override the ini values.
+| Setting | CLI flag | Ini (`[tool.pytest.ini_options]`) |
+|---|---|---|
+| Path to the JSONL log of create/update calls (path \| true \| false \| none). | `--sift-log-file` | `sift_log_file` |
+| Capture git repo/branch/commit on the report. | `--no-sift-git-metadata` | `sift_git_metadata` |
+| Skip the session-start ping; route create/update through the JSONL log. | `--sift-offline` | `sift_offline` |
+| Disable Sift entirely (no API calls, no log file). Supersedes --sift-offline. | `--sift-disabled` | `sift_disabled` |
+| Open the resulting report in a browser at session end (online only; no-op when the report URL can't be resolved). | `--sift-open-report` | `sift_open_report` |
+| Default for the Sift autouse fixtures (report_context, step, hierarchy/parametrize parents). | — | `sift_autouse` |
+| Open a parent step for each Python package in the test path. | — | `sift_package_step` |
+| Open a parent step for each test module. | — | `sift_module_step` |
+| Open per-class parent steps, including nested classes. | — | `sift_class_step` |
+| Cluster parametrized tests under shared parent steps (e.g. test_a -> v=1, v=2). | — | `sift_parametrize_nesting` |
+
+### Connection
+
+| Setting | Ini (`[tool.pytest.ini_options]`) | Env var |
+|---|---|---|
+| Sift API key (secret, env-only). | — | `SIFT_API_KEY` |
+| Sift gRPC endpoint URI. | `sift_grpc_uri` | `SIFT_GRPC_URI` |
+| Sift REST endpoint URI. | `sift_rest_uri` | `SIFT_REST_URI` |
+| Sift web-app origin for the report link in the terminal footer (e.g. https://app.siftstack.com). When unset, the link is derived from the REST URI for known Sift hosts. | `sift_app_url` | `SIFT_APP_URL` |
+
+### Report content
 
-| Ini key | Type | Equivalent CLI flag |
+| Setting | TOML (`[tool.sift...]`) | Env var |
 |---|---|---|
-| `sift_log_file` | string (`true` / `false` / `none` / path) | `--sift-log-file=<value>` |
-| `sift_git_metadata` | bool (default `true`) | `--no-sift-git-metadata` (sets to `false`) |
-| `sift_offline` | bool (default `false`) | `--sift-offline` |
-| `sift_disabled` | bool (default `false`) | `--sift-disabled` (also honors `SIFT_DISABLED` env var) |
-| `sift_autouse` | bool (default `true`) | _(no CLI flag; controls the marker gate below)_ |
-| `sift_package_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each Python package (directory with `__init__.py`) in the test path. |
-| `sift_module_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each test module (file). |
-| `sift_class_step` | bool (default `true`) | _(ini-only)_. Opens a parent step for each test class, including nested classes. |
-| `sift_parametrize_nesting` | bool (default `true`) | _(ini-only)_. Clusters parametrized tests under shared parents (`test_x`, `axis=value`) instead of flat leaves (`test_x[value]`). |
-| `sift_open_report` | bool (default `false`) | `--sift-open-report` |
+| Template for the report display name. Placeholders: {target}, {command}, {args}, {rootdir}, {timestamp}, {count}, {git_repo}, {git_branch}, {git_commit}. | `[tool.sift.pytest.report] name` | — |
+| Template for the report's test_case field (same placeholders as report_name). | `[tool.sift.pytest.report] test_case` | — |
+| Name of the test system / rig. Defaults to the host's name. | `[tool.sift.pytest.report] test_system_name` | `SIFT_REPORT_TEST_SYSTEM_NAME` |
+| Operator running the test. Defaults to the OS user. | `[tool.sift.pytest.report] system_operator` | `SIFT_REPORT_SYSTEM_OPERATOR` |
+| Serial number of the unit under test. | `[tool.sift.pytest.report] serial_number` | `SIFT_REPORT_SERIAL_NUMBER` |
+| Part number of the unit under test. | `[tool.sift.pytest.report] part_number` | `SIFT_REPORT_PART_NUMBER` |
+| Free-form report metadata, as a TOML table of scalar values. For dynamic per-run keys, attach them in conftest via the report_context fixture. | `[tool.sift.pytest.report.metadata]` (table) | — |
+<!-- END settings-reference -->
+
+### Quick-start examples
 
 ```toml title="pyproject.toml"
 [tool.pytest.ini_options]
@@ -168,9 +187,22 @@ sift_offline = true
 sift_git_metadata = false
 sift_grpc_uri = "your-org.sift.example:443"
 sift_rest_uri = "https://your-org.sift.example"
+
+[tool.sift.pytest.report]
+name = "{rootdir} ({count} tests) {timestamp}"
+test_system_name = "rig-7"
+
+[tool.sift.pytest.report.metadata]
+build_id = "v1.2.3"
 ```
 
-```ini title="pytest.ini"
+```bash title="CI env (set by your runner)"
+SIFT_API_KEY=...                    # from a secret manager
+SIFT_REPORT_SYSTEM_OPERATOR=ci-bot
+SIFT_REPORT_SERIAL_NUMBER=$UNIT_SN  # cycles per matrix job
+```
+
+```ini title="pytest.ini (alternative — pytest-execution flags only)"
 [pytest]
 sift_offline = true
 sift_git_metadata = false
@@ -178,6 +210,116 @@ sift_grpc_uri = your-org.sift.example:443
 sift_rest_uri = https://your-org.sift.example
 ```
 
+CLI flags can be made permanent via `addopts`:
+
+```ini title="pytest.ini"
+[pytest]
+addopts = --sift-offline
+```
+
+## Report content in depth
+
+The [settings reference](#settings-reference) above maps each report-content
+field to its `[tool.sift.pytest.report]` key and `SIFT_REPORT_*` env var. This
+section covers the two template fields and the metadata table in more detail.
+
+```toml title="pyproject.toml — static project defaults"
+[tool.sift.pytest.report]
+name             = "{rootdir} {git_branch} ({count} tests) {timestamp}"
+test_case        = "{rootdir}-{git_branch}"
+test_system_name = "rig-7"
+system_operator  = "ci-bot"
+serial_number    = "SN-001"
+part_number      = "PN-9000"
+```
+
+```bash title="Per-run overrides — CI or hardware-bench shell"
+SIFT_REPORT_SERIAL_NUMBER=$UNIT_SN \
+SIFT_REPORT_SYSTEM_OPERATOR=$CI_ACTOR \
+pytest tests/
+```
+
+### `name` vs `test_case`
+
+The two fields look similar but serve opposite purposes:
+
+- **`name`** is the report's **per-run display label** — what you see in the
+  Test Results list. It should be unique per run, which is why its default ends
+  in `{timestamp}`.
+- **`test_case`** is the **cross-run grouping key** — reports that share a
+  `test_case` are treated as runs of the *same* case, so Sift can track its
+  pass/fail history over time. It should be stable across runs, which is why
+  its default has **no** timestamp.
+
+By default both derive from the same `{target}` (what ran), and the timestamp
+is the only difference: `name` = `{target} {timestamp}` (distinct each run),
+`test_case` = `{target}` (identical across runs of the same target, so they
+group together). Set either explicitly to override — a static `test_case` like
+`"{rootdir}"` is common when you want every run of a project to group under one
+case regardless of which subset ran.
+
+### Templates for `name` and `test_case`
+
+`name` and `test_case` accept the same f-string-style placeholders:
+
+| Placeholder | Value |
+|---|---|
+| `{target}` | What ran, derived from the collected tests (not the command line) and anchored to the project name: `project/tests/test_x.py::test_y` for a single test (the `[param]` suffix is stripped), `project/tests/test_x.py` for a single file, `project/tests/motor` for several files' common directory, or just `project` for a whole-suite run. |
+| `{command}` | The full pytest invocation, e.g. `pytest tests/ -k smoke`. |
+| `{args}` | The invocation arguments without the leading `pytest`. |
+| `{rootdir}` | The pytest rootdir name (typically the project directory). |
+| `{timestamp}` | The report start time in ISO 8601 (UTC). |
+| `{count}` | The number of collected tests in the run. |
+| `{git_repo}` | The `origin` remote URL, or empty when not in a git repo. |
+| `{git_branch}` | The current branch, or empty when not in a git repo. |
+| `{git_commit}` | The current commit (`git describe --always --dirty`), or empty when not in a git repo. |
+
+**Defaults when unset.** Because `{target}` is derived from the collected
+tests, the defaults reflect what actually ran and don't change with flag order
+or `-k` / `-m` filters:
+
+(`<project>` below is the rootdir directory name.)
+
+| Invocation | default `name` | default `test_case` |
+|---|---|---|
+| `pytest tests/test_motor.py::test_spin[12V]` | `<project>/tests/test_motor.py::test_spin 2026-...` | `<project>/tests/test_motor.py::test_spin` |
+| `pytest -v tests/test_motor.py` | `<project>/tests/test_motor.py 2026-...` | `<project>/tests/test_motor.py` |
+| `pytest -k motor` (hits `tests/motor/`) | `<project>/tests/motor 2026-...` | `<project>/tests/motor` |
+| `pytest` (whole suite) | `<project> 2026-...` | `<project>` |
+
+The git placeholders are resolved independently of `--no-sift-git-metadata`
+(which only controls whether git values are stored on the report metadata) and
+render empty outside a git checkout. An unknown placeholder is reported as a
+warning and the value falls back to the default rather than failing the run.
+
+Regardless of the name, the full pytest command is always preserved on the
+report's metadata under the `pytest_command` key, so the exact invocation stays
+queryable and viewable in the report detail.
+
+### Report metadata
+
+`[tool.sift.pytest.report.metadata]` is a TOML table whose typed values land
+on the report's metadata alongside the git fields and the auto-recorded
+`pytest_command`. Use it for build IDs, fixture identifiers, shift labels,
+and any key/value data not otherwise modeled.
+
+```toml title="pyproject.toml — static metadata defaults"
+[tool.sift.pytest.report.metadata]
+build_id = "v1.2.3"
+fixture  = "PSU-A"
+shift    = "night"
+lane     = 2          # ints, floats, and bools come through with their TOML type
+verbose  = true
+```
+
+For per-run dynamic entries (CI build IDs, cycling serial numbers), attach them
+in your `conftest.py` through the `report_context` fixture rather than the TOML
+table.
+
+Nested tables, lists, and `null` values in
+`[tool.sift.pytest.report.metadata]` are skipped with a warning since the
+report's metadata is a flat `dict[str, str | float | bool]`.
+
 ## Controlling which tests produce reports
 
 By default every test in the session produces a Sift step. Two markers and one
diff --git a/python/docs/guides/pytest_plugin/index.md b/python/docs/guides/pytest_plugin/index.md
index 9344885b3..a649204a4 100644
--- a/python/docs/guides/pytest_plugin/index.md
+++ b/python/docs/guides/pytest_plugin/index.md
@@ -10,27 +10,27 @@ the report itself.
 Install the client and pytest:
 
 ```bash
-pip install sift-stack-py pytest python-dotenv
+pip install sift-stack-py pytest
 ```
 
-Set your connection details in a `.env` next to your tests:
+The default `sift_client` fixture reads its connection details from the
+environment:
 
-```bash title=".env"
+```bash
 SIFT_API_KEY="..."
 SIFT_GRPC_URI="..."
 SIFT_REST_URI="..."
 ```
 
-Find these on the Sift Manage page, where you can also generate an API key.
+Find these on the Sift Manage page, where you can also generate an API key. Set
+them in your shell or CI secret store. For local dev, run
+`pip install pytest-dotenv` and put the same values in a `.env` at your project
+root — pytest-dotenv then loads them automatically, no code required.
 
 Register the plugin with a single `pytest_plugins` declaration in your top-level
 `conftest.py`:
 
 ```python title="conftest.py"
-from dotenv import load_dotenv
-
-load_dotenv()
-
 pytest_plugins = ["sift_client.pytest_plugin"]
 ```
 
diff --git a/python/docs/guides/pytest_plugin/running_modes.md b/python/docs/guides/pytest_plugin/running_modes.md
index 9289428e4..6c5ab05be 100644
--- a/python/docs/guides/pytest_plugin/running_modes.md
+++ b/python/docs/guides/pytest_plugin/running_modes.md
@@ -43,8 +43,8 @@ operator.
 
 **Online** shows the report metadata, step and measurement breakdowns, and a
 clickable link. The web host is derived from the REST URI for known Sift hosts;
-for on-prem or custom deployments set `--sift-report-url-base`
-(ini: `sift_report_url_base`, env: `SIFT_APP_URL`). Add `--sift-open-report` to
+for on-prem or custom deployments set `sift_app_url`
+(ini) or the `SIFT_APP_URL` env var. Add `--sift-open-report` to
 open the report in a browser at session end.
 
 ```text
@@ -191,16 +191,15 @@ and tests can branch on provenance. Offline-mode entities also report
 How to turn it on, in the order most projects pick:
 
 ```bash
-# In an .envrc, devcontainer, or CI job config
-export SIFT_DISABLED=1
-
 # Per-invocation kill-switch
 pytest --sift-disabled
+```
 
+```toml
 # Per-project default (uncommon; online is usually the right default)
 # pyproject.toml:
-#   [tool.pytest.ini_options]
-#   sift_disabled = true
+[tool.pytest.ini_options]
+sift_disabled = true
 ```
 
 Good fit for local dev without Sift credentials. Also for library consumers who
diff --git a/python/examples/pytest_plugin/README.md b/python/examples/pytest_plugin/README.md
index 6eeaf9a34..0a94b7f97 100644
--- a/python/examples/pytest_plugin/README.md
+++ b/python/examples/pytest_plugin/README.md
@@ -8,7 +8,7 @@ numeric / string / bool bounds, gate markers, and the ini opt-outs.
 ```
 examples/pytest_plugin/
 ├── conftest.py                            # registers the plugin
-├── pytest.ini                             # available ini knobs (all commented at defaults)
+├── pyproject.toml                         # pytest knobs + report name/test_case/metadata
 ├── .env.example                           # credential template (copy to .env for local runs)
 └── tests/
     ├── pytest_only/                       # subpackage step: `pytest_only` opens a parent step
@@ -24,13 +24,14 @@ Every layer of organization shows up in the report tree: Python packages
 (directories with `__init__.py`), modules (test files), classes (including
 nested classes), and parametrize axes each open a parent step. Flip
 `sift_package_step`, `sift_module_step`, `sift_class_step`, or
-`sift_parametrize_nesting` to `false` in `pytest.ini` to disable this behavior.
+`sift_parametrize_nesting` to `false` in `pyproject.toml` to disable this behavior.
 
 ## Run it
 
 **Against a real Sift org**:
 
 ```bash
+pip install pytest-dotenv        # auto-loads .env; or export the vars yourself
 cp .env.example .env
 # Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
 pytest -v
@@ -48,8 +49,8 @@ import-test-result-log /tmp/sift-demo.jsonl
 
 ## What the report tree looks like
 
-With the plugin's defaults (everything in `pytest.ini` left commented), running
-this demo produces a tree like:
+With the plugin's defaults (the `[tool.pytest.ini_options]` knobs left
+commented), running this demo produces a tree like:
 
 ```
 TestReport (FAILED, since failures propagate up from leaves)
@@ -107,14 +108,14 @@ skip every measurement that follows. Expected
 pytest output is `16 passed, 3 failed, 1 skipped`.
 
 Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
-`pytest.ini` to `false` to collapse a layer.
+`pyproject.toml` to `false` to collapse a layer.
 
 ## What each file demonstrates
 
 | File | Feature |
 |---|---|
-| `conftest.py` | Plugin registration via `pytest_plugins`; optional `load_dotenv()` |
-| `pytest.ini` | The four nesting flags + git metadata flag at their defaults |
+| `conftest.py` | Plugin registration via `pytest_plugins` (a single line) |
+| `pyproject.toml` | Pytest nesting/git-metadata knobs at their defaults; report `name`, `test_case`, and `metadata` under `[tool.sift.pytest.report]` |
 | `tests/pytest_only/test_pytest_only_demo.py` | Plain pytest tests with no Sift APIs. The plugin captures pass/fail automatically; covers functions, fixtures, parametrize, classes, plus one each of `AssertionError` (FAILED), `pytest.skip` (SKIPPED), and a raised `ValueError` (ERROR) |
 | `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `step.fail_if_measurements_failed()` end-of-test call that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
 | `tests/{pytest_only,with_sift}/__init__.py` | Each Python package (directory with `__init__.py`) becomes a parent step in the report tree |
diff --git a/python/examples/pytest_plugin/conftest.py b/python/examples/pytest_plugin/conftest.py
index 88253bd73..b019aef1d 100644
--- a/python/examples/pytest_plugin/conftest.py
+++ b/python/examples/pytest_plugin/conftest.py
@@ -1,15 +1,13 @@
 """Project-level conftest for the pytest plugin demo.
 
-A single ``pytest_plugins`` declaration is enough to load the plugin — its
+A single ``pytest_plugins`` declaration is all that's needed — the plugin's
 fixtures, hooks, and CLI options register through standard pytest machinery
-from there. ``load_dotenv()`` is optional; it just lets the default
-``sift_client`` fixture pick up ``SIFT_API_KEY`` / ``SIFT_GRPC_URI`` /
-``SIFT_REST_URI`` from a local ``.env`` when running against a real Sift org.
-These can also be set as environment variables using your preferred method.
-"""
-
-from dotenv import load_dotenv
+from there.
 
-load_dotenv()
+The default ``sift_client`` fixture reads ``SIFT_API_KEY`` / ``SIFT_GRPC_URI``
+/ ``SIFT_REST_URI`` from the environment. Set them however you prefer: your CI
+secret store, your shell, or a local ``.env`` loaded by ``pytest-dotenv``
+(``pip install pytest-dotenv`` and it auto-loads ``.env`` — no code here).
+"""
 
 pytest_plugins = ["sift_client.pytest_plugin"]
diff --git a/python/examples/pytest_plugin/pyproject.toml b/python/examples/pytest_plugin/pyproject.toml
new file mode 100644
index 000000000..71280d16a
--- /dev/null
+++ b/python/examples/pytest_plugin/pyproject.toml
@@ -0,0 +1,33 @@
+# Single config file for the demo. Pytest behavior lives under
+# [tool.pytest.ini_options]; Sift report content lives under
+# [tool.sift.pytest.report].
+
+[tool.pytest.ini_options]
+# Defaults give you the full step tree: every package, module, class, and
+# parametrize axis becomes a parent step. These are the available knobs and
+# their defaults — uncomment one and set it to false to opt out of that layer.
+#
+# sift_autouse = true              # autouse fixtures (default: true)
+# sift_package_step = true         # Python package (dir with __init__.py) parent step (default: true)
+# sift_module_step = true          # module (test file) parent step (default: true)
+# sift_class_step = true           # class parent step incl. nested (default: true)
+# sift_parametrize_nesting = true  # parametrize parent steps (default: true)
+# sift_git_metadata = true         # git repo/branch/commit included on the report (default: true)
+
+[tool.sift.pytest.report]
+# Display name for the report. Placeholders: {target} {command} {args}
+# {rootdir} {timestamp} {count} {git_repo} {git_branch} {git_commit}.
+# Omit to use the default "{target} {timestamp}". {target} reflects what ran,
+# from the collected tests, anchored to the project name: e.g.
+# project/tests/test_x.py::test_y (single test, [param] stripped),
+# project/tests/motor (several files' common dir), or project (whole suite).
+name = "pytest-plugin demo ({count} tests) {timestamp}"
+# Grouping key across runs (same placeholders available). Omit to default to
+# {target} (what ran).
+test_case = "pytest-plugin-demo {git_branch}"
+
+[tool.sift.pytest.report.metadata]
+# Free-form key/value metadata stamped on every report. Values keep their TOML
+# type (string, int, float, bool).
+ci_revision = 2
+test_source = 'pytest-plugin-demo'
diff --git a/python/examples/pytest_plugin/pytest.ini b/python/examples/pytest_plugin/pytest.ini
deleted file mode 100644
index 90a1a824b..000000000
--- a/python/examples/pytest_plugin/pytest.ini
+++ /dev/null
@@ -1,11 +0,0 @@
-[pytest]
-# Defaults give you the full step tree: every package, module, class, and
-# parametrize axis becomes a parent step. These are the available ini options
-# and their defaults.
-#
-# sift_autouse = true              # autouse fixtures (default: true)
-# sift_package_step = true         # Python package (dir with __init__.py) parent step (default: true)
-# sift_module_step = true          # module (test file) parent step (default: true)
-# sift_class_step = true           # class parent step incl. nested (default: true)
-# sift_parametrize_nesting = true  # parametrize parent steps (default: true)
-# sift_git_metadata = true         # git repo/branch/commit included on the report (default: true)
diff --git a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
index ee3eef513..7cbe8f8ce 100644
--- a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
+++ b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
@@ -121,11 +121,17 @@ def test_report_level_metadata(step, report_context) -> None:
     The same ``update({...})`` pattern works for any field on
     ``TestReportUpdate`` (``run_id``, ``serial_number``, ``part_number``,
     ``system_operator``, ``metadata``, ...). Useful for linking a session
-    to a Sift Run or tagging the report with build / operator info.
+    to a Sift Run or tagging the report with build / operator info at runtime.
+
+    Updating ``metadata`` *replaces* the whole map server-side, so spread the
+    report's current metadata first to add keys without dropping the entries
+    configured under ``[tool.sift.pytest.report.metadata]`` (or the git
+    metadata and auto-recorded ``pytest_command``).
     """
     report_context.report.update(
         {
             "metadata": {
+                **report_context.report.metadata,
                 "build_id": "v1.2.3",
                 "operator": "ci",
             }
diff --git a/python/lib/sift_client/_internal/pyproject_config.py b/python/lib/sift_client/_internal/pyproject_config.py
new file mode 100644
index 000000000..6a8bd177b
--- /dev/null
+++ b/python/lib/sift_client/_internal/pyproject_config.py
@@ -0,0 +1,84 @@
+"""Loader for the ``[tool.sift]`` table in a project's ``pyproject.toml``.
+
+The pytest plugin consumes this loader to resolve report-content config (under
+``[tool.sift.pytest.report]``) and SDK-level fallbacks (URIs under
+``[tool.sift]``). A malformed or missing ``pyproject.toml`` yields ``{}`` so a
+bad config file never aborts the session — the plugin falls back to its
+built-in defaults, and a malformed file additionally surfaces a single warning.
+"""
+
+from __future__ import annotations
+
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+# ``tomllib`` landed in 3.11; ``tomli`` is the same parser packaged for older
+# interpreters and is declared as a conditional install dep on 3.8-3.10.
+try:
+    import tomllib  # type: ignore[import-not-found,unused-ignore]
+except ImportError:  # pragma: no cover - exercised on 3.8-3.10 only
+    import tomli as tomllib  # type: ignore[no-redef,import-not-found,unused-ignore]
+
+if TYPE_CHECKING:
+    import pytest
+
+
+# Bound the upward walk so a misconfigured environment can't trigger an
+# unbounded filesystem traversal looking for a project root that isn't there.
+_MAX_PARENT_WALK = 3
+
+
+def _find_pyproject(config: pytest.Config) -> Path | None:
+    """Return the ``pyproject.toml`` that governs this pytest session, if any.
+
+    Candidates are tried in order and the first existing file wins:
+
+    1. ``config.inipath``, when pytest's own ini file is a ``pyproject.toml``
+       (the project configures pytest via ``[tool.pytest.ini_options]``).
+    2. ``pyproject.toml`` directly inside the resolved ``config.rootpath``.
+    3. ``pyproject.toml`` in up to ``_MAX_PARENT_WALK`` ancestors of the
+       rootdir, nearest first, for monorepos whose rootdir sits below the
+       project root.
+
+    Returns ``None`` when no candidate exists.
+    """
+    ini_file = config.inipath
+    if ini_file is not None and ini_file.name == "pyproject.toml" and ini_file.is_file():
+        return ini_file
+    root = Path(config.rootpath).resolve()
+    # ``parents`` ends at the filesystem root; the slice bounds the upward walk.
+    search_dirs = [root, *list(root.parents)[:_MAX_PARENT_WALK]]
+    for directory in search_dirs:
+        pyproject = directory / "pyproject.toml"
+        if pyproject.is_file():
+            return pyproject
+    return None
+
+
+def load_tool_sift(config: pytest.Config) -> dict[str, Any]:
+    """Return the parsed ``[tool.sift]`` table from the project's pyproject.toml.
+
+    Returns ``{}`` when no pyproject is found, ``[tool.sift]`` is missing or not a
+    table, or reading fails (which emits one :class:`SiftPytestPluginWarning`).
+    """
+    pyproject = _find_pyproject(config)
+    if pyproject is None:
+        return {}
+    try:
+        with pyproject.open("rb") as fh:
+            data = tomllib.load(fh)
+    except (OSError, ValueError) as exc:
+        # ``ValueError`` covers ``TOMLDecodeError`` and the ``UnicodeDecodeError``
+        # of a non-UTF-8 file. Import deferred: ``pytest_plugin`` imports this module.
+        from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+        warnings.warn(
+            f"Failed to read {pyproject} for [tool.sift]: {type(exc).__name__}: {exc}",
+            SiftPytestPluginWarning,
+            stacklevel=2,
+        )
+        return {}
+    tool = data.get("tool")
+    sift = tool.get("sift") if isinstance(tool, dict) else None
+    return sift if isinstance(sift, dict) else {}
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
index 90a5fcb56..263ac03ac 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
@@ -67,20 +67,6 @@ def test_disabled_does_not_require_credentials(
         result = pytester.runpytest_subprocess("--sift-disabled")
         result.assert_outcomes(passed=1)
 
-    def test_disabled_via_env_var(
-        self,
-        pytester: pytest.Pytester,
-        clear_sift_env: None,
-        write_plugin_conftest: Callable[[], None],
-        monkeypatch: pytest.MonkeyPatch,
-    ) -> None:
-        """``SIFT_DISABLED=1`` triggers disabled mode without the CLI flag."""
-        write_plugin_conftest()
-        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
-        monkeypatch.setenv("SIFT_DISABLED", "1")
-        result = pytester.runpytest_subprocess()
-        result.assert_outcomes(passed=1)
-
     def test_disabled_supersedes_offline(
         self,
         pytester: pytest.Pytester,
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py b/python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py
new file mode 100644
index 000000000..a4c723b47
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py
@@ -0,0 +1,272 @@
+"""Tests for [tool.sift.pytest.report] and the report-content env-var overrides.
+
+Report-content fields are configured under ``[tool.sift.pytest.report]`` in
+pyproject.toml and overridden per-run via ``SIFT_REPORT_*`` env vars. These
+tests drive offline-mode inner sessions and inspect the JSONL
+``CreateTestReport`` line, which serializes every report field with its proto
+type intact.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Callable
+
+from google.protobuf import json_format
+from sift.metadata.v1.metadata_pb2 import MetadataValue
+
+from sift_client.util.metadata import metadata_proto_to_dict
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+def _create_report_dict(log_text: str) -> dict:
+    """Parse the JSON payload from the ``[CreateTestReport:...]`` log line."""
+    for line in log_text.splitlines():
+        if line.startswith("[CreateTestReport:"):
+            return json.loads(line[line.index("{") :])
+    raise AssertionError(f"no CreateTestReport line in log:\n{log_text}")
+
+
+def _metadata_pairs(report: dict) -> dict[str, str | float | bool]:
+    """Unwrap the report's JSON metadata list into a ``{key: value}`` dict.
+
+    Each entry is the JSON form of a ``MetadataValue`` proto, so parse it back
+    into the proto and reuse the canonical ``metadata_proto_to_dict`` converter
+    rather than hand-walking the value slots.
+    """
+    protos = [json_format.ParseDict(entry, MetadataValue()) for entry in report.get("metadata", [])]
+    return metadata_proto_to_dict(protos)
+
+
+class TestReportFields:
+    def test_toml_resolves_every_field(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Every report-content field resolves from ``[tool.sift.pytest.report]``."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            test_case        = "case-from-toml"
+            test_system_name = "rig-7"
+            system_operator  = "ci-bot"
+            serial_number    = "SN-001"
+            part_number      = "PN-9000"
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"] == "case-from-toml"
+        assert report["testSystemName"] == "rig-7"
+        assert report["systemOperator"] == "ci-bot"
+        assert report["serialNumber"] == "SN-001"
+        assert report["partNumber"] == "PN-9000"
+
+    def test_test_case_template_renders(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``test_case`` accepts the same template placeholders as ``name``."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            test_case = "case-{rootdir}-{count}"
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"].startswith("case-"), report["testCase"]
+        assert report["testCase"].endswith("-1"), report["testCase"]
+
+    def test_default_target_single_test_is_function(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """One test collected -> default test_case is the project-anchored function nodeid.
+
+        Derivation is from the collected items, so it doesn't depend on flag
+        order or which path form was typed; the value is anchored to the
+        rootdir (project) name.
+        """
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile(test_demo="def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"] == f"{pytester.path.name}/test_demo.py::test_one", report[
+            "testCase"
+        ]
+
+    def test_default_target_single_test_strips_param(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A parametrized single test drops the ``[param]`` suffix from the key."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile(
+            test_demo=(
+                "import pytest\n@pytest.mark.parametrize('v', [12])\ndef test_p(step, v): pass\n"
+            )
+        )
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"] == f"{pytester.path.name}/test_demo.py::test_p", report[
+            "testCase"
+        ]
+
+    def test_default_target_single_file(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Multiple tests in one file -> the default target is that file (anchored)."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile(test_demo="def test_a(step): pass\ndef test_b(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=2)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"] == f"{pytester.path.name}/test_demo.py", report["testCase"]
+
+    def test_default_target_multiple_files_common_dir(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Tests across several files -> the default target is their common directory (anchored)."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        suite = pytester.mkdir("suite")
+        (suite / "test_a.py").write_text("def test_a(step): pass\n")
+        (suite / "test_b.py").write_text("def test_b(step): pass\n")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=2)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"] == f"{pytester.path.name}/suite", report["testCase"]
+
+    def test_default_target_whole_tree_is_project(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Tests spanning the rootdir -> the default target is the bare project name."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        # Two files directly under rootdir -> common path is rootdir itself.
+        pytester.makepyfile(test_a="def test_a(step): pass", test_b="def test_b(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=2)
+        report = _create_report_dict(log_path.read_text())
+        assert report["testCase"] == pytester.path.name, report["testCase"]
+
+    def test_env_overrides_toml(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        monkeypatch: pytest.MonkeyPatch,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An env var wins over a value set in ``[tool.sift.pytest.report]``."""
+        log_path = tmp_path / "run.jsonl"
+        monkeypatch.setenv("SIFT_REPORT_SYSTEM_OPERATOR", "env-wins")
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            system_operator = "ci-bot"
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        report = _create_report_dict(log_path.read_text())
+        assert report["systemOperator"] == "env-wins"
+
+    def test_metadata_table_typed_values(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``[tool.sift.pytest.report.metadata]`` keeps TOML types end-to-end."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report.metadata]
+            build_id = "v1.2.3"
+            lane     = 2
+            verbose  = true
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        pairs = _metadata_pairs(_create_report_dict(log_path.read_text()))
+        assert pairs.get("build_id") == "v1.2.3"
+        # Ints and floats share the proto's numeric slot.
+        assert pairs.get("lane") == 2
+        assert pairs.get("verbose") is True
+        # Auto-recorded keys still present alongside the typed entries.
+        assert "pytest_command" in pairs
+
+    def test_loader_warns_on_bad_toml(
+        self,
+        tmp_path: Path,
+        recwarn: pytest.WarningsRecorder,
+    ) -> None:
+        """A malformed pyproject.toml emits a warning and the loader returns ``{}``.
+
+        pytest itself aborts the session when its own ``pyproject.toml`` is
+        unparseable, so the loader's graceful warning path only matters when
+        the file is reachable via the loader's own discovery (e.g. an upward
+        walk in a monorepo). Exercise the loader directly here.
+        """
+        from types import SimpleNamespace
+
+        from sift_client._internal.pyproject_config import load_tool_sift
+
+        bad = tmp_path / "pyproject.toml"
+        bad.write_text('[tool.sift]\ngrpc_uri = "unterminated\n')
+        fake_config = SimpleNamespace(inipath=bad, rootpath=tmp_path)
+
+        result = load_tool_sift(fake_config)  # type: ignore[arg-type]
+
+        assert result == {}
+        messages = [str(w.message) for w in recwarn.list]
+        assert any("[tool.sift]" in m and "Failed to read" in m for m in messages), messages
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_report_name.py b/python/lib/sift_client/_tests/pytest_plugin/test_report_name.py
new file mode 100644
index 000000000..5808c5a78
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_report_name.py
@@ -0,0 +1,120 @@
+"""Tests for report display-name templating.
+
+The report ``name`` is rendered from a template set under
+``[tool.sift.pytest.report] name`` and defaults to ``"{target} {timestamp}"``.
+The full pytest invocation is preserved on the report's metadata under
+``pytest_command``. These tests drive offline-mode inner sessions and inspect
+the JSONL ``CreateTestReport`` line for the rendered values.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+def _create_report_line(content: str) -> str:
+    """Return the ``[CreateTestReport:...]`` JSONL line from a log file."""
+    for line in content.splitlines():
+        if line.startswith("[CreateTestReport:"):
+            return line
+    raise AssertionError(f"no CreateTestReport line in log:\n{content}")
+
+
+class TestReportName:
+    def test_toml_template(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``[tool.sift.pytest.report] name`` renders placeholders into the report name."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            name = "TomlReport-{count}"
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        line = _create_report_line(log_path.read_text())
+        assert '"name":"TomlReport-1"' in line, line
+
+    def test_full_command_preserved_in_metadata(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The full pytest invocation is stored on the report metadata."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        line = _create_report_line(log_path.read_text())
+        assert '"pytest_command"' in line, line
+        # The recorded command reflects the actual invocation.
+        assert "--sift-offline" in line, line
+
+    def test_git_placeholders_render_empty_outside_repo(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Git placeholders are recognized and render empty when not in a repo.
+
+        The inner pytester session runs in a temp dir that is not a git
+        checkout, so ``{git_branch}`` resolves to an empty string rather than
+        triggering the unknown-placeholder fallback.
+        """
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            name = "R-{git_branch}-{count}"
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Invalid sift_report_name template" not in combined, combined
+        line = _create_report_line(log_path.read_text())
+        assert '"name":"R--1"' in line, line
+
+    def test_invalid_template_falls_back_and_warns(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An unknown placeholder warns and falls back without aborting the session."""
+        log_path = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            name = "{nope}"
+            """
+        )
+        pytester.makepyfile("def test_one(step): pass")
+        result = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Invalid sift_report_name template" in combined, combined
+        # The report is still created despite the bad template.
+        _create_report_line(log_path.read_text())
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py b/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
new file mode 100644
index 000000000..ba6fbf5a5
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
@@ -0,0 +1,39 @@
+"""Guard rail that pins the docs settings table to the ``_OPTIONS`` registry.
+
+If you add or change a setting in ``lib/sift_client/pytest_plugin.py`` without
+regenerating the Markdown table in ``docs/guides/pytest_plugin/configuration.md``,
+this test fails with the up-to-date block to paste in.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import pytest
+
+
+# python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py -> python/
+_REPO_PYTHON_DIR = Path(__file__).resolve().parents[4]
+_DOCS_PATH = _REPO_PYTHON_DIR / "docs/guides/pytest_plugin/configuration.md"
+
+
+def test_settings_reference_docs_in_sync(pytestconfig: pytest.Config) -> None:
+    """The Markdown table under '## Settings reference' matches the registry verbatim."""
+    if not _DOCS_PATH.exists():
+        import pytest
+
+        pytest.skip(f"{_DOCS_PATH} not present in this checkout")
+    from sift_client.pytest_plugin import _render_settings_reference
+
+    rendered = _render_settings_reference()
+    content = _DOCS_PATH.read_text()
+    if rendered not in content:
+        import pytest
+
+        pytest.fail(
+            "Settings reference is out of sync with the _OPTIONS registry. Replace the "
+            "table under '## Settings reference' in "
+            "docs/guides/pytest_plugin/configuration.md with:\n\n" + rendered
+        )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py b/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
new file mode 100644
index 000000000..ed7a92dc4
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
@@ -0,0 +1,113 @@
+"""Tests for the unknown-setting warnings fired in ``pytest_configure``.
+
+The plugin scans ``SIFT_*`` env vars and ``[tool.sift.pytest.*]`` keys at
+session start and emits a ``SiftPytestPluginWarning`` for anything not
+declared in the central ``_OPTIONS`` registry. A typo (``SIFT_REPORT_SERIALNUM``
+instead of ``SIFT_REPORT_SERIAL_NUMBER``) would otherwise silently no-op.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    import pytest
+
+
+class TestTypoDetector:
+    def test_unknown_env_var_warns(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An unknown ``SIFT_*`` env var emits a warning with a closest-match hint."""
+        monkeypatch.setenv("SIFT_REPORT_SERIALNUM", "SN-1")  # missing underscore
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Unknown SIFT_* env var `SIFT_REPORT_SERIALNUM`" in combined, combined
+        assert "did you mean `SIFT_REPORT_SERIAL_NUMBER`" in combined, combined
+
+    def test_known_env_var_silent(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Declared env vars don't warn."""
+        monkeypatch.setenv("SIFT_REPORT_SERIAL_NUMBER", "SN-1")
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Unknown SIFT_*" not in combined, combined
+
+    def test_unknown_toml_key_warns(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An unknown ``[tool.sift.pytest.report]`` key warns with a suggestion."""
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            serial_numbr = "SN-1"
+            """
+        )
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Unknown sift config key" in combined, combined
+        assert "pytest.report.serial_numbr" in combined, combined
+        assert "did you mean" in combined, combined
+        assert "serial_number" in combined, combined
+
+    def test_unknown_toml_outside_pytest_scope_silent(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``[tool.sift.X]`` outside ``tool.sift.pytest`` is not the plugin's concern.
+
+        Other Sift tools may use ``tool.sift.<other-subtree>`` (the build-time
+        ``[tool.sift.extras]`` in this repo's own pyproject is one example);
+        the detector intentionally only walks ``tool.sift.pytest``.
+        """
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.something_else]
+            anything = "goes"
+            """
+        )
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Unknown sift config key" not in combined, combined
+
+    def test_metadata_subtree_keys_are_user_defined(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Keys under ``[tool.sift.pytest.report.metadata]`` don't trigger warnings."""
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report.metadata]
+            anything_at_all = "value"
+            another_thing   = 42
+            """
+        )
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "Unknown sift config key" not in combined, combined
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index ed2d71fb6..4341bf122 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -11,10 +11,12 @@
 import pytest
 
 from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.pyproject_config import load_tool_sift
 from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
 from sift_client.util.test_results.context_manager import (
+    _git_metadata,
     _quiet_fork_stderr,
     format_assertion_message,
     format_truncated_traceback,
@@ -191,9 +193,9 @@ def _build_hierarchy_chain(
     broadly so a misbehaving collector doesn't abort the whole collection
     phase — that frame's docstring just becomes ``None``.
     """
-    include_package = bool(_option_or_ini(config, _PACKAGE_STEP))
-    include_module = bool(_option_or_ini(config, _MODULE_STEP))
-    include_class = bool(_option_or_ini(config, _CLASS_STEP))
+    include_package = bool(_PACKAGE_STEP.resolve(config))
+    include_module = bool(_MODULE_STEP.resolve(config))
+    include_class = bool(_CLASS_STEP.resolve(config))
 
     chain: list[tuple[str, str, str | None, bool]] = []
     # ``node.parent`` is typed as the internal ``_pytest.nodes.Node`` which
@@ -220,207 +222,423 @@ def _build_hierarchy_chain(
     return tuple(reversed(chain))
 
 
+# Settings-reference categories. Each maps to a docs subsection and, in the
+# renderer, to the column subset that category actually uses.
+_CAT_BEHAVIOR = "Pytest behavior"
+_CAT_CONNECTION = "Connection"
+_CAT_REPORT = "Report content"
+_CATEGORIES = (_CAT_BEHAVIOR, _CAT_CONNECTION, _CAT_REPORT)
+
+_TOOL_SIFT_KEY = pytest.StashKey[dict]()
+
+
+def _tool_sift(config: pytest.Config | None) -> dict[str, Any]:
+    """Session-cached ``[tool.sift]`` table.
+
+    Every option that reads TOML, plus the typo detector, would otherwise
+    re-parse pyproject.toml on the session-start path — and re-emit the
+    malformed-file warning each time. Parse once per session via the config
+    stash; ``load_tool_sift`` stays the uncached parser for direct callers.
+    """
+    if config is None:
+        return {}
+    cached = config.stash.get(_TOOL_SIFT_KEY, None)
+    if cached is None:
+        cached = load_tool_sift(config)
+        config.stash[_TOOL_SIFT_KEY] = cached
+    return cached
+
+
 @dataclass(frozen=True)
 class _Option:
-    """A single Sift plugin setting, registered as a CLI flag and/or an ini key.
+    """One setting and the logic to resolve it from wherever it can be set.
+
+    A setting may be read from an env var, a CLI flag, a pytest ini key, or a
+    ``[tool.sift...]`` TOML path. :meth:`resolve` walks the declared surfaces in
+    env > cli > ini > toml order. ``metadata`` is the one exception: a free-form
+    TOML table (``merge=True``) resolved by :meth:`resolve_merged`.
 
-    ``ini_name`` is used as both the ini key and the CLI ``dest``, so a value
-    set either way lands on the same config slot. ``cli_flag=None`` makes the
-    option ini-only (e.g. the URI fallbacks).
+    One registry of these drives ``pytest_addoption``, the resolvers, the docs
+    settings-reference table, and the typo detector, so a setting is added or
+    changed in one place.
+
+    Surface fields (declare only the ones a setting uses):
+
+    - ``cli`` / ``cli_action``: CLI flag (e.g. ``"--sift-offline"``) and argparse
+      action; ``cli_dest`` is the ini key if set, else derived from the flag.
+    - ``ini`` / ``ini_type`` / ``ini_default``: pytest ini key under
+      ``[tool.pytest.ini_options]`` and its pytest type + default.
+    - ``toml``: tuple path under ``[tool.sift...]``, e.g.
+      ``("pytest", "report", "name")`` -> ``tool.sift.pytest.report.name``.
+    - ``env``: full env var name, e.g. ``"SIFT_API_KEY"``.
+
+    ``category`` groups the option in the docs settings reference (one of
+    ``_CATEGORIES``).
     """
 
-    ini_name: str
-    ini_help: str
-    cli_flag: str | None = None
-    cli_help: str | None = None
-    action: str | None = None
+    name: str
+    help: str
+    category: str
+    cli: str | None = None
+    cli_action: str | None = None
+    ini: str | None = None
     ini_type: str | None = None
     ini_default: Any = None
+    toml: tuple[str, ...] | None = None
+    env: str | None = None
+    merge: bool = False
+
+    @property
+    def cli_dest(self) -> str:
+        """Argparse ``dest`` for the option.
+
+        When the option has both a CLI flag and an ini key, the dest matches
+        the ini name so ``config.getoption(ini_name)`` returns the CLI value
+        (and falls through to ``config.getini(ini_name)`` when the flag wasn't
+        passed). Without an ini key, the dest derives from the flag name.
+        """
+        if self.ini:
+            return self.ini
+        if self.cli is None:
+            return self.name
+        return self.cli.lstrip("-").replace("-", "_")
+
+    def __post_init__(self) -> None:
+        if self.cli_action and not self.cli:
+            raise ValueError(f"_Option({self.name!r}): cli_action requires cli")
+        if self.ini_type and not self.ini:
+            raise ValueError(f"_Option({self.name!r}): ini_type requires ini")
+        if self.merge and not self.toml:
+            raise ValueError(f"_Option({self.name!r}): merge=True needs toml")
+        if not any([self.cli, self.ini, self.toml, self.env]):
+            raise ValueError(f"_Option({self.name!r}): declares no surfaces")
+        if self.category not in _CATEGORIES:
+            raise ValueError(f"_Option({self.name!r}): category must be one of {_CATEGORIES}")
+
+    def resolve(self, config: pytest.Config | None) -> Any:
+        """First set value from declared surfaces; ``None`` when unset everywhere.
+
+        Walk order is env > cli > ini > toml. No current option declares both
+        env and cli, so the chain isn't ambiguous in practice.
+        ``getini`` returns the typed default for unset bool/list keys, so this
+        only returns ini values for booleans (always meaningful), non-empty
+        strings, and non-empty lists.
+        """
+        if self.env:
+            env_value = os.getenv(self.env)
+            if env_value not in (None, ""):
+                return env_value
+        if config is None:
+            return None
+        if self.cli:
+            cli_value = config.getoption(self.cli_dest, default=None)
+            if cli_value is not None:
+                return cli_value
+        if self.ini:
+            try:
+                ini_value = config.getini(self.ini)
+            except (KeyError, ValueError):
+                ini_value = None
+            if isinstance(ini_value, bool):
+                return ini_value
+            if isinstance(ini_value, str) and ini_value:
+                return ini_value
+            if isinstance(ini_value, list) and ini_value:
+                return ini_value
+        if self.toml:
+            toml_value = _walk_toml(_tool_sift(config), self.toml)
+            if toml_value not in (None, ""):
+                return toml_value
+        return None
 
-
+    def resolve_merged(self, config: pytest.Config | None) -> dict[str, str | float | bool]:
+        """For ``merge=True`` dict-shape settings: the free-form TOML table.
+
+        TOML values that don't fit ``dict[str, str | float | bool]`` (nested
+        tables, lists, dates/times) are dropped with a warning so a malformed
+        entry can't crash report creation.
+        """
+        result: dict[str, str | float | bool] = {}
+        if config is not None and self.toml:
+            base = _walk_toml(_tool_sift(config), self.toml)
+            if isinstance(base, dict):
+                for key, value in base.items():
+                    if not isinstance(key, str):
+                        continue
+                    if isinstance(value, (bool, str, int, float)):
+                        # ``bool`` is explicit for clarity; ``isinstance(True, int)`` is True.
+                        result[key] = value  # type: ignore[assignment]
+                        continue
+                    warnings.warn(
+                        f"[tool.sift.{'.'.join(self.toml)}] entry {key!r} ignored: "
+                        f"unsupported type {type(value).__name__}.",
+                        SiftPytestPluginWarning,
+                        stacklevel=2,
+                    )
+        return result
+
+
+def _walk_toml(data: dict[str, Any], path: tuple[str, ...]) -> Any:
+    """Walk nested TOML tables along ``path``; None on a missing key or a non-table node."""
+    cur: Any = data
+    for key in path:
+        if not isinstance(cur, dict):
+            return None
+        cur = cur.get(key)
+        if cur is None:
+            return None
+    return cur
+
+
+# ---------------------------------------------------------------------------
+# Settings registry.
+#
+# Add new options here. The registry drives `pytest_addoption`, resolution,
+# the docs settings-reference table, and the unknown-key typo detector, so a
+# setting is declared once instead of wired up in several places.
+#
+# Where each setting lives follows a few principles:
+#   - Secrets (the API key) come from environment variables only, never a
+#     committed file.
+#   - Pytest behavior lives in [tool.pytest.ini_options] so it integrates with
+#     `pytest --help` / `--co` / `--trace-config`.
+#   - Sift report content lives in [tool.sift.pytest.report.*].
+#   - Non-secret endpoints take an env var plus one static home (ini or toml,
+#     not both).
+#   - A CLI flag is added only when there is a real per-run override workflow;
+#     stable project config stays in ini/toml.
+#   - Dynamic per-run values are injected via environment variables (pytest-dotenv
+#     loads .env for local dev; CI sets the same names from its secret store).
+# ---------------------------------------------------------------------------
+
+# Pytest behavior. The CLI flag survives because the per-run override is real.
 _LOG_FILE = _Option(
-    cli_flag="--sift-log-file",
-    ini_name="sift_log_file",
-    cli_help="Path to write the Sift test result log file. "
-    "Use 'true' (default) to auto-create a temp file, "
-    "False, 'false', or 'none' to disable logging, "
-    "or a file path to write to a specific location.",
-    ini_help="Default value for --sift-log-file. Same values accepted as "
-    "the CLI flag (path, 'true', 'false', 'none').",
+    name="log_file",
+    category=_CAT_BEHAVIOR,
+    help="Path to the JSONL log of create/update calls (path | true | false | none).",
+    cli="--sift-log-file",
+    ini="sift_log_file",
 )
-
 _GIT_METADATA = _Option(
-    cli_flag="--no-sift-git-metadata",
-    ini_name="sift_git_metadata",
-    action="store_false",
-    cli_help="Exclude git metadata from the Sift test results. "
-    "Git metadata (repo, branch, commit) is included by default.",
-    ini_help="Include git repo/branch/commit in the report (true/false). "
-    "Defaults to true. The --no-sift-git-metadata CLI flag overrides "
-    "this when passed.",
+    name="git_metadata",
+    category=_CAT_BEHAVIOR,
+    help="Capture git repo/branch/commit on the report.",
+    cli="--no-sift-git-metadata",
+    cli_action="store_false",
+    ini="sift_git_metadata",
     ini_type="bool",
     ini_default=True,
 )
-
 _OFFLINE = _Option(
-    cli_flag="--sift-offline",
-    ini_name="sift_offline",
-    action="store_true",
-    cli_help="Run without contacting Sift. All create/update calls are written "
-    "to a JSONL log file for later replay via `import-test-result-log`. "
-    "No session-start ping is attempted.",
-    ini_help="When true, run in offline mode (same effect as --sift-offline). Defaults to false.",
+    name="offline",
+    category=_CAT_BEHAVIOR,
+    help="Skip the session-start ping; route create/update through the JSONL log.",
+    cli="--sift-offline",
+    cli_action="store_true",
+    ini="sift_offline",
     ini_type="bool",
     ini_default=False,
 )
-
 _DISABLED = _Option(
-    cli_flag="--sift-disabled",
-    ini_name="sift_disabled",
-    action="store_true",
-    cli_help="Disable Sift integration entirely. Nothing contacts the API "
-    "and no log file is written. `step.measure(...)` still returns real "
-    "pass/fail booleans. Returned entities expose `is_simulated == True`. "
-    "Also honored via the `SIFT_DISABLED` env var. Supersedes every other "
-    "flag.",
-    ini_help="When true, run in disabled mode (same effect as --sift-disabled). "
-    "Also honored via the SIFT_DISABLED env var. Supersedes every other "
-    "setting. Defaults to false.",
+    name="disabled",
+    category=_CAT_BEHAVIOR,
+    help="Disable Sift entirely (no API calls, no log file). Supersedes --sift-offline.",
+    cli="--sift-disabled",
+    cli_action="store_true",
+    ini="sift_disabled",
     ini_type="bool",
     ini_default=False,
 )
 
-_GRPC_URI = _Option(
-    ini_name="sift_grpc_uri",
-    ini_help="Sift gRPC endpoint URI. The default `sift_client` fixture "
-    "prefers the SIFT_GRPC_URI environment variable and falls back to "
-    "this ini value.",
-)
-
-_REST_URI = _Option(
-    ini_name="sift_rest_uri",
-    ini_help="Sift REST endpoint URI. The default `sift_client` fixture "
-    "prefers the SIFT_REST_URI environment variable and falls back to "
-    "this ini value.",
-)
-
-_REPORT_URL_BASE = _Option(
-    cli_flag="--sift-report-url-base",
-    ini_name="sift_report_url_base",
-    cli_help="Sift web-app origin used to build the clickable report link in the "
-    "terminal footer (e.g. https://app.siftstack.com). Set this for on-prem or "
-    "custom deployments whose API host can't be mapped to a frontend "
-    "automatically. Also honored via the SIFT_APP_URL env var. When unset, the "
-    "link is derived from the REST URI for known Sift hosts.",
-    ini_help="Default for --sift-report-url-base. The Sift web-app origin used to "
-    "build the report link in the terminal footer. Also honored via the "
-    "SIFT_APP_URL env var. When unset, the link is derived from the REST URI for "
-    "known Sift hosts.",
-)
-
 _OPEN = _Option(
-    cli_flag="--sift-open-report",
-    ini_name="sift_open_report",
-    action="store_true",
-    cli_help="Open the resulting Sift test report in a browser at session end. "
-    "Online mode only; no-op when the report URL can't be resolved. Intended for "
-    "local development.",
-    ini_help="When true, open the report in a browser at session end (online only). "
-    "Defaults to false.",
+    name="open_report",
+    category=_CAT_BEHAVIOR,
+    help="Open the resulting report in a browser at session end (online only; "
+    "no-op when the report URL can't be resolved).",
+    cli="--sift-open-report",
+    cli_action="store_true",
+    ini="sift_open_report",
     ini_type="bool",
     ini_default=False,
 )
 
+# Pytest behavior: set-once project defaults (no CLI flag — no per-run override).
 _AUTOUSE = _Option(
-    ini_name="sift_autouse",
-    ini_help="Default for the Sift autouse fixtures (report_context, step, "
-    "_hierarchy_parents, _parametrize_parents). When true (default), tests "
-    "are included unless marked with @pytest.mark.sift_exclude. When false, "
-    "tests are skipped unless marked with @pytest.mark.sift_include. "
-    "Bulk-apply markers in a directory's conftest via "
-    "`pytest_collection_modifyitems`.",
+    name="autouse",
+    category=_CAT_BEHAVIOR,
+    help="Default for the Sift autouse fixtures (report_context, step, hierarchy/parametrize parents).",
+    ini="sift_autouse",
     ini_type="bool",
     ini_default=True,
 )
-
 _PACKAGE_STEP = _Option(
-    ini_name="sift_package_step",
-    ini_help="When true (default), open a parent step for each Python package "
-    "(directory with an ``__init__.py``) in the test path. Set to false to "
-    "flatten package grouping.",
+    name="package_step",
+    category=_CAT_BEHAVIOR,
+    help="Open a parent step for each Python package in the test path.",
+    ini="sift_package_step",
     ini_type="bool",
     ini_default=True,
 )
-
 _MODULE_STEP = _Option(
-    ini_name="sift_module_step",
-    ini_help="When true (default), open a per-module parent step. Set to false "
-    "to skip module-level grouping in the report tree.",
+    name="module_step",
+    category=_CAT_BEHAVIOR,
+    help="Open a parent step for each test module.",
+    ini="sift_module_step",
     ini_type="bool",
     ini_default=True,
 )
-
 _CLASS_STEP = _Option(
-    ini_name="sift_class_step",
-    ini_help="When true (default), open per-class parent steps (including nested "
-    "classes). Set to false to keep class methods at module level.",
+    name="class_step",
+    category=_CAT_BEHAVIOR,
+    help="Open per-class parent steps, including nested classes.",
+    ini="sift_class_step",
     ini_type="bool",
     ini_default=True,
 )
-
 _PARAMETRIZE_NESTING = _Option(
-    ini_name="sift_parametrize_nesting",
-    ini_help="When true (default), parametrized tests nest under shared parent "
-    "steps (e.g. test_a -> v=1, v=2). Set to false to keep the flat per-test "
-    "leaf naming (test_a[1], test_a[2]).",
+    name="parametrize_nesting",
+    category=_CAT_BEHAVIOR,
+    help="Cluster parametrized tests under shared parent steps (e.g. test_a -> v=1, v=2).",
+    ini="sift_parametrize_nesting",
     ini_type="bool",
     ini_default=True,
 )
 
+# Credentials. The API key is env-only; the URIs accept env + ini.
+_API_KEY = _Option(
+    name="api_key",
+    category=_CAT_CONNECTION,
+    help="Sift API key (secret, env-only).",
+    env="SIFT_API_KEY",
+)
+_GRPC_URI = _Option(
+    name="grpc_uri",
+    category=_CAT_CONNECTION,
+    help="Sift gRPC endpoint URI.",
+    env="SIFT_GRPC_URI",
+    ini="sift_grpc_uri",
+)
+_REST_URI = _Option(
+    name="rest_uri",
+    category=_CAT_CONNECTION,
+    help="Sift REST endpoint URI.",
+    env="SIFT_REST_URI",
+    ini="sift_rest_uri",
+)
+_APP_URL = _Option(
+    name="app_url",
+    category=_CAT_CONNECTION,
+    help="Sift web-app origin for the report link in the terminal footer (e.g. "
+    "https://app.siftstack.com). When unset, the link is derived from the REST URI "
+    "for known Sift hosts.",
+    env="SIFT_APP_URL",
+    ini="sift_app_url",
+)
+
+# Report content. Project defaults in [tool.sift.pytest.report]; CI injects
+# per-run values via SIFT_REPORT_* env vars (pytest-dotenv handles .env files
+# for local dev).
+_REPORT_NAME = _Option(
+    name="report_name",
+    category=_CAT_REPORT,
+    help="Template for the report display name. Placeholders: {target}, {command}, {args}, "
+    "{rootdir}, {timestamp}, {count}, {git_repo}, {git_branch}, {git_commit}.",
+    toml=("pytest", "report", "name"),
+)
+_TEST_CASE = _Option(
+    name="test_case",
+    category=_CAT_REPORT,
+    help="Template for the report's test_case field (same placeholders as report_name).",
+    toml=("pytest", "report", "test_case"),
+)
+_TEST_SYSTEM_NAME = _Option(
+    name="test_system_name",
+    category=_CAT_REPORT,
+    help="Name of the test system / rig. Defaults to the host's name.",
+    env="SIFT_REPORT_TEST_SYSTEM_NAME",
+    toml=("pytest", "report", "test_system_name"),
+)
+_SYSTEM_OPERATOR = _Option(
+    name="system_operator",
+    category=_CAT_REPORT,
+    help="Operator running the test. Defaults to the OS user.",
+    env="SIFT_REPORT_SYSTEM_OPERATOR",
+    toml=("pytest", "report", "system_operator"),
+)
+_SERIAL_NUMBER = _Option(
+    name="serial_number",
+    category=_CAT_REPORT,
+    help="Serial number of the unit under test.",
+    env="SIFT_REPORT_SERIAL_NUMBER",
+    toml=("pytest", "report", "serial_number"),
+)
+_PART_NUMBER = _Option(
+    name="part_number",
+    category=_CAT_REPORT,
+    help="Part number of the unit under test.",
+    env="SIFT_REPORT_PART_NUMBER",
+    toml=("pytest", "report", "part_number"),
+)
+_METADATA = _Option(
+    name="metadata",
+    category=_CAT_REPORT,
+    help="Free-form report metadata, as a TOML table of scalar values. For "
+    "dynamic per-run keys, attach them in conftest via the report_context fixture.",
+    toml=("pytest", "report", "metadata"),
+    merge=True,
+)
+
 _OPTIONS: tuple[_Option, ...] = (
     _LOG_FILE,
     _GIT_METADATA,
     _OFFLINE,
     _DISABLED,
-    _GRPC_URI,
-    _REST_URI,
-    _REPORT_URL_BASE,
     _OPEN,
     _AUTOUSE,
     _PACKAGE_STEP,
     _MODULE_STEP,
     _CLASS_STEP,
     _PARAMETRIZE_NESTING,
+    _API_KEY,
+    _GRPC_URI,
+    _REST_URI,
+    _APP_URL,
+    _REPORT_NAME,
+    _TEST_CASE,
+    _TEST_SYSTEM_NAME,
+    _SYSTEM_OPERATOR,
+    _SERIAL_NUMBER,
+    _PART_NUMBER,
+    _METADATA,
 )
 
 
 def pytest_addoption(parser: pytest.Parser) -> None:
-    """Register Sift-specific command-line options and ini keys.
+    """Register every CLI flag and pytest ini key declared in ``_OPTIONS``.
 
-    Each option can be set on the command line or under ``[tool.pytest.ini_options]``
-    in ``pyproject.toml`` (or ``[pytest]`` in ``pytest.ini``). CLI values take
-    precedence over ini values, which take precedence over the built-in default.
+    One loop drives both surfaces — adding a setting is one entry in the
+    registry, not three edits across this function and a docs table.
     """
     group = parser.getgroup("sift", description="Sift test results")
     for opt in _OPTIONS:
-        if opt.cli_flag is not None:
+        if opt.cli is not None:
             cli_kwargs: dict[str, Any] = {
-                "dest": opt.ini_name,
+                "dest": opt.cli_dest,
                 "default": None,
-                "help": opt.cli_help,
+                "help": opt.help,
             }
-            if opt.action is not None:
-                cli_kwargs["action"] = opt.action
-            group.addoption(opt.cli_flag, **cli_kwargs)
-
-        ini_kwargs: dict[str, Any] = {"help": opt.ini_help, "default": opt.ini_default}
-        if opt.ini_type is not None:
-            ini_kwargs["type"] = opt.ini_type
-        parser.addini(opt.ini_name, **ini_kwargs)
+            if opt.cli_action is not None:
+                cli_kwargs["action"] = opt.cli_action
+            group.addoption(opt.cli, **cli_kwargs)
+        if opt.ini is not None:
+            ini_kwargs: dict[str, Any] = {"help": opt.help, "default": opt.ini_default}
+            if opt.ini_type is not None:
+                ini_kwargs["type"] = opt.ini_type
+            parser.addini(opt.ini, **ini_kwargs)
 
 
 def pytest_configure(config: pytest.Config) -> None:
-    """Register the Sift gate markers so they show up in `pytest --markers`."""
+    """Register the Sift gate markers and warn on unknown ``SIFT_*`` settings."""
     config.addinivalue_line(
         "markers",
         "sift_include: force the Sift autouse fixtures to activate for this test "
@@ -431,6 +649,158 @@ def pytest_configure(config: pytest.Config) -> None:
         "sift_exclude: force the Sift autouse fixtures to skip this test "
         "regardless of the `sift_autouse` ini default.",
     )
+    # Surface typos in env vars and [tool.sift...] keys at session start so a
+    # silent no-op (env var that doesn't match anything, table key the loader
+    # ignores) becomes visible. The registry is the source of truth for what's
+    # known.
+    _warn_on_unknown_env_vars()
+    _warn_on_unknown_toml_keys(config)
+
+
+def _render_settings_reference() -> str:
+    """Render the Markdown settings reference from ``_OPTIONS``.
+
+    One ``### <category>`` subsection per non-empty category, each table showing
+    only the columns that category uses (so no dead all-``—`` columns). The plugin
+    docs at ``docs/guides/pytest_plugin/configuration.md`` embed this output verbatim
+    so the registry and the docs can't drift;
+    ``test_settings_reference_docs_in_sync`` is the guard rail. Regenerate with::
+
+        uv run python -c "from sift_client.pytest_plugin import _render_settings_reference; print(_render_settings_reference())"
+    """
+
+    def _cli_cell(opt: _Option) -> str:
+        return f"`{opt.cli}`" if opt.cli else "—"
+
+    def _ini_cell(opt: _Option) -> str:
+        return f"`{opt.ini}`" if opt.ini else "—"
+
+    def _toml_cell(opt: _Option) -> str:
+        if not opt.toml:
+            return "—"
+        if opt.merge:
+            return f"`[tool.sift.{'.'.join(opt.toml)}]` (table)"
+        section = ".".join(opt.toml[:-1])
+        return f"`[tool.sift.{section}] {opt.toml[-1]}`"
+
+    def _env_cell(opt: _Option) -> str:
+        if opt.env:
+            return f"`{opt.env}`"
+        return "—"
+
+    # Per-category column layout: only the surfaces that category actually uses.
+    # Each column is (header, cell-renderer); "Setting" (``opt.help``) is prepended below.
+    columns_by_category = {
+        _CAT_BEHAVIOR: [
+            ("CLI flag", _cli_cell),
+            ("Ini (`[tool.pytest.ini_options]`)", _ini_cell),
+        ],
+        _CAT_CONNECTION: [
+            ("Ini (`[tool.pytest.ini_options]`)", _ini_cell),
+            ("Env var", _env_cell),
+        ],
+        _CAT_REPORT: [
+            ("TOML (`[tool.sift...]`)", _toml_cell),
+            ("Env var", _env_cell),
+        ],
+    }
+
+    def _escape(cell: str) -> str:
+        # Literal pipes inside a Markdown table cell need backslash escaping or
+        # they'd be parsed as column separators.
+        return cell.replace("|", "\\|")
+
+    blocks: list[str] = []
+    for category in _CATEGORIES:
+        opts = [o for o in _OPTIONS if o.category == category]
+        if not opts:
+            continue
+        columns = columns_by_category[category]
+        headers = ["Setting", *(h for h, _ in columns)]
+        lines = [
+            f"### {category}",
+            "",
+            "| " + " | ".join(headers) + " |",
+            "|" + "|".join(["---"] * len(headers)) + "|",
+        ]
+        for opt in opts:
+            cells = [opt.help, *(render(opt) for _, render in columns)]
+            lines.append("| " + " | ".join(_escape(c) for c in cells) + " |")
+        blocks.append("\n".join(lines))
+    return "\n\n".join(blocks)
+
+
+def _warn_on_unknown_env_vars() -> None:
+    """Emit a warning for any ``SIFT_*`` env var not declared in the registry.
+
+    The registry declares each env var by its full name (``opt.env``); a miss is
+    almost always a typo, so the closest registered name (if any) is suggested.
+    """
+    import difflib
+
+    known_full = {opt.env for opt in _OPTIONS if opt.env}
+    suggestion_pool = sorted(known_full)
+    for name in sorted(os.environ):
+        if not name.startswith("SIFT_"):
+            continue
+        if name in known_full:
+            continue
+        close = difflib.get_close_matches(name, suggestion_pool, n=1, cutoff=0.6)
+        hint = f" (did you mean `{close[0]}`?)" if close else ""
+        warnings.warn(
+            f"Unknown SIFT_* env var `{name}`{hint}; ignored.",
+            SiftPytestPluginWarning,
+            stacklevel=2,
+        )
+
+
+def _warn_on_unknown_toml_keys(config: pytest.Config) -> None:
+    """Walk ``[tool.sift.pytest.*]`` in pyproject.toml and warn on keys outside the registry.
+
+    Only the ``tool.sift.pytest`` subtree is checked. Other ``tool.sift.*``
+    subtrees are reserved for non-pytest Sift tooling (e.g. ``tool.sift.extras``
+    is consumed by this repo's extras-generation script) and aren't our
+    concern. Free-form subtrees (``merge=True`` options like ``metadata``)
+    stop the walk — their keys are user-defined and not validated.
+    """
+    import difflib
+
+    data = _tool_sift(config)
+    pytest_table = (data or {}).get("pytest")
+    if not isinstance(pytest_table, dict):
+        return
+    # Registry paths are already rooted at ``("pytest", ...)``. ``prefixes`` collects
+    # every proper prefix of a registered path: the tables the walk may descend into.
+    leaves = {opt.toml for opt in _OPTIONS if opt.toml and not opt.merge}
+    free_form = {opt.toml for opt in _OPTIONS if opt.toml and opt.merge}
+    prefixes: set[tuple[str, ...]] = set()
+    for full in leaves | free_form:
+        for i in range(len(full)):
+            prefixes.add(full[:i])
+
+    def _walk(node: Any, base: tuple[str, ...]) -> None:
+        if base in free_form or not isinstance(node, dict):
+            return
+        for key, value in node.items():
+            path = (*base, str(key))
+            if path in leaves or path in free_form:
+                continue
+            if path in prefixes:
+                _walk(value, path)
+                continue
+            full_name = "tool.sift." + ".".join(path)
+            same_depth = [
+                ".".join(p) for p in (leaves | free_form | prefixes) if len(p) == len(path)
+            ]
+            close = difflib.get_close_matches(".".join(path), same_depth, n=1, cutoff=0.6)
+            hint = f" (did you mean `tool.sift.{close[0]}`?)" if close else ""
+            warnings.warn(
+                f"Unknown sift config key `{full_name}`{hint}; ignored.",
+                SiftPytestPluginWarning,
+                stacklevel=2,
+            )
+
+    _walk(pytest_table, ("pytest",))
 
 
 def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
@@ -472,13 +842,11 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
 
 
 def _is_offline(pytestconfig: pytest.Config | None) -> bool:
-    return bool(_option_or_ini(pytestconfig, _OFFLINE))
+    return bool(_OFFLINE.resolve(pytestconfig))
 
 
 def _is_disabled(pytestconfig: pytest.Config | None) -> bool:
-    if bool(_option_or_ini(pytestconfig, _DISABLED)):
-        return True
-    return os.getenv("SIFT_DISABLED", "").lower() in ("1", "true", "yes")
+    return bool(_DISABLED.resolve(pytestconfig))
 
 
 def _sdk_version() -> str:
@@ -692,7 +1060,7 @@ def pytest_terminal_summary(terminalreporter: Any, exitstatus: int, config: pyte
         config.stash[SIFT_REPORT_ID_STASH_KEY] = report_id
     if report_url is not None:
         config.stash[SIFT_REPORT_URL_STASH_KEY] = report_url
-        if _option_or_ini(config, _OPEN):
+        if _OPEN.resolve(config):
             _maybe_open_report(report_url)
 
     if quiet:
@@ -768,7 +1136,7 @@ def pytest_terminal_summary(terminalreporter: Any, exitstatus: int, config: pyte
         _sift_kv(
             terminalreporter,
             "Report",
-            f"id {report_id}  (set sift_report_url_base for a clickable link)",
+            f"id {report_id}  (set sift_app_url for a clickable link)",
         )
 
     if report_id and getattr(context, "replay_incomplete", False) and log_file is not None:
@@ -793,24 +1161,6 @@ def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bo
     return default
 
 
-def _option_or_ini(pytestconfig: pytest.Config | None, opt: _Option) -> Any:
-    """Resolve a Sift plugin setting from CLI > ini > None.
-
-    The ``addoption`` registrations use ``default=None`` so we can tell whether
-    the CLI was actually used. When the CLI didn't set a value, fall back to
-    the matching ``addini`` key.
-    """
-    if pytestconfig is None:
-        return None
-    cli = pytestconfig.getoption(opt.ini_name, default=None)
-    if cli is not None:
-        return cli
-    try:
-        return pytestconfig.getini(opt.ini_name)
-    except (KeyError, ValueError):
-        return None
-
-
 def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
     """Determine log_file value from CLI flag or ini key.
 
@@ -828,7 +1178,7 @@ def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool |
     Rejects ``--sift-log-file=none`` combined with ``--sift-offline`` since
     offline mode needs the log file as its sole sink.
     """
-    raw = _option_or_ini(pytestconfig, _LOG_FILE)
+    raw = _LOG_FILE.resolve(pytestconfig)
     disabled = raw is False or (isinstance(raw, str) and raw.lower() in ("false", "none"))
     if disabled and _is_offline(pytestconfig):
         raise pytest.UsageError(
@@ -1007,19 +1357,154 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
         _finalize_after_teardown(item, report)
 
 
+def _relativize(path: Path, rootpath: Path) -> str:
+    """Path relative to rootdir ("" for rootdir itself), or the basename when outside the tree."""
+    try:
+        rel = str(path.relative_to(rootpath))
+    except ValueError:
+        return path.name
+    return "" if rel == "." else rel
+
+
+def _strip_param(nodeid: str) -> str:
+    """Drop the trailing ``[param]`` from a nodeid, keeping ``file::Class::func``.
+
+    The parametrize id is a variation of the test, not its identity — leaving it
+    in would make a re-parametrization silently shift the grouping key. Cuts at the
+    first ``[`` after the file part: class/function names never contain ``[``, while
+    a param id may contain ``[`` or ``::`` (e.g. ``test_a[::1]``), so neither confuses it.
+    """
+    path, sep, rest = nodeid.partition("::")
+    # Without ``::`` there is no file part to protect; cut the whole id at ``[``.
+    return f"{path}{sep}{rest.split('[', 1)[0]}" if sep else path.split("[", 1)[0]
+
+
+def _derive_target(request: pytest.FixtureRequest, args: tuple[str, ...]) -> str:
+    """Describe what was run, from the collected items rather than the command line.
+
+    Collection is the ground truth of selection — independent of flag order,
+    ``-k`` / ``-m`` filters, or which path form was typed. Every value is
+    anchored to the rootdir (project) name so the shape is uniform; granularity
+    narrows with the selection:
+
+    * a single test -> ``project/tests/test_motor.py::test_spin`` (param stripped)
+    * a single file -> ``project/tests/test_motor.py``
+    * many files    -> their common directory, ``project/tests/motor``
+    * whole tree / nothing collected / no common path -> ``project``
+
+    A file or directory outside rootdir is anchored by its basename alone
+    (``project/<basename>``). The report is session-level with tests as its steps,
+    so file/directory grain is the natural unit. ``args`` is currently unused; the
+    verbatim invocation stays available via ``{command}`` / ``pytest_command``.
+    """
+    rootpath = request.config.rootpath
+    root = rootpath.name
+
+    def _anchor(rel: str) -> str:
+        return f"{root}/{rel}" if rel else root
+
+    items = list(getattr(request.session, "items", ()) or ())
+    if not items:
+        return root
+    if len(items) == 1:
+        return _anchor(_strip_param(items[0].nodeid))
+    paths = {p for p in (getattr(i, "path", None) for i in items) if p is not None}
+    if not paths:
+        return root
+    if len(paths) == 1:
+        return _anchor(_relativize(next(iter(paths)), rootpath))
+    try:
+        common = Path(os.path.commonpath([str(p) for p in paths]))
+    except ValueError:
+        # e.g. paths on different drives (Windows); fall back to the project.
+        return root
+    return _anchor(_relativize(common, rootpath))
+
+
+def _build_template_fields(
+    target: str,
+    command: str,
+    args: tuple[str, ...],
+    request: pytest.FixtureRequest,
+) -> dict[str, Any]:
+    """Build the placeholders shared by the name/test_case templates (git fields default to "")."""
+    items = getattr(request.session, "items", ()) or ()
+    git = _git_metadata() or {}
+    return {
+        "target": target,
+        "command": command,
+        "args": " ".join(args),
+        "rootdir": request.config.rootpath.name,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "count": len(items),
+        "git_repo": git.get("git_repo", ""),
+        "git_branch": git.get("git_branch", ""),
+        "git_commit": git.get("git_commit", ""),
+    }
+
+
+def _format_template(
+    template: str,
+    fields: dict[str, Any],
+    *,
+    fallback: str,
+    option_label: str,
+) -> str:
+    """Format ``template`` with ``fields``; on bad input, warn and return ``fallback``.
+
+    A bad template should never block test results from being recorded, so every
+    ``str.format`` failure (unknown or positional placeholder, bad spec, attribute
+    or index access the value doesn't support) becomes a warning + fallback.
+    """
+    try:
+        return template.format(**fields)
+    except (KeyError, IndexError, ValueError, AttributeError, TypeError) as exc:
+        warnings.warn(
+            f"Invalid {option_label} template {template!r} ({exc}); using fallback.",
+            SiftPytestPluginWarning,
+            stacklevel=2,
+        )
+        return fallback
+
+
 def _report_context_impl(
     sift_client: SiftClient,
     request: pytest.FixtureRequest,
     pytestconfig: pytest.Config | None = None,
 ) -> Generator[ReportContext, None, None]:
     args = request.config.invocation_params.args
-    test_path = Path(args[0]) if args else None
-    if test_path is not None and test_path.exists():
-        base_name = test_path.name
-        test_case: Path | str = test_path
-    else:
-        base_name = "pytest " + " ".join(args) if args else "pytest"
-        test_case = base_name
+    # ``target`` is "what ran", derived from the collected items (see
+    # _derive_target) — invocation-independent, unlike parsing the command
+    # line. Both the display name and test_case default to it; the verbatim
+    # command stays available via {command} and the pytest_command metadata.
+    target = _derive_target(request, args)
+    command = "pytest " + " ".join(args) if args else "pytest"
+    fields = _build_template_fields(target, command, args, request)
+    name_template = _REPORT_NAME.resolve(pytestconfig) or "{target} {timestamp}"
+    name = _format_template(
+        name_template,
+        fields,
+        fallback=f"{target} {fields['timestamp']}",
+        option_label="sift_report_name",
+    )
+    test_case_template = _TEST_CASE.resolve(pytestconfig)
+    test_case = (
+        _format_template(
+            test_case_template,
+            fields,
+            fallback=target,
+            option_label="sift_test_case",
+        )
+        if test_case_template
+        else target
+    )
+    # Metadata starts from the [tool.sift.pytest.report.metadata] TOML table, and
+    # the auto-recorded pytest_command layers in last so the user can't
+    # accidentally overwrite it.
+    report_metadata: dict[str, str | float | bool] = {
+        **_METADATA.resolve_merged(pytestconfig),
+        "pytest_command": command,
+    }
     # Mode → ReportContext flags:
     #   online (default): log_file=<temp or user path>, replay_log_file=True
     #   --sift-offline:   log_file=<temp or user path>, replay_log_file=False
@@ -1027,15 +1512,19 @@ def _report_context_impl(
     disabled = sift_client._simulate
     offline = False if disabled else _is_offline(pytestconfig)
     log_file: str | Path | bool | None = False if disabled else _resolve_log_file(pytestconfig)
-    git_metadata = _option_or_ini(pytestconfig, _GIT_METADATA)
-    include_git_metadata = True if git_metadata is None else bool(git_metadata)
+    include_git_metadata = bool(_GIT_METADATA.resolve(pytestconfig))
     with ReportContext(
         sift_client,
-        name=f"{base_name} {datetime.now(timezone.utc).isoformat()}",
-        test_case=str(test_case),
+        name=name,
+        test_case=test_case,
+        test_system_name=_TEST_SYSTEM_NAME.resolve(pytestconfig) or None,
+        system_operator=_SYSTEM_OPERATOR.resolve(pytestconfig) or None,
+        serial_number=_SERIAL_NUMBER.resolve(pytestconfig) or None,
+        part_number=_PART_NUMBER.resolve(pytestconfig) or None,
         log_file=log_file,
         include_git_metadata=include_git_metadata,
         replay_log_file=not (disabled or offline),
+        metadata=report_metadata,
     ) as context:
         global REPORT_CONTEXT
         REPORT_CONTEXT = context
@@ -1054,12 +1543,6 @@ def _report_context_impl(
                 _drain_hierarchy_stack()
 
 
-_CREDENTIAL_KEYS: tuple[tuple[str, _Option | None], ...] = (
-    ("SIFT_API_KEY", None),  # env-only; never read from ini to keep secrets out of source control.
-    ("SIFT_GRPC_URI", _GRPC_URI),
-    ("SIFT_REST_URI", _REST_URI),
-)
-
 # Placeholder credentials used in --sift-offline mode when env/ini values
 # are missing. Offline mode never makes network calls, so the values are
 # only syntactically required by SiftConnectionConfig.
@@ -1088,19 +1571,6 @@ def _build_disabled_client() -> SiftClient:
     return client
 
 
-def _resolve_credential(
-    pytestconfig: pytest.Config | None, env_name: str, opt: _Option | None
-) -> str | None:
-    """Resolve a Sift credential: env var first, then ini key (if registered), else None."""
-    env_value = os.getenv(env_name)
-    if env_value:
-        return env_value
-    if opt is None or pytestconfig is None:
-        return None
-    ini_value = pytestconfig.getini(opt.ini_name)
-    return ini_value if isinstance(ini_value, str) and ini_value else None
-
-
 @pytest.fixture(scope="session")
 def sift_client(pytestconfig: pytest.Config) -> SiftClient:
     """Default ``SiftClient`` resolved from environment variables and ini keys.
@@ -1126,33 +1596,34 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
     """
     if _is_disabled(pytestconfig):
         return _build_disabled_client()
-    resolved = {env: _resolve_credential(pytestconfig, env, opt) for env, opt in _CREDENTIAL_KEYS}
+    resolved = {
+        "SIFT_API_KEY": _API_KEY.resolve(pytestconfig),
+        "SIFT_GRPC_URI": _GRPC_URI.resolve(pytestconfig),
+        "SIFT_REST_URI": _REST_URI.resolve(pytestconfig),
+    }
     missing = [env for env, value in resolved.items() if not value]
     if missing and not _is_offline(pytestconfig):
         raise pytest.UsageError(
             "Sift credentials missing: "
             + ", ".join(missing)
             + ". Set the environment variable(s) — pytest-dotenv loads them "
-            "from a `.env` file automatically — or set the URIs via "
-            "`sift_grpc_uri` / `sift_rest_uri` under `[tool.pytest.ini_options]` "
+            "from a `.env` file automatically — or set the URIs under "
+            "`sift_grpc_uri` / `sift_rest_uri` in `[tool.pytest.ini_options]` "
             "in pyproject.toml, or override the sift_client fixture in your "
             "conftest.py, or pass --sift-offline / --sift-disabled to run "
             "without contacting Sift."
         )
     for env in missing:
         resolved[env] = _OFFLINE_DEFAULTS[env]
-    # Web-app origin for the report link: the sift_report_url_base CLI/ini option
-    # wins, then the SIFT_APP_URL env var, else host-based derivation in
-    # SiftClient.app_url.
-    report_url_base = _option_or_ini(pytestconfig, _REPORT_URL_BASE) or os.getenv("SIFT_APP_URL")
-    # `or ""` is unreachable in practice since the `missing` check above guarantees
-    # non-None values
+    # Web-app origin for the report link: the SIFT_APP_URL env var wins, then the
+    # sift_app_url ini key, else host-based derivation in SiftClient.app_url.
+    app_url = _APP_URL.resolve(pytestconfig)
     return SiftClient(
         connection_config=SiftConnectionConfig(
-            api_key=resolved.get("SIFT_API_KEY") or "",
-            grpc_url=resolved.get("SIFT_GRPC_URI") or "",
-            rest_url=resolved.get("SIFT_REST_URI") or "",
-            app_url=report_url_base or None,
+            api_key=resolved["SIFT_API_KEY"] or "",
+            grpc_url=resolved["SIFT_GRPC_URI"] or "",
+            rest_url=resolved["SIFT_REST_URI"] or "",
+            app_url=app_url or None,
         )
     )
 
@@ -1223,7 +1694,7 @@ def _step_impl(
     # by ``_parametrize_parents``. When parametrize-nesting is disabled, fall
     # back to the bracket-mangled pytest name (e.g. ``test_a[1]``) so the leaf
     # remains uniquely identifiable.
-    if _option_or_ini(request.config, _PARAMETRIZE_NESTING):
+    if _PARAMETRIZE_NESTING.resolve(request.config):
         path = node.stash.get(_PARAMETRIZE_PATH_KEY, ())
         name = path[-1] if path else str(node.name)
     else:
@@ -1261,7 +1732,7 @@ def _hierarchy_parents(
 
     Gated off when the item is excluded (avoids eager ``report_context`` setup).
     """
-    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    default = bool(_AUTOUSE.resolve(pytestconfig))
     if not _sift_enabled_for(request.node, default):
         return None
     # Fall back to computing the chain on-demand for items that bypassed
@@ -1343,10 +1814,10 @@ def _parametrize_parents(
     diff against a subsequent test's chain pops them, or until
     ``pytest_sessionfinish`` drains anything left at session end.
     """
-    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    default = bool(_AUTOUSE.resolve(pytestconfig))
     if not _sift_enabled_for(request.node, default):
         return None
-    if not _option_or_ini(pytestconfig, _PARAMETRIZE_NESTING):
+    if not _PARAMETRIZE_NESTING.resolve(pytestconfig):
         return None
     # Fall back to on-demand computation for dynamically-inserted items;
     # see _hierarchy_parents for the same rationale.
@@ -1401,7 +1872,7 @@ def step(
     ``SiftClient(_simulate=True)`` placeholder, so every write returns a
     synthesized response without contacting Sift.
     """
-    default = bool(_option_or_ini(pytestconfig, _AUTOUSE))
+    default = bool(_AUTOUSE.resolve(pytestconfig))
     if not _sift_enabled_for(request.node, default):
         yield None
         return
diff --git a/python/lib/sift_client/sift_types/_mixins/metadata.py b/python/lib/sift_client/sift_types/_mixins/metadata.py
new file mode 100644
index 000000000..b53fa5dce
--- /dev/null
+++ b/python/lib/sift_client/sift_types/_mixins/metadata.py
@@ -0,0 +1,19 @@
+"""Placeholder for a future ``MetadataMixin`` (not yet implemented).
+
+TODO(metadata-mixin): metadata updates REPLACE the whole map.
+``entity.update({"metadata": {...}})`` builds a field mask over ``metadata``
+(see ``ModelUpdate.to_proto_with_mask`` in ``sift_types/_base.py``) and replaces
+it server-side — callers must spread the current ``.metadata`` first or silently
+drop existing keys (config defaults, git fields, ``pytest_command``).
+
+Planned shape: a ``MetadataMixin`` exposing a read-merge-write helper such as
+``add_metadata(**kv)`` / ``merge_metadata(dict)``, implemented as
+``self.update({"metadata": {**self.metadata, **kv}})``. Mix into every read
+entity that carries a ``metadata`` field — ``Asset``, ``Run``, ``Report``,
+``TestReport``, ``TestStep``, ``TestMeasurement`` — alongside
+``FileAttachmentsMixin`` and ``SimulatedMixin``. It stays a mixin (not a
+``BaseType`` method) because it relies on the ``metadata`` field, which not
+every ``BaseType`` subclass has (e.g. ``CalculatedChannel`` exposes metadata
+only on its Create/Update models, so it is out of scope). Until it exists,
+merge at the call site.
+"""
diff --git a/python/lib/sift_client/sift_types/asset.py b/python/lib/sift_client/sift_types/asset.py
index 78217934f..ea0895929 100644
--- a/python/lib/sift_client/sift_types/asset.py
+++ b/python/lib/sift_client/sift_types/asset.py
@@ -27,6 +27,8 @@ class Asset(BaseType[AssetProto, "Asset"], FileAttachmentsMixin):
     modified_date: datetime
     modified_by_user_id: str
     tags: list[str | Tag]
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     is_archived: bool
 
diff --git a/python/lib/sift_client/sift_types/report.py b/python/lib/sift_client/sift_types/report.py
index 42f349f42..34f64e2f1 100644
--- a/python/lib/sift_client/sift_types/report.py
+++ b/python/lib/sift_client/sift_types/report.py
@@ -108,6 +108,8 @@ class Report(BaseType[ReportProto, "Report"]):
     summaries: list[ReportRuleSummary]
     tags: list[str]
     rerun_from_report_id: str | None = None
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     job_id: str
     archived_date: datetime | None = None
diff --git a/python/lib/sift_client/sift_types/run.py b/python/lib/sift_client/sift_types/run.py
index ec6690896..e91225342 100644
--- a/python/lib/sift_client/sift_types/run.py
+++ b/python/lib/sift_client/sift_types/run.py
@@ -40,6 +40,8 @@ class Run(BaseType[RunProto, "Run"], FileAttachmentsMixin):
     created_by_user_id: str
     modified_by_user_id: str
     organization_id: str
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     tags: list[str]
     asset_ids: list[str]
diff --git a/python/lib/sift_client/sift_types/test_report.py b/python/lib/sift_client/sift_types/test_report.py
index dd786b02d..b8b1f2236 100644
--- a/python/lib/sift_client/sift_types/test_report.py
+++ b/python/lib/sift_client/sift_types/test_report.py
@@ -167,6 +167,8 @@ class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin, Simula
     start_time: datetime
     end_time: datetime
     error_info: ErrorInfo | None = None
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool] | None = None
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
@@ -402,6 +404,8 @@ class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"], Simulat
     passed: bool
     timestamp: datetime
     description: str | None = None
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool] | None = None
     channel_names: list[str] | None = None
 
@@ -645,6 +649,8 @@ class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin,
     test_case: str
     start_time: datetime
     end_time: datetime
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     serial_number: str | None = None
     part_number: str | None = None
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 41066b247..4b2e2ab9d 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -169,9 +169,12 @@ def __init__(
         test_system_name: str | None = None,
         system_operator: str | None = None,
         test_case: str | None = None,
+        serial_number: str | None = None,
+        part_number: str | None = None,
         log_file: str | Path | bool | None = None,
         include_git_metadata: bool = False,
         replay_log_file: bool = True,
+        metadata: dict[str, str | float | bool] | None = None,
     ):
         """Initialize a new report context.
 
@@ -181,10 +184,15 @@ def __init__(
             test_system_name: The name of the test system. Will default to the hostname if not provided.
             system_operator: The operator of the test system. Will default to the current user if not provided.
             test_case: The name of the test case. Will default to the basename of the file containing the test if not provided.
+            serial_number: Optional serial_number stored on the report. Unset when None.
+            part_number: Optional part_number stored on the report. Unset when None.
             log_file: If True, create a temp log file. If a path, use that path.
                 If False/None, no log file is written and create/update calls
                 the API.
             include_git_metadata: If True, include git metadata in the report.
+            metadata: Structured key/value metadata to attach to the report. Merged
+                on top of git metadata when ``include_git_metadata`` is True, so
+                explicit keys win on collision.
             replay_log_file: When True (the default) and ``log_file`` is set,
                 spawn ``import-test-result-log --incremental`` to push log
                 entries to Sift in the background during the session. When
@@ -216,6 +224,10 @@ def __init__(
         test_case = test_case if test_case else os.path.basename(__file__)
         test_system_name = test_system_name if test_system_name else socket.gethostname()
         system_operator = system_operator if system_operator else getpass.getuser()
+        combined_metadata = {
+            **(_git_metadata() or {} if include_git_metadata else {}),
+            **(metadata or {}),
+        }
         create = TestReportCreate(
             name=name,
             test_system_name=test_system_name,
@@ -224,7 +236,9 @@ def __init__(
             end_time=datetime.now(timezone.utc),
             status=TestStatus.IN_PROGRESS,
             system_operator=system_operator,
-            metadata=_git_metadata() if include_git_metadata else None,  # type: ignore
+            serial_number=serial_number,
+            part_number=part_number,
+            metadata=combined_metadata or None,  # type: ignore
         )
         self.report = client.test_results.create(create, log_file=self.log_file)
 
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 2846fedba..fdc16f7c0 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
     "googleapis-common-protos>=1.60",
     "protoc-gen-openapiv2>=0.0.1",
     "filelock~=3.13",
+    'tomli~=2.0; python_version < "3.11"',
 ]
 
 [project.urls]
diff --git a/python/uv.lock b/python/uv.lock
index b8c439b1a..91eaf3c61 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -4348,6 +4348,7 @@ dependencies = [
     { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" },
     { name = "requests", version = "2.34.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "requests-toolbelt" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
     { name = "types-protobuf", version = "5.29.1.20241207", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
     { name = "types-protobuf", version = "6.32.1.20251210", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" },
     { name = "types-protobuf", version = "7.34.1.20260518", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
@@ -4645,6 +4646,7 @@ requires-dist = [
     { name = "sift-stream-bindings", marker = "extra == 'docs-build'", specifier = "==0.3.0" },
     { name = "sift-stream-bindings", marker = "extra == 'sift-stream'", specifier = "==0.3.0" },
     { name = "sift-stream-bindings", marker = "extra == 'sift-stream-bindings'", specifier = "==0.3.0" },
+    { name = "tomli", marker = "python_full_version < '3.11'", specifier = "~=2.0" },
     { name = "tomlkit", marker = "extra == 'dev'", specifier = "~=0.13.3" },
     { name = "tomlkit", marker = "extra == 'dev-all'", specifier = "~=0.13.3" },
     { name = "tomlkit", marker = "extra == 'development'", specifier = "~=0.13.3" },

From 5504ed7b914fa4d571086499898e63d76efed210 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 2 Jun 2026 17:47:13 -0700
Subject: [PATCH 15/19] Python(fix): Add unit tests and fix incremental upload
 bug (#611)

---
 .../low_level_wrappers/_test_results_log.py   |   8 +-
 .../low_level_wrappers/test_results.py        |  14 +-
 .../test_incremental_replay.py                | 143 ++++++++++++++++++
 .../_tests/resources/test_test_results.py     |  69 +++++++++
 .../_tests/util/test_report_context.py        |   6 +-
 .../lib/sift_client/resources/test_results.py |   3 +-
 .../scripts/import_test_result_log.py         |   3 +-
 .../util/test_results/context_manager.py      |   2 +-
 8 files changed, 238 insertions(+), 10 deletions(-)
 create mode 100644 python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py

diff --git a/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py b/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
index 24e0534d7..383f2d5a3 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
@@ -143,9 +143,13 @@ class _ReplayState:
 
 @dataclass
 class ReplayResult:
-    """Result of replaying a log file."""
+    """Result of replaying a log file.
 
-    report: TestReport
+    ``report`` is None on an incremental resume tick that uploaded only steps or
+    measurements; the report itself was created on an earlier tick.
+    """
+
+    report: TestReport | None = None
     steps: list[TestStep] = field(default_factory=list)
     measurements: list[TestMeasurement] = field(default_factory=list)
 
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
index ff0c2b515..184833e50 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
@@ -1072,13 +1072,17 @@ async def _replay_update_report(
         id_map: dict[str, str],
         state: _ReplayState,
     ) -> None:
-        if state.report is None:
-            raise ValueError("UpdateTestReport found before CreateTestReport")
         request = UpdateTestReportRequest()
         json_format.Parse(json_str, request)
         request.test_report.test_report_id = self._map_id(
             id_map, request.test_report.test_report_id
         )
+        # Only simulate replays are guarded: they walk the whole log in order, so a
+        # missing report means a malformed log. Incremental replay may have created the
+        # report on an earlier tick, so state.report can be None; the mapped ID (from
+        # id_map) suffices. NOTE(review): non-simulate batch replay is unguarded too.
+        if simulate and state.report is None:
+            raise ValueError("UpdateTestReport found before CreateTestReport")
         state.report = await self.update_test_report(
             request=request, simulate=simulate, existing=state.report
         )
@@ -1203,6 +1207,7 @@ async def _incremental_import_log_file(self, log_path: Path) -> ReplayResult:
         next tick.
         """
         tracking = LogTracking.load(log_path)
+        resuming = tracking.last_uploaded_line > 0
         id_map = tracking.id_map
         state = _ReplayState()
 
@@ -1221,7 +1226,10 @@ async def _incremental_import_log_file(self, log_path: Path) -> ReplayResult:
             tracking.last_uploaded_line += 1
             tracking.save(log_path)
 
-        if state.report is None:
+        # On a resume tick the CreateTestReport line was consumed on an earlier
+        # tick, so state.report is expected to be None; the report already exists
+        # on the server. Only a genuine first pass over an empty log is an error.
+        if state.report is None and not resuming:
             raise ValueError("No CreateTestReport found in log file")
 
         return ReplayResult(
diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py
new file mode 100644
index 000000000..ab95ddea8
--- /dev/null
+++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py
@@ -0,0 +1,143 @@
+"""Unit tests for incremental log-replay resume, with no live backend.
+
+These pin the resume-tick behavior of
+``TestResultsLowLevelClient.import_log_file(incremental=True)``: the
+CreateTestReport line is uploaded on an earlier tick, so a resuming tick rebuilds
+replay state from scratch and must apply the remaining lines without an
+in-memory report. The real gRPC create/update calls are stubbed, so these run
+offline -- unlike the end-to-end resume test, which needs the integration server.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+from sift_client._internal.low_level_wrappers.test_results import (
+    # Aliased so pytest doesn't try to collect the `Test`-prefixed client as a suite.
+    TestResultsLowLevelClient as ResultsLowLevelClient,
+)
+from sift_client.sift_types.test_report import (
+    TestReport,
+    TestReportCreate,
+    TestReportUpdate,
+    TestStatus,
+    TestStep,
+    TestStepCreate,
+    TestStepType,
+)
+
+T0 = datetime(2026, 1, 1, tzinfo=timezone.utc)
+
+
+def _make_report(id_: str) -> TestReport:  # Minimal FAILED report for the stubbed RPC.
+    return TestReport(
+        id_=id_,
+        status=TestStatus.FAILED,
+        name="n",
+        test_system_name="s",
+        test_case="c",
+        start_time=T0,
+        end_time=T0,
+        metadata={},
+        is_archived=False,
+    )
+
+
+def _make_step(id_: str) -> TestStep:  # Minimal PASSED step for the stubbed RPC.
+    return TestStep(
+        id_=id_,
+        test_report_id="real-report",  # same real ID the tests seed into id_map
+        name="step",
+        step_type=TestStepType.ACTION,
+        step_path="1",
+        status=TestStatus.PASSED,
+        start_time=T0,
+        end_time=T0,
+    )
+
+
+def _report_create() -> TestReportCreate:  # IN_PROGRESS payload that seeds the log.
+    return TestReportCreate(
+        status=TestStatus.IN_PROGRESS,
+        name="n",
+        test_system_name="s",
+        test_case="c",
+        start_time=T0,
+        end_time=T0,
+    )
+
+
+@pytest.mark.asyncio
+async def test_resume_applies_trailing_report_update(tmp_path):
+    """Resume whose remaining chunk is the final UpdateTestReport must apply it.
+
+    Pre-fix this raised "UpdateTestReport found before CreateTestReport"; the
+    status update then never landed and the report stayed IN_PROGRESS.
+    """
+    log_file = tmp_path / "resume_report_update.jsonl"
+    client = ResultsLowLevelClient(grpc_client=MagicMock())
+
+    # Build the log offline via the simulate path: CreateTestReport + UpdateTestReport.
+    report = await client.create_test_report(test_report=_report_create(), log_file=log_file)
+    update = TestReportUpdate(status=TestStatus.FAILED)
+    update.resource_id = report.id_
+    await client.update_test_report(update=update, log_file=log_file)
+
+    # Pretend an earlier tick already uploaded the CreateTestReport (line 1): the
+    # cursor sits past it and id_map maps the simulated report ID to the real one.
+    LogTracking(last_uploaded_line=1, id_map={report.id_: "real-report"}).save(log_file)
+
+    # Stub the real RPC the resumed tick will issue.
+    client.update_test_report = AsyncMock(return_value=_make_report("real-report"))
+
+    result = await client.import_log_file(log_file, incremental=True)
+
+    client.update_test_report.assert_awaited_once()
+    sent = client.update_test_report.await_args.kwargs["request"]
+    assert sent.test_report.test_report_id == "real-report"
+    assert sent.test_report.status == TestStatus.FAILED.value
+    assert result.report is not None
+    assert result.report.id_ == "real-report"
+
+
+@pytest.mark.asyncio
+async def test_resume_with_only_steps_does_not_require_report(tmp_path):
+    """A resume tick carrying only steps must not demand an in-memory report.
+
+    Pre-fix this raised "No CreateTestReport found in log file" (the error seen
+    in the field report), aborting replay of the remaining step lines.
+    """
+    log_file = tmp_path / "resume_steps_only.jsonl"
+    client = ResultsLowLevelClient(grpc_client=MagicMock())
+
+    report = await client.create_test_report(test_report=_report_create(), log_file=log_file)
+    await client.create_test_step(
+        test_step=TestStepCreate(
+            test_report_id=report.id_,
+            name="s1",
+            step_type=TestStepType.ACTION,
+            step_path="1",
+            status=TestStatus.PASSED,
+            start_time=T0,
+            end_time=T0,
+        ),
+        log_file=log_file,
+    )
+
+    LogTracking(last_uploaded_line=1, id_map={report.id_: "real-report"}).save(log_file)
+
+    client.create_test_step = AsyncMock(return_value=_make_step("real-step"))
+
+    result = await client.import_log_file(log_file, incremental=True)
+
+    client.create_test_step.assert_awaited_once()
+    sent = client.create_test_step.await_args.kwargs["request"]
+    # The step's report ID was remapped from the simulated ID to the real one.
+    assert sent.test_step.test_report_id == "real-report"
+    # The report was created on the earlier tick, so this resume tick has no report.
+    assert result.report is None
+    assert len(result.steps) == 1
diff --git a/python/lib/sift_client/_tests/resources/test_test_results.py b/python/lib/sift_client/_tests/resources/test_test_results.py
index d0ccf4d1b..ce6d7707a 100644
--- a/python/lib/sift_client/_tests/resources/test_test_results.py
+++ b/python/lib/sift_client/_tests/resources/test_test_results.py
@@ -715,6 +715,75 @@ def test_import_log_file_round_trip(self, sift_client, nostromo_run, tmp_path):
             replayed_m = replayed_measurements_by_name[direct_m.name]
             compare_test_measurement_fields(replayed_m, direct_m)
 
+    def test_incremental_import_resumes_after_report_created(
+        self, sift_client, nostromo_run, tmp_path
+    ):
+        """Incremental replay must survive a resume after the report was created.
+
+        Regression: a resume tick rebuilds replay state from scratch, so the
+        CreateTestReport line (already uploaded on an earlier tick) is skipped and
+        the in-memory report is None. The replay must still apply the remaining
+        lines -- including the final UpdateTestReport -- rather than raising
+        "No CreateTestReport found" and leaving the report stuck IN_PROGRESS.
+        """
+        t0 = datetime.now(timezone.utc)
+        log_file = tmp_path / "incremental_resume.jsonl"
+
+        # Build a complete simulation log (no real resources created yet).
+        report = sift_client.test_results.create(
+            {
+                "status": TestStatus.IN_PROGRESS,
+                "name": "Incremental Resume Report",
+                "test_system_name": "IR System",
+                "test_case": "IR Case",
+                "start_time": t0,
+                "end_time": t0 + timedelta(seconds=30),
+                "run_id": nostromo_run.id_,
+            },
+            log_file=log_file,
+        )
+        step = sift_client.test_results.create_step(
+            TestStepCreate(
+                test_report_id=report.id_,
+                name="IR Step 1",
+                step_type=TestStepType.ACTION,
+                step_path="1",
+                status=TestStatus.IN_PROGRESS,
+                start_time=t0,
+                end_time=t0 + timedelta(seconds=10),
+            ),
+            log_file=log_file,
+        )
+        sift_client.test_results.update_step(
+            step,
+            {"status": TestStatus.FAILED},
+            log_file=log_file,
+        )
+        sift_client.test_results.update(
+            test_report=report,
+            update=TestReportUpdate(status=TestStatus.FAILED),
+            log_file=log_file,
+        )
+
+        all_lines = log_file.read_text().splitlines()
+        assert all_lines[0].startswith("[CreateTestReport:")
+
+        # First tick: only the CreateTestReport is present. This creates the real
+        # report and advances the tracking cursor past line 1.
+        log_file.write_text(all_lines[0] + "\n")
+        first = sift_client.test_results.import_log_file(log_file, incremental=True)
+        assert first.report is not None and first.report.id_ is not None
+        real_report_id = first.report.id_
+
+        # Later tick: the rest of the log is now available. Resuming past the
+        # CreateTestReport line must not raise, and the final UpdateTestReport must
+        # land so the report ends FAILED rather than IN_PROGRESS.
+        log_file.write_text("\n".join(all_lines) + "\n")
+        sift_client.test_results.import_log_file(log_file, incremental=True)
+
+        refetched = sift_client.test_results.get(test_report_id=real_report_id)
+        assert refetched.status == TestStatus.FAILED
+
     @pytest.mark.asyncio
     async def test_malformed_log_line_skipped(self, tmp_path):
         """Malformed lines raise a ValueError during iteration."""
diff --git a/python/lib/sift_client/_tests/util/test_report_context.py b/python/lib/sift_client/_tests/util/test_report_context.py
index e92e57bb8..73d738a7d 100644
--- a/python/lib/sift_client/_tests/util/test_report_context.py
+++ b/python/lib/sift_client/_tests/util/test_report_context.py
@@ -76,7 +76,9 @@ def test_worker_timeout_kills_and_warns() -> None:
     assert rc._import_proc.poll() is not None
     messages = "\n".join(str(w.message) for w in recorded)
     assert "did not exit in 0.2s" in messages
-    assert "import-test-result-log" in messages
+    # Recovery must resume from the tracking cursor, not batch-replay (which would
+    # duplicate already-uploaded entries), so the hint carries --incremental.
+    assert "import-test-result-log --incremental" in messages
 
 
 def test_worker_nonzero_exit_warns_stderr_no_raise() -> None:
@@ -96,4 +98,4 @@ def test_worker_nonzero_exit_warns_stderr_no_raise() -> None:
     messages = "\n".join(str(w.message) for w in recorded)
     assert "exited with code 2" in messages
     assert "rpc deadline exceeded" in messages
-    assert "import-test-result-log" in messages
+    assert "import-test-result-log --incremental" in messages
diff --git a/python/lib/sift_client/resources/test_results.py b/python/lib/sift_client/resources/test_results.py
index 9e88b6081..10ef70920 100644
--- a/python/lib/sift_client/resources/test_results.py
+++ b/python/lib/sift_client/resources/test_results.py
@@ -671,7 +671,8 @@ async def import_log_file(
             A ReplayResult containing the created report, steps, and measurements.
         """
         result = await self._low_level_client.import_log_file(log_file, incremental=incremental)
-        result.report = self._apply_client_to_instance(result.report)
+        if result.report is not None:
+            result.report = self._apply_client_to_instance(result.report)
         result.steps = self._apply_client_to_instances(result.steps)
         result.measurements = self._apply_client_to_instances(result.measurements)
         return result
diff --git a/python/lib/sift_client/scripts/import_test_result_log.py b/python/lib/sift_client/scripts/import_test_result_log.py
index 7e14e4d59..3f66af1da 100644
--- a/python/lib/sift_client/scripts/import_test_result_log.py
+++ b/python/lib/sift_client/scripts/import_test_result_log.py
@@ -20,7 +20,8 @@
 
 
 def _print_result(result: ReplayResult) -> None:
-    print(f"Report: {result.report.name} (id={result.report.id_})")
+    if result.report is not None:
+        print(f"Report: {result.report.name} (id={result.report.id_})")
     print(f"Steps:  {len(result.steps)}")
     for step in result.steps:
         print(f"  - {step.step_path} [{step.status}]")
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 4b2e2ab9d..497404c45 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -81,7 +81,7 @@ def log_replay_instructions(log_file: str | Path | None) -> None:
         return
     warnings.warn(
         f"Sift log file was not fully replayed: {log_file}. "
-        f"Re-run with `import-test-result-log {log_file}` to complete the upload.",
+        f"Re-run with `import-test-result-log --incremental {log_file}` to complete the upload.",
         SiftWarning,
         stacklevel=2,
     )

From a3a8f47b32cc79098f8d37d78d4a874611d0f43c Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Tue, 2 Jun 2026 17:51:31 -0700
Subject: [PATCH 16/19] version bump

---
 python/CHANGELOG.md   | 5 +++++
 python/pyproject.toml | 2 +-
 python/uv.lock        | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index 4905ae0d7..2ae4b6a88 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -29,6 +29,11 @@ See the [Pytest Plugin guide](https://github.com/sift-stack/sift/blob/main/pytho
 - [Report assertion message as error info](https://github.com/sift-stack/sift/pull/587)
 - [Pytest docs reorganization](https://github.com/sift-stack/sift/pull/589)
 - [Configurable report name template and preserved pytest command](https://github.com/sift-stack/sift/pull/591)
+- [Use in-process transport to improve test performance](https://github.com/sift-stack/sift/pull/590)
+- [End-of-run report summary panel and session header](https://github.com/sift-stack/sift/pull/594)
+- [Exit instead of raise on connection failure](https://github.com/sift-stack/sift/pull/606)
+- [Flexible report naming and consolidated settings registry](https://github.com/sift-stack/sift/pull/602)
+- [Fix incremental upload resume bug](https://github.com/sift-stack/sift/pull/611)
 
 ## [v0.16.2] - May 21, 2026
 
diff --git a/python/pyproject.toml b/python/pyproject.toml
index fdc16f7c0..b04bce6d3 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.17.0.dev1"
+version = "0.17.0.dev2"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }
diff --git a/python/uv.lock b/python/uv.lock
index 91eaf3c61..d6391b311 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -4315,7 +4315,7 @@ wheels = [
 
 [[package]]
 name = "sift-stack-py"
-version = "0.17.0.dev1"
+version = "0.17.0.dev2"
 source = { editable = "." }
 dependencies = [
     { name = "alive-progress", version = "3.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },

From 223f81e752161e68c2af9007014c6e053615d939 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Wed, 3 Jun 2026 15:09:24 -0700
Subject: [PATCH 17/19] Python(chore): Reorganize pytest code (#610)

---
 .../guides/pytest_plugin/configuration.md     |    2 +-
 .../_internal/pytest_plugin/__init__.py       |    0
 .../_internal/pytest_plugin/modes.py          |   68 +
 .../_internal/pytest_plugin/options.py        |  579 +++++
 .../_internal/pytest_plugin/report.py         |  506 ++++
 .../_internal/pytest_plugin/steps.py          |  310 +++
 .../_internal/pytest_plugin/terminal.py       |  231 ++
 .../pytest_plugin/test_configuration.py       |   45 +-
 .../_tests/pytest_plugin/test_hierarchy.py    |   16 +-
 .../pytest_plugin/test_settings_reference.py  |    8 +-
 .../pytest_plugin/test_terminal_output.py     |   28 +-
 .../pytest_plugin/test_typo_detector.py       |    2 +-
 python/lib/sift_client/pytest_plugin.py       | 2074 +++--------------
 13 files changed, 2067 insertions(+), 1802 deletions(-)
 create mode 100644 python/lib/sift_client/_internal/pytest_plugin/__init__.py
 create mode 100644 python/lib/sift_client/_internal/pytest_plugin/modes.py
 create mode 100644 python/lib/sift_client/_internal/pytest_plugin/options.py
 create mode 100644 python/lib/sift_client/_internal/pytest_plugin/report.py
 create mode 100644 python/lib/sift_client/_internal/pytest_plugin/steps.py
 create mode 100644 python/lib/sift_client/_internal/pytest_plugin/terminal.py

diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
index 7c7114543..a05897cd4 100644
--- a/python/docs/guides/pytest_plugin/configuration.md
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -141,7 +141,7 @@ The plugin scans `SIFT_*` env vars and `[tool.sift.pytest.*]` keys at session
 start; anything outside these tables fires a warning with a closest-match
 suggestion, so typos like `SIFT_REPORT_SERIALNUM` surface immediately.
 
-<!-- BEGIN settings-reference (auto-generated from _OPTIONS in pytest_plugin.py; regenerate via test_settings_reference_docs_in_sync) -->
+<!-- BEGIN settings-reference (auto-generated from PLUGIN_OPTIONS in sift_client/_internal/pytest_plugin/options.py; regenerate via test_settings_reference_docs_in_sync) -->
 ### Pytest behavior
 
 | Setting | CLI flag | Ini (`[tool.pytest.ini_options]`) |
diff --git a/python/lib/sift_client/_internal/pytest_plugin/__init__.py b/python/lib/sift_client/_internal/pytest_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/lib/sift_client/_internal/pytest_plugin/modes.py b/python/lib/sift_client/_internal/pytest_plugin/modes.py
new file mode 100644
index 000000000..317bcfa96
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/modes.py
@@ -0,0 +1,68 @@
+"""Run-mode detection and the per-test Sift gate.
+
+Resolves the active mode (disabled > offline > online) from the ``DISABLED_OPTION`` /
+``OFFLINE_OPTION`` options, and decides whether the Sift autouse fixtures activate for
+a given node via the ``sift_include`` / ``sift_exclude`` markers.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from sift_client._internal.pytest_plugin.options import (
+    AUTOUSE_OPTION,
+    DISABLED_OPTION,
+    OFFLINE_OPTION,
+)
+
+if TYPE_CHECKING:
+    import pytest
+
+
+def is_offline(pytestconfig: pytest.Config | None) -> bool:
+    return bool(OFFLINE_OPTION.resolve(pytestconfig))  # resolve() is None without a config
+
+
+def is_disabled(pytestconfig: pytest.Config | None) -> bool:
+    return bool(DISABLED_OPTION.resolve(pytestconfig))  # resolve() is None without a config
+
+
+def sdk_version() -> str:
+    """Installed ``sift_stack_py`` version, or ``"unknown"`` when it is not installed."""
+    import importlib.metadata as _metadata
+
+    try:
+        return _metadata.version("sift_stack_py")
+    except _metadata.PackageNotFoundError:
+        return "unknown"
+
+
+def mode_label(config: pytest.Config) -> str:
+    """Terminal-header mode name; precedence is disabled, then offline, then online."""
+    if is_disabled(config):
+        label = "disabled"
+    else:
+        label = "offline" if is_offline(config) else "online"
+    return label
+
+
+def sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
+    """Sift gate for ``node``: sift_exclude beats sift_include, which beats ``default``.
+
+    Markers are looked up with ``get_closest_marker``, so one applied on an
+    enclosing level (class, module, package, session) counts for the node too.
+    """
+    excluded = node.get_closest_marker("sift_exclude")
+    if excluded:
+        return False
+    included = node.get_closest_marker("sift_include")
+    return True if included else default
+
+
+def gate_enabled(node: pytest.Item | pytest.Collector, config: pytest.Config) -> bool:
+    """Single entry point deciding if the Sift autouse fixtures run for ``node``.
+
+    The ``sift_autouse`` ini value is the default; per-node markers can override it.
+    """
+    autouse_default = bool(AUTOUSE_OPTION.resolve(config))
+    return sift_enabled_for(node, autouse_default)
diff --git a/python/lib/sift_client/_internal/pytest_plugin/options.py b/python/lib/sift_client/_internal/pytest_plugin/options.py
new file mode 100644
index 000000000..c3b6801a1
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/options.py
@@ -0,0 +1,579 @@
+"""Declarative settings registry for the Sift pytest plugin.
+
+Every plugin setting is declared once as an :class:`Option` in the ``PLUGIN_OPTIONS``
+registry. That single registry drives ``pytest_addoption``, value resolution,
+the docs settings-reference table, and the unknown-key typo detector, so a
+setting is added or changed in one place instead of wired up across several.
+"""
+
+from __future__ import annotations
+
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Any
+
+import pytest
+
+from sift_client._internal.pyproject_config import load_tool_sift
+
+# Settings-reference categories. Each maps to a docs subsection and, in the
+# renderer, to the column subset that category actually uses.
+CAT_BEHAVIOR = "Pytest behavior"
+CAT_CONNECTION = "Connection"
+CAT_REPORT = "Report content"
+CATEGORIES = (CAT_BEHAVIOR, CAT_CONNECTION, CAT_REPORT)  # order = subsection order in the docs
+
+tool_sift_key = pytest.StashKey[dict]()  # config.stash slot for the parsed [tool.sift] table
+
+
+def tool_sift(config: pytest.Config | None) -> dict[str, Any]:
+    """Return the ``[tool.sift]`` table, cached on the config for the session.
+
+    The parsed table is kept in ``config.stash`` so the options that read TOML
+    and the typo detector don't each re-parse pyproject.toml (and re-emit the
+    malformed-file warning) at session start. ``load_tool_sift`` remains the
+    uncached parser for direct callers; without a config the result is ``{}``.
+    """
+    if config is None:
+        return {}
+    table = config.stash.get(tool_sift_key, None)
+    if table is not None:
+        return table
+    table = config.stash[tool_sift_key] = load_tool_sift(config)
+    return table
+
+
+@dataclass(frozen=True)
+class Option:
+    """A single setting plus the logic to resolve it from wherever it can be set.
+
+    A setting may come from an env var, a CLI flag, a pytest ini key, or a
+    ``[tool.sift...]`` TOML path. :meth:`resolve` walks the declared surfaces in
+    env > cli > ini > toml order; ``metadata`` (``merge=True``) is the one
+    free-form table, resolved by :meth:`resolve_merged`. The single ``PLUGIN_OPTIONS``
+    registry of these drives ``pytest_addoption``, the resolvers, the docs
+    settings-reference table, and the typo detector.
+
+    Declare only the surface fields a setting uses:
+
+    - ``cli`` / ``cli_action``: CLI flag and argparse action (``cli_dest`` derived).
+    - ``ini`` / ``ini_type`` / ``ini_default``: pytest ini key + type/default.
+    - ``toml``: tuple path under ``[tool.sift...]``, e.g.
+      ``("pytest", "report", "name")`` -> ``tool.sift.pytest.report.name``.
+    - ``env``: full env var name, e.g. ``"SIFT_API_KEY"``.
+
+    ``category`` groups the option in the docs reference (one of ``CATEGORIES``).
+    """
+
+    name: str
+    help: str
+    category: str
+    cli: str | None = None
+    cli_action: str | None = None
+    ini: str | None = None
+    ini_type: str | None = None
+    ini_default: Any = None
+    toml: tuple[str, ...] | None = None
+    env: str | None = None
+    merge: bool = False
+
+    @property
+    def cli_dest(self) -> str:
+        """Argparse ``dest`` for the option.
+
+        When the option has both a CLI flag and an ini key, the dest matches
+        the ini name so ``config.getoption(ini_name)`` returns the CLI value
+        (and falls through to ``config.getini(ini_name)`` when the flag wasn't
+        passed). Without an ini key, the dest derives from the flag name.
+        """
+        if self.ini:
+            return self.ini
+        if self.cli is None:
+            return self.name
+        return self.cli.lstrip("-").replace("-", "_")
+
+    def __post_init__(self) -> None:
+        if self.cli_action and not self.cli:
+            raise ValueError(f"Option({self.name!r}): cli_action requires cli")
+        if self.ini_type and not self.ini:
+            raise ValueError(f"Option({self.name!r}): ini_type requires ini")
+        if self.merge and not self.toml:
+            raise ValueError(f"Option({self.name!r}): merge=True needs toml")
+        if not any([self.cli, self.ini, self.toml, self.env]):
+            raise ValueError(f"Option({self.name!r}): declares no surfaces")
+        if self.category not in CATEGORIES:
+            raise ValueError(f"Option({self.name!r}): category must be one of {CATEGORIES}")
+
+    def resolve(self, config: pytest.Config | None) -> Any:
+        """First set value from declared surfaces; ``None`` when unset everywhere.
+
+        Walk order is env > cli > ini > toml; env values are returned as raw
+        strings. No current option declares both env and cli, so the chain isn't
+        ambiguous in practice. ``getini`` returns the typed default for unset
+        bool/list keys, so this only returns ini values for booleans (always
+        meaningful), non-empty strings, and non-empty lists.
+        """
+        if self.env:
+            env_value = os.getenv(self.env)
+            if env_value not in (None, ""):
+                return env_value
+        if config is None:
+            return None
+        if self.cli:
+            cli_value = config.getoption(self.cli_dest, default=None)
+            if cli_value is not None:
+                return cli_value
+        if self.ini:
+            try:
+                ini_value = config.getini(self.ini)
+            except (KeyError, ValueError):
+                ini_value = None
+            if isinstance(ini_value, bool):
+                return ini_value
+            if isinstance(ini_value, str) and ini_value:
+                return ini_value
+            if isinstance(ini_value, list) and ini_value:
+                return ini_value
+        if self.toml:
+            toml_value = _walk_toml(tool_sift(config), self.toml)
+            if toml_value not in (None, ""):
+                return toml_value
+        return None
+
+    def resolve_merged(self, config: pytest.Config | None) -> dict[str, str | float | bool]:
+        """For ``merge=True`` dict-shape settings: the free-form TOML table.
+
+        TOML values that don't fit ``dict[str, str | float | bool]`` (nested
+        tables, lists, ``None``) are dropped with a warning so a malformed
+        entry can't crash report creation.
+        """
+        from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+        result: dict[str, str | float | bool] = {}
+        if config is not None and self.toml:
+            base = _walk_toml(tool_sift(config), self.toml)
+            if isinstance(base, dict):
+                for key, value in base.items():
+                    if not isinstance(key, str):
+                        continue
+                    if isinstance(value, (bool, str, int, float)):
+                        # ``bool`` is an ``int`` subclass, so it would pass even if not listed.
+                        result[key] = value  # type: ignore[assignment]
+                        continue
+                    warnings.warn(
+                        f"[tool.sift.{'.'.join(self.toml)}] entry {key!r} ignored: "
+                        f"unsupported type {type(value).__name__}.",
+                        SiftPytestPluginWarning,
+                        stacklevel=2,
+                    )
+        return result
+
+
+def _walk_toml(data: dict[str, Any], path: tuple[str, ...]) -> Any:
+    """Follow ``path`` down a parsed TOML tree; ``None`` as soon as a key is missing."""
+    node: Any = data
+    for segment in path:
+        # Only tables can be descended into; a scalar or list part-way down
+        # the path is treated the same as a missing key.
+        node = node.get(segment) if isinstance(node, dict) else None
+        if node is None:
+            return None
+    return node
+
+
+# ---------------------------------------------------------------------------
+# Settings registry.
+#
+# Add new options here. The registry drives `pytest_addoption`, resolution,
+# the docs settings-reference table, and the unknown-key typo detector, so a
+# setting is declared once instead of wired up in several places.
+#
+# Where each setting lives follows a few principles:
+#   - Secrets (the API key) come from environment variables only, never a
+#     committed file.
+#   - Pytest behavior lives in [tool.pytest.ini_options] so it integrates with
+#     `pytest --help` / `--co` / `--trace-config`.
+#   - Sift report content lives in [tool.sift.pytest.report.*].
+#   - Non-secret endpoints take an env var plus one static home (ini or toml,
+#     not both).
+#   - A CLI flag is added only when there is a real per-run override workflow;
+#     stable project config stays in ini/toml.
+#   - Dynamic per-run values are injected via environment variables (pytest-dotenv
+#     loads .env for local dev; CI sets the same names from its secret store).
+# ---------------------------------------------------------------------------
+
+# Pytest behavior. These keep a CLI flag because a per-run override is a real workflow.
+LOG_FILE_OPTION = Option(
+    name="log_file",
+    category=CAT_BEHAVIOR,
+    help="Path to the JSONL log of create/update calls (path | true | false | none).",
+    cli="--sift-log-file",
+    ini="sift_log_file",
+)
+GIT_METADATA_OPTION = Option(
+    name="git_metadata",
+    category=CAT_BEHAVIOR,
+    help="Capture git repo/branch/commit on the report.",
+    cli="--no-sift-git-metadata",
+    cli_action="store_false",
+    ini="sift_git_metadata",
+    ini_type="bool",
+    ini_default=True,
+)
+OFFLINE_OPTION = Option(
+    name="offline",
+    category=CAT_BEHAVIOR,
+    help="Skip the session-start ping; route create/update through the JSONL log.",
+    cli="--sift-offline",
+    cli_action="store_true",
+    ini="sift_offline",
+    ini_type="bool",
+    ini_default=False,
+)
+DISABLED_OPTION = Option(
+    name="disabled",
+    category=CAT_BEHAVIOR,
+    help="Disable Sift entirely (no API calls, no log file). Supersedes --sift-offline.",
+    cli="--sift-disabled",
+    cli_action="store_true",
+    ini="sift_disabled",
+    ini_type="bool",
+    ini_default=False,
+)
+
+OPEN_OPTION = Option(
+    name="open_report",
+    category=CAT_BEHAVIOR,
+    help="Open the resulting report in a browser at session end (online only; "
+    "no-op when the report URL can't be resolved).",
+    cli="--sift-open-report",
+    cli_action="store_true",
+    ini="sift_open_report",
+    ini_type="bool",
+    ini_default=False,
+)
+
+# Pytest behavior: set-once project defaults (ini only: no CLI flag, no per-run override).
+AUTOUSE_OPTION = Option(
+    name="autouse",
+    category=CAT_BEHAVIOR,
+    help="Default for the Sift autouse fixtures (report_context, step, hierarchy/parametrize parents).",
+    ini="sift_autouse",
+    ini_type="bool",
+    ini_default=True,
+)
+PACKAGE_STEP_OPTION = Option(
+    name="package_step",
+    category=CAT_BEHAVIOR,
+    help="Open a parent step for each Python package in the test path.",
+    ini="sift_package_step",
+    ini_type="bool",
+    ini_default=True,
+)
+MODULE_STEP_OPTION = Option(
+    name="module_step",
+    category=CAT_BEHAVIOR,
+    help="Open a parent step for each test module.",
+    ini="sift_module_step",
+    ini_type="bool",
+    ini_default=True,
+)
+CLASS_STEP_OPTION = Option(
+    name="class_step",
+    category=CAT_BEHAVIOR,
+    help="Open per-class parent steps, including nested classes.",
+    ini="sift_class_step",
+    ini_type="bool",
+    ini_default=True,
+)
+PARAMETRIZE_NESTING_OPTION = Option(
+    name="parametrize_nesting",
+    category=CAT_BEHAVIOR,
+    help="Cluster parametrized tests under shared parent steps (e.g. test_a -> v=1, v=2).",
+    ini="sift_parametrize_nesting",
+    ini_type="bool",
+    ini_default=True,
+)
+
+# Connection. The API key is env-only; the endpoint URIs and app URL accept env + ini.
+API_KEY_OPTION = Option(
+    name="api_key",
+    category=CAT_CONNECTION,
+    help="Sift API key (secret, env-only).",
+    env="SIFT_API_KEY",
+)
+GRPC_URI_OPTION = Option(
+    name="grpc_uri",
+    category=CAT_CONNECTION,
+    help="Sift gRPC endpoint URI.",
+    env="SIFT_GRPC_URI",
+    ini="sift_grpc_uri",
+)
+REST_URI_OPTION = Option(
+    name="rest_uri",
+    category=CAT_CONNECTION,
+    help="Sift REST endpoint URI.",
+    env="SIFT_REST_URI",
+    ini="sift_rest_uri",
+)
+APP_URL_OPTION = Option(
+    name="app_url",
+    category=CAT_CONNECTION,
+    help="Sift web-app origin for the report link in the terminal footer (e.g. "
+    "https://app.siftstack.com). When unset, the link is derived from the REST URI "
+    "for known Sift hosts.",
+    env="SIFT_APP_URL",
+    ini="sift_app_url",
+)
+
+# Report content. Project defaults in [tool.sift.pytest.report]; CI injects
+# per-run values via SIFT_REPORT_* env vars (pytest-dotenv handles .env files
+# for local dev). report_name, test_case and metadata are TOML-only.
+REPORT_NAME_OPTION = Option(
+    name="report_name",
+    category=CAT_REPORT,
+    help="Template for the report display name. Placeholders: {target}, {command}, {args}, "
+    "{rootdir}, {timestamp}, {count}, {git_repo}, {git_branch}, {git_commit}.",
+    toml=("pytest", "report", "name"),
+)
+TEST_CASE_OPTION = Option(
+    name="test_case",
+    category=CAT_REPORT,
+    help="Template for the report's test_case field (same placeholders as report_name).",
+    toml=("pytest", "report", "test_case"),
+)
+TEST_SYSTEM_NAME_OPTION = Option(
+    name="test_system_name",
+    category=CAT_REPORT,
+    help="Name of the test system / rig. Defaults to the host's name.",
+    env="SIFT_REPORT_TEST_SYSTEM_NAME",
+    toml=("pytest", "report", "test_system_name"),
+)
+SYSTEM_OPERATOR_OPTION = Option(
+    name="system_operator",
+    category=CAT_REPORT,
+    help="Operator running the test. Defaults to the OS user.",
+    env="SIFT_REPORT_SYSTEM_OPERATOR",
+    toml=("pytest", "report", "system_operator"),
+)
+SERIAL_NUMBER_OPTION = Option(
+    name="serial_number",
+    category=CAT_REPORT,
+    help="Serial number of the unit under test.",
+    env="SIFT_REPORT_SERIAL_NUMBER",
+    toml=("pytest", "report", "serial_number"),
+)
+PART_NUMBER_OPTION = Option(
+    name="part_number",
+    category=CAT_REPORT,
+    help="Part number of the unit under test.",
+    env="SIFT_REPORT_PART_NUMBER",
+    toml=("pytest", "report", "part_number"),
+)
+METADATA_OPTION = Option(
+    name="metadata",
+    category=CAT_REPORT,
+    help="Free-form report metadata, as a TOML table of scalar values. For "
+    "dynamic per-run keys, attach them in conftest via the report_context fixture.",
+    toml=("pytest", "report", "metadata"),
+    merge=True,
+)
+
+PLUGIN_OPTIONS: tuple[Option, ...] = (  # order = registration order and docs row order
+    LOG_FILE_OPTION,
+    GIT_METADATA_OPTION,
+    OFFLINE_OPTION,
+    DISABLED_OPTION,
+    OPEN_OPTION,
+    AUTOUSE_OPTION,
+    PACKAGE_STEP_OPTION,
+    MODULE_STEP_OPTION,
+    CLASS_STEP_OPTION,
+    PARAMETRIZE_NESTING_OPTION,
+    API_KEY_OPTION,
+    GRPC_URI_OPTION,
+    REST_URI_OPTION,
+    APP_URL_OPTION,
+    REPORT_NAME_OPTION,
+    TEST_CASE_OPTION,
+    TEST_SYSTEM_NAME_OPTION,
+    SYSTEM_OPERATOR_OPTION,
+    SERIAL_NUMBER_OPTION,
+    PART_NUMBER_OPTION,
+    METADATA_OPTION,
+)
+
+
+def register_options(parser: pytest.Parser) -> None:
+    """Wire each registry entry's CLI flag and ini key into the pytest parser.
+
+    Both surfaces come from the same pass over ``PLUGIN_OPTIONS``, so a new
+    setting needs one registry entry rather than edits to ``pytest_addoption``.
+    """
+    sift_group = parser.getgroup("sift", description="Sift test results")
+    for entry in PLUGIN_OPTIONS:
+        if entry.cli is not None:
+            # ``default=None`` keeps "flag not passed" distinguishable from an
+            # explicit value when the option is resolved later.
+            flags: dict[str, Any] = {"dest": entry.cli_dest, "default": None, "help": entry.help}
+            if entry.cli_action is not None:
+                flags["action"] = entry.cli_action
+            sift_group.addoption(entry.cli, **flags)
+        if entry.ini is None:
+            continue
+        # ``type`` is forwarded only when the entry declares one.
+        ini_kwargs: dict[str, Any] = {"default": entry.ini_default, "help": entry.help}
+        if entry.ini_type is not None:
+            ini_kwargs["type"] = entry.ini_type
+        parser.addini(entry.ini, **ini_kwargs)
+
+
+def render_settings_reference() -> str:
+    """Render the Markdown settings reference from ``PLUGIN_OPTIONS``.
+
+    One ``### <category>`` subsection per category. Each row leads with the
+    option's ``help`` text (the "Setting" column), then only the columns that
+    category uses (so no dead all-``—`` columns). The plugin docs at
+    ``docs/guides/pytest_plugin/configuration.md`` embed this output verbatim so
+    the two can't drift; ``test_settings_reference_docs_in_sync`` guards it. Regenerate with::
+
+        uv run python -c "from sift_client._internal.pytest_plugin.options import render_settings_reference; print(render_settings_reference())"
+    """
+
+    def _cli_cell(opt: Option) -> str:
+        return f"`{opt.cli}`" if opt.cli else "—"
+
+    def _ini_cell(opt: Option) -> str:
+        return f"`{opt.ini}`" if opt.ini else "—"
+
+    def _toml_cell(opt: Option) -> str:
+        if not opt.toml:
+            return "—"
+        if opt.merge:
+            return f"`[tool.sift.{'.'.join(opt.toml)}]` (table)"
+        section = ".".join(opt.toml[:-1])
+        return f"`[tool.sift.{section}] {opt.toml[-1]}`"
+
+    def _env_cell(opt: Option) -> str:
+        if opt.env:
+            return f"`{opt.env}`"
+        return "—"
+
+    # Per-category column layout: only the surfaces that category actually uses.
+    # Each column is a (header, cell-renderer) pair; the renderer takes the Option.
+    columns_by_category = {
+        CAT_BEHAVIOR: [
+            ("CLI flag", _cli_cell),
+            ("Ini (`[tool.pytest.ini_options]`)", _ini_cell),
+        ],
+        CAT_CONNECTION: [
+            ("Ini (`[tool.pytest.ini_options]`)", _ini_cell),
+            ("Env var", _env_cell),
+        ],
+        CAT_REPORT: [
+            ("TOML (`[tool.sift...]`)", _toml_cell),
+            ("Env var", _env_cell),
+        ],
+    }
+
+    def _escape(cell: str) -> str:
+        # Literal pipes inside a Markdown table cell need backslash escaping or
+        # they'd be parsed as column separators.
+        return cell.replace("|", "\\|")
+
+    blocks: list[str] = []
+    for category in CATEGORIES:
+        opts = [o for o in PLUGIN_OPTIONS if o.category == category]
+        if not opts:
+            continue
+        columns = columns_by_category[category]
+        headers = ["Setting", *(h for h, _ in columns)]
+        lines = [
+            f"### {category}",
+            "",
+            "| " + " | ".join(headers) + " |",
+            "|" + "|".join(["---"] * len(headers)) + "|",
+        ]
+        for opt in opts:
+            cells = [opt.help, *(render(opt) for _, render in columns)]
+            lines.append("| " + " | ".join(_escape(c) for c in cells) + " |")
+        blocks.append("\n".join(lines))
+    return "\n\n".join(blocks)
+
+
+def warn_on_unknown_env_vars() -> None:
+    """Warn about every ``SIFT_*`` env var that no registry option declares.
+
+    Options declare env vars by full name (``opt.env``), so a ``SIFT_*`` name
+    matching none of them is most likely a typo; the closest declared name,
+    if any, is offered as a suggestion.
+    """
+    import difflib
+
+    from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+    declared = {option.env for option in PLUGIN_OPTIONS if option.env}
+    candidates = sorted(declared)
+    for name in sorted(os.environ):
+        if name.startswith("SIFT_") and name not in declared:
+            close = difflib.get_close_matches(name, candidates, n=1, cutoff=0.6)
+            hint = ""
+            if close:
+                hint = f" (did you mean `{close[0]}`?)"
+            warnings.warn(
+                f"Unknown SIFT_* env var `{name}`{hint}; ignored.",
+                SiftPytestPluginWarning,
+                stacklevel=2,
+            )
+
+
+def warn_on_unknown_toml_keys(config: pytest.Config) -> None:
+    """Warn about keys under ``[tool.sift.pytest.*]`` that the registry doesn't declare.
+
+    Only the ``tool.sift.pytest`` subtree is inspected; other ``tool.sift.*``
+    subtrees belong to non-pytest Sift tooling (e.g. ``tool.sift.extras`` feeds
+    this repo's extras-generation script). The walk does not descend into
+    free-form tables (``merge=True`` options such as ``metadata``), whose keys
+    are user-defined and therefore not validated.
+    """
+    import difflib
+
+    from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+    data = tool_sift(config)
+    pytest_table = (data or {}).get("pytest")
+    if not isinstance(pytest_table, dict):
+        return
+    # Registry paths already start at ``("pytest", ...)``, which is also the
+    # root the walk below starts from.
+    leaves = {o.toml for o in PLUGIN_OPTIONS if o.toml and not o.merge}
+    free_form = {o.toml for o in PLUGIN_OPTIONS if o.toml and o.merge}
+    declared = leaves | free_form
+    # Every proper prefix of a declared path is an intermediate table.
+    prefixes: set[tuple[str, ...]] = {p[:i] for p in declared for i in range(len(p))}
+    known = declared | prefixes
+
+    def _walk(node: Any, base: tuple[str, ...]) -> None:
+        if not isinstance(node, dict) or base in free_form:
+            return
+        for key, value in node.items():
+            path = (*base, str(key))
+            if path in declared:
+                continue
+            if path in prefixes:
+                _walk(value, path)
+                continue
+            dotted = ".".join(path)
+            # Suggestions are drawn only from known paths of the same depth.
+            same_depth = [".".join(p) for p in known if len(p) == len(path)]
+            close = difflib.get_close_matches(dotted, same_depth, n=1, cutoff=0.6)
+            hint = f" (did you mean `tool.sift.{close[0]}`?)" if close else ""
+            full_name = "tool.sift." + dotted
+            warnings.warn(
+                f"Unknown sift config key `{full_name}`{hint}; ignored.",
+                SiftPytestPluginWarning,
+                stacklevel=2,
+            )
+
+    _walk(pytest_table, ("pytest",))
diff --git a/python/lib/sift_client/_internal/pytest_plugin/report.py b/python/lib/sift_client/_internal/pytest_plugin/report.py
new file mode 100644
index 000000000..5ce0590f1
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/report.py
@@ -0,0 +1,506 @@
+"""Report construction, status resolution, and step creation.
+
+Builds the session ``ReportContext`` from resolved settings (name/test_case
+templates, log-file mode, credentials for disabled mode), resolves a function
+step's status from pytest's per-phase reports, and finalizes after teardown.
+``report_context_impl`` is a pure generator that yields the context; the
+plugin's ``report_context`` fixture owns the module-level ``REPORT_CONTEXT``.
+"""
+
+from __future__ import annotations
+
+import os
+import warnings
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.pytest_plugin.modes import is_offline
+from sift_client._internal.pytest_plugin.options import (
+    GIT_METADATA_OPTION,
+    LOG_FILE_OPTION,
+    METADATA_OPTION,
+    PARAMETRIZE_NESTING_OPTION,
+    PART_NUMBER_OPTION,
+    REPORT_NAME_OPTION,
+    SERIAL_NUMBER_OPTION,
+    SYSTEM_OPERATOR_OPTION,
+    TEST_CASE_OPTION,
+    TEST_SYSTEM_NAME_OPTION,
+)
+from sift_client._internal.pytest_plugin.steps import (
+    drain_hierarchy_stack,
+    drain_parametrize_stack,
+    parametrize_path_key,
+)
+from sift_client.sift_types.test_report import ErrorInfo, TestStatus
+from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import (
+    _git_metadata,
+    format_assertion_message,
+    format_truncated_traceback,
+)
+
+if TYPE_CHECKING:
+    from sift_client.util.test_results.context_manager import NewStep
+
+
+def resolve_real_report_id(context: Any) -> str | None:
+    """Return the server-side id of the session report, or ``None``.
+
+    Used to build the online footer link. Two shapes of report reach here:
+
+    * Not simulated (synchronous online mode, ``--sift-log-file=false``): created
+      directly against the API, so ``report.id_`` already is the real id.
+    * Simulated (default incremental mode): ``report.id_`` is a client-side UUID;
+      the background worker maps it to the real id on replay and records that in
+      the ``<log>.tracking`` sidecar's ``id_map``. The worker has drained by the
+      time the footer runs, so the sidecar is final.
+
+    ``None`` means nothing to link: no id assigned, no log file, or no mapping.
+    """
+    report = context.report
+    raw_id = report.id_
+    if not raw_id:
+        return None
+    local_id = str(raw_id)
+    if not getattr(report, "is_simulated", False):
+        return local_id
+    sidecar_source = getattr(context, "log_file", None)
+    if sidecar_source is None:
+        return None
+    from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+
+    return LogTracking.load(sidecar_source).id_map.get(local_id)
+
+
+def resolve_report_link(context: Any, offline: bool) -> tuple[str | None, str | None]:
+    """Compute the ``(report_id, report_url)`` pair shown in the terminal footer.
+
+    An offline run never uploads, so its id is ``None``; otherwise the id is
+    whatever ``resolve_real_report_id`` returns. A URL is produced only when the
+    id and the client's ``app_url`` are both truthy. The check is deliberately
+    truthiness rather than ``is not None``: an empty id (degenerate sidecar
+    mapping, unset proto field) must read as "not uploaded" instead of yielding
+    a bare ``/test-results/`` link.
+    """
+    if offline:
+        return None, None
+    report_id = resolve_real_report_id(context)
+    if report_id and context.client.app_url:
+        return report_id, f"{context.client.app_url}/test-results/{report_id}"
+    return report_id, None
+
+
+def error_info_from_longrepr(longrepr: Any) -> ErrorInfo:
+    """Wrap ``longrepr`` in an ``ErrorInfo`` (code 1) for reports with no Python exception."""
+    return ErrorInfo(error_message="" if longrepr is None else str(longrepr), error_code=1)
+
+
+def resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
+    """Resolve the function step's status from pytest's per-phase reports.
+
+    Reads ``_sift_phase_setup`` / ``_sift_phase_call`` off ``item`` plus its xfail
+    marker: failed setup -> ERROR, skipped setup -> SKIPPED, no call phase -> status
+    left as is, otherwise the call outcome decides. The result is written onto
+    ``new_step.current_step`` in place and ``new_step._sift_managed_externally`` is
+    flipped so ``NewStep.__exit__`` emits it without re-classifying.
+
+    A ``passed`` call phase without ``wasxfail`` returns early, leaving the step
+    and flag alone so the default ``__exit__`` resolution stays in charge.
+    """
+    current_step = new_step.current_step
+    if current_step is None:
+        # The step never opened (the autouse fixture short-circuited or was
+        # disabled). Nothing to resolve.
+        return
+    setup_phase = getattr(item, "_sift_phase_setup", None)
+    call_phase = getattr(item, "_sift_phase_call", None)
+    xfail_marker = item.get_closest_marker("xfail")
+    xfail_runs = xfail_marker.kwargs.get("run", True) if xfail_marker is not None else True
+
+    status: TestStatus | None = None
+    error_info: ErrorInfo | None = None
+    keep_managed = False
+
+    if setup_phase is not None and setup_phase.report.outcome == "failed":
+        status = TestStatus.ERROR
+        excinfo = setup_phase.call.excinfo
+        if excinfo is not None:
+            error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+        else:
+            error_info = error_info_from_longrepr(setup_phase.report.longrepr)
+    elif setup_phase is not None and setup_phase.report.outcome == "skipped":
+        status = TestStatus.SKIPPED
+    elif call_phase is None:
+        # Setup completed but no call-phase report was stashed, i.e. the session
+        # was aborted (e.g. by KeyboardInterrupt) before the plugin could observe
+        # the outcome. Leave the status untouched (still IN_PROGRESS) so the
+        # report does not lie about a clean pass.
+        keep_managed = True
+    else:
+        wasxfail = getattr(call_phase.report, "wasxfail", None)
+        if wasxfail is not None:
+            if call_phase.report.outcome == "failed":
+                # Strict xpass: pytest synthesizes a failure when an xfail(strict=True)
+                # test unexpectedly passes. The xfail mark no longer matches reality.
+                status = TestStatus.FAILED
+            elif call_phase.report.outcome == "skipped":
+                if xfail_marker is not None and xfail_runs is False:
+                    # xfail(run=False): the test body never executed.
+                    status = TestStatus.SKIPPED
+                else:
+                    # xfail + expected failure: the test fulfilled its xfail expectation.
+                    status = TestStatus.PASSED
+            else:
+                # Non-strict xpass: passes that weren't required to fail.
+                status = TestStatus.PASSED
+        elif call_phase.report.outcome == "passed":
+            # Default __exit__ resolves PASSED/FAILED from open_step_results and any
+            # status the test code may have set. Don't override it here.
+            return
+        elif call_phase.report.outcome == "skipped":
+            status = TestStatus.SKIPPED
+        elif call_phase.report.outcome == "failed":
+            excinfo = call_phase.call.excinfo
+            children_passed = new_step.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
+            if excinfo is None:
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, AssertionError):
+                status = TestStatus.FAILED
+                error_info = format_assertion_message(excinfo.type, excinfo.value)
+            elif isinstance(excinfo.value, pytest.fail.Exception):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
+                # Hard exits the plugin can observe: pytest converted the
+                # raise into a call-phase report. The session-aborting variant
+                # (call_phase is None) lands earlier and stays IN_PROGRESS.
+                status = TestStatus.ABORTED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif xfail_marker is not None:
+                # xfail(raises=X) with a non-matching exception: the contract failed.
+                status = TestStatus.FAILED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif not children_passed:
+                # A substep already recorded the error and carries the traceback;
+                # the test step only inherits the child-failed signal.
+                status = TestStatus.FAILED
+            else:
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+
+    if status is None and not keep_managed:
+        return
+
+    if status is not None:
+        # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
+        current_step.__dict__["status"] = status
+        if error_info is not None:
+            current_step.__dict__["error_info"] = error_info
+    new_step._sift_managed_externally = True
+
+
+def finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestReport) -> None:
+    """Flip an already-closed PASSED step to FAILED when teardown failed.
+
+    The teardown makereport hook fires after the autouse step fixture has
+    exited, so the status is overridden with a second ``step.update`` and the
+    failure is propagated to the still-open parent step.
+    """
+    new_step: NewStep | None = getattr(item, "_sift_step", None)
+    if new_step is None or new_step.current_step is None:
+        return
+    closed = new_step.current_step
+    # Only a step that closed as PASSED is upgraded; other statuses are kept.
+    if teardown_report.outcome != "failed" or closed.status != TestStatus.PASSED:
+        return
+    closed.update({"status": TestStatus.FAILED})
+    new_step.report_context.mark_step_failed_after_close(closed)
+
+
+def _relativize(path: Path, rootpath: Path) -> str:
+    """Relative form of ``path`` under ``rootpath`` (empty for the root); basename if outside."""
+    try:
+        relative = str(path.relative_to(rootpath))
+    except ValueError:
+        return path.name
+    return relative if relative != "." else ""
+
+
+def _strip_param(nodeid: str) -> str:
+    """Drop the ``[param]`` suffix from a nodeid, keeping ``file::Class::func``.
+
+    The file path (before the first ``::``) is kept verbatim and the rest is cut
+    at its first ``[``: class/function names never contain one, so a param id
+    that itself contains ``::`` or ``[`` is still removed whole.
+    """
+    path, sep, rest = nodeid.partition("::")
+    if not sep:
+        return nodeid.split("[", 1)[0]
+    return path + sep + rest.split("[", 1)[0]
+
+
+def derive_target(request: pytest.FixtureRequest, args: tuple[str, ...]) -> str:
+    """Summarize what the session ran, based on collection rather than argv.
+
+    The collected items are the ground truth of selection: they are independent
+    of flag order, ``-k`` / ``-m`` filters, and which path form was typed. Every
+    result is anchored to the rootdir (project) name so the shape is uniform,
+    and it narrows with the selection:
+
+    * a single test -> ``project/tests/test_motor.py::test_spin`` (param stripped)
+    * a single file -> ``project/tests/test_motor.py``
+    * many files    -> their common directory, ``project/tests/motor``
+    * whole tree / nothing collected / no item paths -> ``project``
+
+    The report is session-level and individual tests are its steps, so the
+    file/directory grain is the natural unit of "what ran" for the report
+    itself. The verbatim invocation stays available via ``{command}`` and the
+    ``pytest_command`` metadata key.
+
+    ``args`` is not consulted by this function.
+    """
+    rootpath = request.config.rootpath
+    project = rootpath.name
+    collected = list(getattr(request.session, "items", ()) or ())
+    # An empty ``relative`` means "just the project name".
+    relative = ""
+    if len(collected) == 1:
+        relative = _strip_param(collected[0].nodeid)
+    elif collected:
+        candidates = (getattr(entry, "path", None) for entry in collected)
+        files = {found for found in candidates if found is not None}
+        if len(files) == 1:
+            relative = _relativize(files.pop(), rootpath)
+        elif files:
+            try:
+                shared = Path(os.path.commonpath([str(found) for found in files]))
+            except ValueError:
+                # e.g. paths on different drives (Windows); fall back to the project.
+                return project
+            relative = _relativize(shared, rootpath)
+    return f"{project}/{relative}" if relative else project
+
+
+def build_template_fields(
+    target: str,
+    command: str,
+    args: tuple[str, ...],
+    request: pytest.FixtureRequest,
+) -> dict[str, Any]:
+    """Assemble the placeholder values available to the name and test_case templates.
+
+    ``timestamp`` is the current UTC time in ISO format and ``count`` is the number
+    of collected items; git entries default to ``""`` when no git metadata exists.
+    """
+    collected = getattr(request.session, "items", ()) or ()
+    git_info = _git_metadata() or {}
+    fields: dict[str, Any] = {"target": target, "command": command, "args": " ".join(args)}
+    fields["rootdir"] = request.config.rootpath.name
+    fields["timestamp"] = datetime.now(timezone.utc).isoformat()
+    fields["count"] = len(collected)
+    for git_key in ("git_repo", "git_branch", "git_commit"):
+        fields[git_key] = git_info.get(git_key, "")
+    return fields
+
+
+def format_template(
+    template: str,
+    fields: dict[str, Any],
+    *,
+    fallback: str,
+    option_label: str,
+) -> str:
+    """Format ``template`` with ``fields``; on bad input, warn and return ``fallback``.
+
+    A bad template must never block results from being recorded, so every error
+    ``str.format`` can raise for one (unknown or positional field, bad spec, and
+    ``{field.attr}`` / ``{field[key]}`` lookups that fail) becomes a warning.
+    """
+    from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+    try:
+        return template.format(**fields)
+    except (KeyError, IndexError, ValueError, AttributeError, TypeError) as exc:
+        warnings.warn(
+            f"Invalid {option_label} template {template!r} ({exc}); using fallback.",
+            SiftPytestPluginWarning,
+            stacklevel=2,
+        )
+        return fallback
+
+
+def resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
+    """Translate the ``sift_log_file`` option (CLI flag or ini key) into a log target.
+
+    The raw value resolves to one of three results:
+
+    * ``True`` (use the temp-file default): the option is unset or empty, or is
+      the string ``"true"`` / ``"1"`` in any letter case.
+    * ``None`` (logging disabled): Python ``False``, typically set in a conftest
+      via ``config.option.sift_log_file = False``, or the string ``"false"`` /
+      ``"none"`` in any letter case.
+    * a ``Path``: any other value, taken as the log file's location.
+
+    Raises:
+        pytest.UsageError: the option disables the log file (``False`` or a
+            ``"false"`` / ``"none"`` string) while ``--sift-offline`` is active;
+            offline mode needs the log file as its sole sink.
+    """
+    raw = LOG_FILE_OPTION.resolve(pytestconfig)
+    off_words = ("false", "none")
+    explicitly_off = raw is False or (isinstance(raw, str) and raw.lower() in off_words)
+    if explicitly_off and is_offline(pytestconfig):
+        raise pytest.UsageError(
+            "--sift-log-file=none is incompatible with --sift-offline; offline "
+            "mode requires a log file. Pin one with --sift-log-file=<path>, or "
+            "drop --sift-log-file=none to use a temp file."
+        )
+    if raw is False:
+        return None
+    # Any other falsy value (``None``, ``""``) means "unset": use the temp file.
+    if not raw:
+        return True
+    text = str(raw).lower()
+    if text in off_words:
+        return None
+    return True if text in ("true", "1") else Path(raw)
+
+
+def report_context_impl(
+    sift_client: SiftClient,
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config | None = None,
+) -> Generator[ReportContext, None, None]:
+    """Build the session ``ReportContext`` from the resolved options and yield it."""
+    args = request.config.invocation_params.args
+    # ``target`` is "what ran", derived from the collected items (see derive_target)
+    # rather than parsed from the command line. The name and test_case both default
+    # to it; the verbatim command stays available via {command} and pytest_command.
+    target = derive_target(request, args)
+    command = "pytest " + " ".join(args) if args else "pytest"
+    fields = build_template_fields(target, command, args, request)
+    name_template = REPORT_NAME_OPTION.resolve(pytestconfig) or "{target} {timestamp}"
+    name = format_template(
+        name_template,
+        fields,
+        fallback=f"{target} {fields['timestamp']}",
+        option_label="sift_report_name",
+    )
+    test_case_template = TEST_CASE_OPTION.resolve(pytestconfig)
+    test_case = (
+        format_template(
+            test_case_template,
+            fields,
+            fallback=target,
+            option_label="sift_test_case",
+        )
+        if test_case_template
+        else target
+    )
+    # Metadata starts from the [tool.sift.pytest.report.metadata] TOML table, and
+    # the auto-recorded pytest_command layers in last so the user can't
+    # accidentally overwrite it.
+    report_metadata: dict[str, str | float | bool] = {
+        **METADATA_OPTION.resolve_merged(pytestconfig),
+        "pytest_command": command,
+    }
+    # Mode → ReportContext flags:
+    #   online (default): log_file=<temp or user path>, replay_log_file=True
+    #   --sift-offline:   log_file=<temp or user path>, replay_log_file=False
+    #   --sift-disabled:  log_file=False,               replay_log_file=False
+    disabled = sift_client._simulate
+    offline = False if disabled else is_offline(pytestconfig)
+    log_file: str | Path | bool | None = False if disabled else resolve_log_file(pytestconfig)
+    include_git_metadata = bool(GIT_METADATA_OPTION.resolve(pytestconfig))
+    with ReportContext(
+        sift_client,
+        name=name,
+        test_case=test_case,
+        test_system_name=TEST_SYSTEM_NAME_OPTION.resolve(pytestconfig) or None,
+        system_operator=SYSTEM_OPERATOR_OPTION.resolve(pytestconfig) or None,
+        serial_number=SERIAL_NUMBER_OPTION.resolve(pytestconfig) or None,
+        part_number=PART_NUMBER_OPTION.resolve(pytestconfig) or None,
+        log_file=log_file,
+        include_git_metadata=include_git_metadata,
+        replay_log_file=not (disabled or offline),
+        metadata=report_metadata,
+    ) as context:
+        try:
+            yield context
+        finally:
+            # Drain the hierarchy + parametrize stacks INSIDE the
+            # ReportContext's ``with`` block, so the final ``__exit__``
+            # update calls for those parent steps are written to the log
+            # file BEFORE the import worker drains. Without this, the
+            # worker exits with a partial backlog and the parent steps
+            # are stuck IN_PROGRESS in the Sift report.
+            try:
+                drain_parametrize_stack()
+            finally:
+                drain_hierarchy_stack()
+
+
+# Stand-in credentials for --sift-offline mode, applied when the env/ini values
+# are missing. Offline mode never makes network calls, so these only need to
+# satisfy SiftConnectionConfig syntactically (``.invalid`` hosts never resolve).
+OFFLINE_DEFAULTS = {
+    "SIFT_API_KEY": "offline",
+    "SIFT_GRPC_URI": "offline.invalid:0",
+    "SIFT_REST_URI": "http://offline.invalid",
+}
+
+
+def build_disabled_client() -> SiftClient:
+    """Create the stand-in ``SiftClient`` used by ``--sift-disabled`` mode.
+
+    The client is flagged ``_simulate=True`` so test-results writes short-circuit
+    through the existing low-level simulate path without contacting Sift. Its
+    URLs are syntactically valid but unreachable; nothing dials them.
+    """
+    placeholder_config = SiftConnectionConfig(
+        api_key="disabled",
+        grpc_url="disabled.invalid:0",
+        rest_url="http://disabled.invalid",
+    )
+    disabled_client = SiftClient(connection_config=placeholder_config)
+    # Private flag; ``report_context_impl`` reads it to detect disabled mode.
+    disabled_client._simulate = True
+    return disabled_client
+
+
+def step_impl(
+    report_context: ReportContext, request: pytest.FixtureRequest
+) -> Generator[NewStep, None, None]:
+    """Open the Sift step for the requesting node, yield it, then settle its status."""
+    node = request.node
+    # With parametrize-nesting on, the display name is the leaf frame of the
+    # parametrize path stashed in ``pytest_collection_modifyitems`` (parents are
+    # opened by ``_parametrize_parents``). Nodes without a path, and every node
+    # when nesting is off, use the pytest name (e.g. ``test_a[1]``) to stay unique.
+    if PARAMETRIZE_NESTING_OPTION.resolve(request.config):
+        path = node.stash.get(parametrize_path_key, ())
+        name = path[-1] if path else str(node.name)
+    else:
+        name = str(node.name)
+    # ``node.obj`` may be missing or ``None`` (e.g. ``pytest.DoctestItem``) or may
+    # raise on access; such nodes get no description instead of an error.
+    # ``__doc__`` is read only from a real object, because for ``None`` the
+    # lookup returns NoneType's own docstring where the interpreter defines one.
+    # The broad except covers failures other than ``AttributeError``.
+    try:
+        obj = getattr(node, "obj", None)
+        description = None if obj is None else (getattr(obj, "__doc__", None) or None)
+    except Exception:
+        description = None
+    with report_context.new_step(
+        name=name, description=description, assertion_as_fail_not_error=False
+    ) as new_step:
+        node._sift_step = new_step
+        yield new_step
+        resolve_initial_status(new_step, node)
diff --git a/python/lib/sift_client/_internal/pytest_plugin/steps.py b/python/lib/sift_client/_internal/pytest_plugin/steps.py
new file mode 100644
index 000000000..9904ceecb
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/steps.py
@@ -0,0 +1,310 @@
+"""Parent-step stacks: the parametrize and hierarchy frames shared across items.
+
+Holds the collection-phase stash keys and the two module-level frame stacks
+(``parametrize_stack`` / ``hierarchy_stack``), the helpers that build a chain
+for an item and drain the stacks, and the per-item reconcilers the autouse
+fixtures delegate to. Frames are shared across sibling test items and drained
+innermost-first at session end.
+"""
+
+from __future__ import annotations
+
+import warnings
+from typing import Any, Tuple
+
+import pytest
+
+from sift_client._internal.pytest_plugin.options import (
+    CLASS_STEP_OPTION,
+    MODULE_STEP_OPTION,
+    PACKAGE_STEP_OPTION,
+    PARAMETRIZE_NESTING_OPTION,
+)
+
+STASH_MISSING = object()  # sentinel default for ``stash.get``: no value was stashed
+
+parametrize_path_key = pytest.StashKey[Tuple[str, ...]]()
+# Each frame: (path_key, open step). Frames are shared across sibling test items;
+# drained whenever the hierarchy chain changes and again at session end.
+parametrize_stack: list[tuple[str, Any]] = []
+
+hierarchy_key = pytest.StashKey[Tuple[Tuple[str, str, "str | None", bool], ...]]()
+# Outer-to-inner frames for the item's collection-tree ancestors. Each chain
+# entry is ``(identity, name, doc, rendered)``:
+#   - ``identity``: a globally-unique key (``node.nodeid``) used for diff
+#     comparison. Two ancestors at the same depth with the same display name
+#     but reached via different paths (e.g., ``proj_a/utils`` and
+#     ``proj_b/utils`` in a monorepo) get distinct identities, so they never
+#     silently merge in the diff.
+#   - ``name``: the human-readable step name used when ``rendered`` opens the
+#     Sift step.
+#   - ``doc``: docstring used for the step description if rendered.
+#   - ``rendered``: True iff the corresponding ``sift_*_step`` ini flag is on.
+#     Non-rendered frames participate in the diff but do not call
+#     ``rc.new_step(...)``; they appear with ``ns=None`` in the stack.
+#
+# Stack entries: ``(identity, name, open_step_or_None)``. Frames are shared
+# across sibling test items, popped as the chain changes, and drained at session
+# end AFTER parametrize_stack, since parametrize parents nest inside these.
+hierarchy_stack: list[tuple[str, str, Any]] = []
+
+
+def drain_step_stack(stack: list, *, swallow_errors: bool = True) -> None:
+    """Pop every frame off ``stack`` and close the step it holds, if any.
+
+    ``swallow_errors=True`` (the default, used at teardown / session end) turns
+    each failing ``__exit__`` into a ``SiftPytestStepDrainWarning`` so one bad
+    frame can neither stop the rest from closing nor escape pytest's finalizer
+    chain.
+
+    ``swallow_errors=False`` (mid-session, when a hierarchy change forces the
+    parametrize parents shut) still empties the stack, then raises
+    ``SiftPytestStepDrainError`` chained from the first failure.
+    """
+    from sift_client.pytest_plugin import SiftPytestStepDrainError, SiftPytestStepDrainWarning
+
+    failures: list[tuple[str, BaseException]] = []
+    while stack:
+        frame = stack.pop()
+        # Parametrize frames are ``(name, ns)`` and hierarchy frames are
+        # ``(identity, name, ns)``; indexing from the end serves both.
+        name, ns = frame[-2], frame[-1]
+        if ns is None:
+            # Diff-only frame that never opened a step (e.g. a Package frame
+            # when ``sift_package_step=false``): nothing to close.
+            continue
+        try:
+            ns.__exit__(None, None, None)
+        except Exception as exc:
+            if not swallow_errors:
+                failures.append((name, exc))
+                continue
+            warnings.warn(
+                f"Sift plugin: closing step {name!r} during drain raised "
+                f"{type(exc).__name__}: {exc}",
+                SiftPytestStepDrainWarning,
+                stacklevel=2,
+            )
+    if not failures:
+        return
+    culprit, cause = failures[0]
+    raise SiftPytestStepDrainError(
+        f"Sift plugin: {len(failures)} step(s) raised while draining mid-session; "
+        f"first failure on {culprit!r}: {type(cause).__name__}: {cause}"
+    ) from cause
+
+
+def drain_parametrize_stack(*, swallow_errors: bool = True) -> None:
+    drain_step_stack(parametrize_stack, swallow_errors=swallow_errors)  # close all open frames
+
+
+def drain_hierarchy_stack(*, swallow_errors: bool = True) -> None:
+    drain_step_stack(hierarchy_stack, swallow_errors=swallow_errors)  # close all open frames
+
+
+def close_frame(name: str, ns: Any) -> None:
+    """Exit one frame's step, downgrading any ``Exception`` it raises to a warning.
+
+    Serves the mid-session hierarchy-stack pop and the rollback paths, where a
+    bad ``__exit__`` must neither shadow the original exception nor leak sibling
+    frames. ``ns=None`` marks a non-rendered diff-only frame and is skipped.
+    """
+    from sift_client.pytest_plugin import SiftPytestStepDrainWarning
+
+    if ns is not None:
+        try:
+            ns.__exit__(None, None, None)
+        except Exception as exc:
+            detail = f"{type(exc).__name__}: {exc}"
+            warnings.warn(
+                f"Sift plugin: closing step {name!r} raised {detail}",
+                SiftPytestStepDrainWarning,
+                stacklevel=2,
+            )
+
+
+def build_parametrize_path(item: pytest.Item) -> tuple[str, ...]:
+    """Return the outer-to-inner step display names for a parametrized item.
+
+    The first frame is the undecorated test name; one ``name=value`` frame per
+    parametrize axis follows. ``callspec.params`` lists the BOTTOM decorator's
+    axis first, while the Sift step tree puts the TOP decorator outermost, so
+    the axes are walked in reverse. Non-parametrized items yield ``()``.
+    """
+    callspec = getattr(item, "callspec", None)
+    params = callspec.params if callspec is not None else None
+    if not params:
+        return ()
+    axes = [f"{axis}={chosen!r}" for axis, chosen in reversed(params.items())]
+    return (getattr(item, "originalname", item.name), *axes)
+
+
+def build_hierarchy_chain(
+    item: pytest.Item | pytest.Collector,
+    config: pytest.Config,
+) -> tuple[tuple[str, str, str | None, bool], ...]:
+    """Outer-to-inner ``(identity, name, docstring, rendered)`` for collection ancestors.
+
+    Walks ``item.parent`` upward and ALWAYS collects every ``pytest.Package``,
+    ``pytest.Module``, and ``pytest.Class`` ancestor; they all participate in
+    the diff that keeps the report tree coherent across tests, so two
+    same-named ancestors reached via different paths (e.g., ``proj_a/utils``
+    and ``proj_b/utils`` in a monorepo where the ``proj_*`` dirs are
+    ``pytest.Dir`` nodes the walker skips) cannot silently merge.
+
+    ``identity`` is ``node.nodeid``, globally unique per collected node; the
+    diff compares on it, not on the display ``name``. ``rendered`` is True iff
+    the layer's ini flag is on (``sift_package_step`` / ``sift_module_step`` /
+    ``sift_class_step``); non-rendered frames still participate in the diff
+    but don't open a Sift step.
+
+    ``node.obj`` is a pytest property that imports the underlying Python
+    object and can raise *any* exception (ImportError, custom metaclass
+    errors, ``__doc__`` descriptors that throw). The broad guard keeps a
+    misbehaving collector from aborting collection; that frame's docstring
+    just becomes ``None``.
+    """
+    include_package = bool(PACKAGE_STEP_OPTION.resolve(config))
+    include_module = bool(MODULE_STEP_OPTION.resolve(config))
+    include_class = bool(CLASS_STEP_OPTION.resolve(config))
+
+    chain: list[tuple[str, str, str | None, bool]] = []
+    # ``node.parent`` is typed as the internal ``_pytest.nodes.Node`` which
+    # isn't part of pytest's public API; widen to ``Any`` for the walk.
+    node: Any = item
+    while node is not None:
+        # Package is tested before Module: on pytest < 8 ``pytest.Package``
+        # subclasses ``pytest.Module`` and would otherwise take the module flag.
+        if isinstance(node, pytest.Class):
+            rendered = include_class
+        elif isinstance(node, pytest.Package):
+            rendered = include_package
+        elif isinstance(node, pytest.Module):
+            rendered = include_module
+        else:
+            node = node.parent
+            continue
+        try:
+            doc = (
+                (getattr(node, "obj", None) and getattr(node.obj, "__doc__", None)) or ""
+            ).strip() or None
+        except Exception:
+            doc = None
+        chain.append((node.nodeid, node.name, doc, rendered))
+        node = node.parent
+    return tuple(reversed(chain))
+
+
+def reconcile_hierarchy(request: pytest.FixtureRequest, config: pytest.Config) -> None:
+    """Open/close hierarchy parents so the open stack matches the item's chain.
+
+    Diffs the item's desired ``(package, module, class)`` chain against
+    ``hierarchy_stack`` by nodeid, pops the stale tail, and pushes the new
+    frames (rendered ones open a step; the rest are tracked with ``ns=None``).
+    Which types render is set at build time by ``sift_package_step`` /
+    ``sift_module_step`` / ``sift_class_step``. When the chain changes, the
+    parametrize stack is drained first since parametrize parents nest INSIDE these.
+    """
+    # Fall back to computing the chain on-demand for items that bypassed
+    # ``pytest_collection_modifyitems`` (e.g., dynamically inserted by another
+    # plugin's later hook). Defaulting to ``()`` would incorrectly drain the
+    # entire open hierarchy stack for those items.
+    desired = request.node.stash.get(hierarchy_key, STASH_MISSING)
+    if desired is STASH_MISSING:
+        desired = build_hierarchy_chain(request.node, config)
+    common = 0  # length of the prefix shared by the open stack and the desired chain
+    # Compare on identity (nodeid); same-named ancestors at different paths
+    # MUST stay distinct.
+    while (
+        common < len(hierarchy_stack)
+        and common < len(desired)
+        and hierarchy_stack[common][0] == desired[common][0]
+    ):
+        common += 1
+    # Any change to the hierarchy chain orphans parametrize parents from the
+    # previous test. Drain them before mutating the hierarchy stack so
+    # ReportContext's top-of-stack invariant holds. Strict mode: a per-frame
+    # ``__exit__`` failure here signals a real upstream drift between the
+    # plugin stacks and ReportContext; raise it as a test error instead of a
+    # silenceable warning.
+    if common < len(hierarchy_stack) or common < len(desired):
+        drain_parametrize_stack(swallow_errors=False)
+    # Symmetric per-frame guard for the hierarchy pop so one bad ``__exit__``
+    # doesn't leave hierarchy_stack partially drained for every subsequent test.
+    while len(hierarchy_stack) > common:
+        _identity, name, ns = hierarchy_stack.pop()
+        close_frame(name, ns)
+    if not desired[common:]:  # stack already matches; nothing new to push
+        return
+    # Fetch ``report_context`` lazily, but only when there's at least one
+    # rendered frame to push. Pure diff-only frames (e.g. a Package frame when
+    # ``sift_package_step=false``) just update hierarchy_stack with ns=None.
+    rc = None
+    # Roll back any partial push so a mid-loop exception doesn't leave half
+    # the chain orphaned on the stack. Per-frame guard inside the rollback so
+    # a failing ``__exit__`` doesn't shadow the original exception or leak
+    # the remaining opened frames.
+    opened: list[tuple[str, str, Any]] = []
+    try:
+        for identity, name, doc, rendered in desired[common:]:
+            if rendered:
+                if rc is None:
+                    rc = request.getfixturevalue("report_context")
+                ns = rc.new_step(name=name, description=doc, assertion_as_fail_not_error=False)
+                ns.__enter__()
+                opened.append((identity, name, ns))
+            else:
+                opened.append((identity, name, None))
+    except BaseException:
+        while opened:
+            _identity, name, ns = opened.pop()
+            close_frame(name, ns)
+        raise
+    hierarchy_stack.extend(opened)
+
+
+def reconcile_parametrize(request: pytest.FixtureRequest, config: pytest.Config) -> None:
+    """Open/close shared parametrize parents so the open stack matches the item.
+
+    Diffs the item's desired parametrize path against ``parametrize_stack``:
+    pops the stale tail, then opens new parents (everything except the innermost
+    frame, which the ``step`` fixture creates as the leaf). Parents persist
+    across sibling items so a tree like ``test_x[a=1]`` / ``test_x[a=2]`` shares
+    one ``test_x`` container. No-op when ``sift_parametrize_nesting=false``.
+    """
+    if not PARAMETRIZE_NESTING_OPTION.resolve(config):
+        return
+    # Fall back to on-demand computation for dynamically-inserted items;
+    # see reconcile_hierarchy for the same rationale.
+    desired = request.node.stash.get(parametrize_path_key, STASH_MISSING)
+    if desired is STASH_MISSING:
+        desired = build_parametrize_path(request.node)
+    parents = desired[:-1]  # drop the innermost frame; it is the leaf, not a parent
+    common = 0  # number of leading stack frames that already match ``parents``
+    while (
+        common < len(parametrize_stack)
+        and common < len(parents)
+        and parametrize_stack[common][0] == parents[common]
+    ):
+        common += 1
+    # Per-frame guard so one bad ``__exit__`` doesn't leave parametrize_stack
+    # partially drained for every subsequent test.
+    while len(parametrize_stack) > common:
+        name, ns = parametrize_stack.pop()
+        close_frame(name, ns)
+    if not parents[common:]:  # nothing new to open, so skip the fixture lookup
+        return
+    rc = request.getfixturevalue("report_context")
+    opened: list[tuple[str, Any]] = []  # entered in this call; unwound if a later push fails
+    try:
+        for display in parents[common:]:
+            ns = rc.new_step(name=display, assertion_as_fail_not_error=False)
+            ns.__enter__()
+            opened.append((display, ns))
+    except BaseException:
+        while opened:
+            name, ns = opened.pop()
+            close_frame(name, ns)
+        raise
+    parametrize_stack.extend(opened)
diff --git a/python/lib/sift_client/_internal/pytest_plugin/terminal.py b/python/lib/sift_client/_internal/pytest_plugin/terminal.py
new file mode 100644
index 000000000..4f1eee0dd
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/terminal.py
@@ -0,0 +1,231 @@
+"""Terminal-summary formatting for the session-end Sift report panel.
+
+Row writers and colored count/measurement segments used by
+``pytest_terminal_summary``, plus the best-effort browser opener for
+``--sift-open-report``. Color is dropped automatically when the terminal has no
+markup (not a TTY or ``--color=no``), so captured/CI output stays plain text.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from sift_client._internal.pytest_plugin.modes import mode_label, sdk_version
+from sift_client.sift_types.test_report import TestStatus
+from sift_client.util.test_results.context_manager import _quiet_fork_stderr
+
+LABEL_WIDTH = 13
+
+
+def sift_kv(terminalreporter: Any, label: str, value: str, **value_markup: bool) -> None:
+    """Emit one indented panel row: a bold, padded ``label`` followed by ``value``.
+
+    Keyword flags in ``value_markup`` (e.g. ``green=True``, ``cyan=True``) style
+    only the value. Color is dropped automatically when the terminal has no
+    markup (not a TTY or ``--color=no``), so captured/CI output stays plain text.
+    """
+    terminalreporter.write("  ")
+    padded_label = f"{label:<{LABEL_WIDTH}}"
+    terminalreporter.write(padded_label, bold=True)
+    terminalreporter.write_line(value, **value_markup)
+
+
+# Step-count breakdown order and labels for the footer's "Steps" row.
+STEP_COUNT_ORDER: tuple[tuple[TestStatus, str], ...] = (
+    (TestStatus.PASSED, "passed"),
+    (TestStatus.FAILED, "failed"),
+    (TestStatus.ERROR, "error"),
+    (TestStatus.ABORTED, "aborted"),
+    (TestStatus.SKIPPED, "skipped"),
+    (TestStatus.IN_PROGRESS, "in progress"),
+)
+
+
+# Per-status color for the footer's step breakdown: green pass, red
+# failure/error/abort, yellow skip; in-progress (and anything else) stays plain.
+STEP_STATUS_MARKUP: dict[TestStatus, dict[str, bool]] = {
+    TestStatus.PASSED: {"green": True},
+    TestStatus.FAILED: {"red": True},
+    TestStatus.ERROR: {"red": True},
+    TestStatus.ABORTED: {"red": True},
+    TestStatus.SKIPPED: {"yellow": True},
+}
+
+
+def step_count_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
+    """Build ``(text, markup)`` segments, in display order, for each non-zero status count."""
+    present = [(status, label) for status, label in STEP_COUNT_ORDER if counts.get(status, 0)]
+    return [
+        (f"{counts.get(status, 0)} {label}", STEP_STATUS_MARKUP.get(status, {}))
+        for status, label in present
+    ]
+
+
+def measurement_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
+    """Build ``(text, markup)`` segments for passed/failed measurement counts, non-zero only."""
+    outcomes = ((True, "passed", "green"), (False, "failed", "red"))
+    return [
+        (f"{counts[outcome]} {word}", {color: True})
+        for outcome, word, color in outcomes
+        if counts.get(outcome, 0)
+    ]
+
+
+def write_count_row(
+    terminalreporter: Any, label: str, segments: list[tuple[str, dict[str, bool]]]
+) -> None:
+    """Emit a bold padded ``label`` and then the segments, `` · ``-separated, each in its color."""
+    terminalreporter.write("  ")
+    terminalreporter.write(f"{label:<{LABEL_WIDTH}}", bold=True)
+    for position, (text, markup) in enumerate(segments):
+        if position > 0:
+            terminalreporter.write(" · ")
+        terminalreporter.write(text, **markup)
+    terminalreporter.write_line("")
+
+
+def report_panel_title(report: Any, terminalreporter: Any) -> str:
+    """Build the section-rule title ``Sift report · <name>``, clipped to fit the terminal.
+
+    Falls back to plain ``Sift report`` when the report has no name. The name
+    embeds a timestamp (and, for invocation-based runs, the pytest args), so an
+    over-long title is cut and ended with an ellipsis to keep the rule on one line.
+    """
+    prefix = "Sift report"
+    report_name = getattr(report, "name", None)
+    if not report_name:
+        return prefix
+    full_title = f"{prefix} · {report_name}"
+    writer = getattr(terminalreporter, "_tw", None)
+    # Reserve room for the separator characters and spaces write_sep adds.
+    budget = max(len(prefix), getattr(writer, "fullwidth", 80) - 8)
+    if len(full_title) <= budget:
+        return full_title
+    return full_title[: budget - 1] + "…"
+
+
+def maybe_open_report(url: str) -> None:
+    """Open ``url`` in a browser on a best-effort basis (for ``--sift-open-report``).
+
+    Does nothing when ``CI`` is set or stdout is not a TTY, so a committed
+    ``sift_open_report`` setting can't spawn a browser on a headless agent;
+    the flag is meant for local development.
+    """
+    import sys
+    import webbrowser
+
+    # Only a local, interactive session gets a browser window.
+    if not os.environ.get("CI") and sys.stdout.isatty():
+        try:
+            # webbrowser.open forks/execs the platform opener while the gRPC client's
+            # background threads are live; redirect fd 2 across the fork to swallow
+            # gRPC's prefork notice (same treatment as the plugin's other fork sites).
+            with _quiet_fork_stderr():
+                webbrowser.open(url)
+        except Exception:
+            # Headless / no browser available: opening is a convenience, never fatal.
+            pass
+
+
+def write_disabled_summary(terminalreporter: Any) -> None:
+    """Emit the ``Sift`` rule and one-line notice used when ``--sift-disabled`` is active."""
+    terminalreporter.write_sep("=", "Sift", bold=True, cyan=True)
+    terminalreporter.write_line("Sift disabled — no test report created.")
+
+
+def write_report_summary(
+    terminalreporter: Any,
+    context: Any,
+    config: Any,
+    report_id: str | None,
+    report_url: str | None,
+    offline: bool,
+) -> None:
+    """Print the session-end report panel: outcome, tallies, provenance, action.
+
+    ``report_id`` / ``report_url`` come from ``resolve_report_link``. The action
+    row is a clickable link (online), the upload command (offline), or a replay
+    hint when the report never uploaded (only if a log file exists to replay).
+    """
+    log_file = getattr(context, "log_file", None)
+
+    failed = bool(getattr(context, "any_failures", False))
+    status_word, status_markup = (
+        ("FAILED", {"red": True, "bold": True})
+        if failed
+        else ("PASSED", {"green": True, "bold": True})
+    )
+    # Offline results live only in the local log until replayed, so the status
+    # row calls that out instead of repeating the version (already in the header).
+    status_context = (
+        f"{mode_label(config)} · not uploaded"
+        if offline
+        else f"{mode_label(config)} · sift-stack-py {sdk_version()}"
+    )
+
+    report = context.report
+
+    terminalreporter.write_sep(
+        "=", report_panel_title(report, terminalreporter), cyan=True, bold=True
+    )
+
+    # Identity row: the test case (test path or pytest invocation).
+    if report.test_case:
+        sift_kv(terminalreporter, "Test case", str(report.test_case))
+
+    # Status row: colored outcome, then compact mode context.
+    terminalreporter.write("  ")
+    terminalreporter.write(f"{'Status':<{LABEL_WIDTH}}", bold=True)
+    terminalreporter.write(status_word, **status_markup)
+    terminalreporter.write_line(f"      {status_context}")
+
+    # Step + measurement tallies (green pass, red failure, yellow skip).
+    write_count_row(
+        terminalreporter,
+        "Steps",
+        step_count_segments(context.step_status_counts) or [("no steps", {})],
+    )
+    measurements = measurement_segments(context.measurement_counts)
+    if measurements:
+        write_count_row(terminalreporter, "Measurements", measurements)
+
+    # Provenance row: test system and operator.
+    system = " · ".join(part for part in (report.test_system_name, report.system_operator) if part)
+    if system:
+        sift_kv(terminalreporter, "System", system)
+
+    # Local log file (write-through backup online, sole sink offline).
+    if log_file is not None:
+        sift_kv(terminalreporter, "Log file", str(log_file))
+
+    if offline:
+        if log_file is not None:
+            terminalreporter.write_sep("-", "to upload to Sift")
+            terminalreporter.write_line(f"  >> import-test-result-log {log_file}", cyan=True)
+        return
+
+    if not report_id:
+        # Incremental upload never mapped the report (the worker died before
+        # replaying the create), so there's no real report to link. Offer the
+        # replay command only when a log file exists; with logging off it
+        # would otherwise read "import-test-result-log None".
+        hint = "not uploaded"
+        if log_file is not None:
+            hint += f" — replay with: import-test-result-log {log_file}"
+        sift_kv(terminalreporter, "Report", hint, yellow=True)
+    elif report_url is not None:
+        sift_kv(terminalreporter, "Report", report_url, cyan=True)
+    else:
+        sift_kv(
+            terminalreporter,
+            "Report",
+            f"id {report_id}  (set sift_app_url for a clickable link)",
+        )
+
+    if report_id and getattr(context, "replay_incomplete", False) and log_file is not None:
+        sift_kv(
+            terminalreporter,
+            "",
+            f"may be incomplete — finish with: import-test-result-log {log_file}",
+            yellow=True,
+        )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
index 4efb9f554..a61035b90 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
@@ -27,8 +27,8 @@ def test_ini_log_file_none(
     ) -> None:
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _resolve_log_file
-            print("RESOLVED:", _resolve_log_file(config))
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
             """,
         )
         pytester.makepyprojecttoml(
@@ -56,8 +56,8 @@ def test_python_false_disables_log_file(
         write_probe_conftest(
             """
             config.option.sift_log_file = False
-            from sift_client.pytest_plugin import _resolve_log_file
-            print("RESOLVED:", _resolve_log_file(config))
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
             """,
         )
         pytester.makepyfile("def test_noop(): pass")
@@ -73,8 +73,8 @@ def test_ini_log_file_path(
         log_path = tmp_path / "sift-run.jsonl"
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _resolve_log_file
-            print("RESOLVED:", _resolve_log_file(config))
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
             """,
         )
         pytester.makepyprojecttoml(
@@ -94,8 +94,8 @@ def test_ini_offline_true(
     ) -> None:
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _is_offline
-            print("OFFLINE:", _is_offline(config))
+            from sift_client._internal.pytest_plugin.modes import is_offline
+            print("OFFLINE:", is_offline(config))
             """,
         )
         pytester.makepyprojecttoml(
@@ -115,8 +115,8 @@ def test_ini_disabled_true(
     ) -> None:
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _is_disabled
-            print("DISABLED:", _is_disabled(config))
+            from sift_client._internal.pytest_plugin.modes import is_disabled
+            print("DISABLED:", is_disabled(config))
             """,
         )
         pytester.makepyprojecttoml(
@@ -159,8 +159,8 @@ def test_cli_overrides_ini(
         cli_path = tmp_path / "cli-wins.jsonl"
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _resolve_log_file
-            print("RESOLVED:", _resolve_log_file(config))
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
             """,
         )
         pytester.makepyprojecttoml(
@@ -181,8 +181,8 @@ def test_cli_offline_flag(
         """The ``--sift-offline`` CLI flag flips the resolver to True."""
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _is_offline
-            print("OFFLINE:", _is_offline(config))
+            from sift_client._internal.pytest_plugin.modes import is_offline
+            print("OFFLINE:", is_offline(config))
             """,
         )
         pytester.makepyfile("def test_noop(): pass")
@@ -197,8 +197,8 @@ def test_cli_disabled_flag(
         """The ``--sift-disabled`` CLI flag flips the resolver to True."""
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import _is_disabled
-            print("DISABLED:", _is_disabled(config))
+            from sift_client._internal.pytest_plugin.modes import is_disabled
+            print("DISABLED:", is_disabled(config))
             """,
         )
         pytester.makepyfile("def test_noop(): pass")
@@ -232,14 +232,11 @@ def test_defaults_when_neither_set(
     ) -> None:
         write_probe_conftest(
             """
-            from sift_client.pytest_plugin import (
-                _is_disabled,
-                _is_offline,
-                _resolve_log_file,
-            )
-            print("RESOLVED:", _resolve_log_file(config))
-            print("OFFLINE:", _is_offline(config))
-            print("DISABLED:", _is_disabled(config))
+            from sift_client._internal.pytest_plugin.modes import is_disabled, is_offline
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
+            print("OFFLINE:", is_offline(config))
+            print("DISABLED:", is_disabled(config))
             print("INI_GIT:", config.getini("sift_git_metadata"))
             """,
         )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
index 9e0dd52e0..39ee0ccf6 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -311,10 +311,8 @@ def test_y(self, w):
 
 def test_drain_step_stack_continues_past_failing_exit() -> None:
     """Lenient mode: a misbehaving ``__exit__`` must not block the rest of the stack."""
-    from sift_client.pytest_plugin import (
-        SiftPytestStepDrainWarning,
-        _drain_step_stack,
-    )
+    from sift_client._internal.pytest_plugin.steps import drain_step_stack
+    from sift_client.pytest_plugin import SiftPytestStepDrainWarning
 
     class _Good:
         def __init__(self) -> None:
@@ -330,7 +328,7 @@ def __exit__(self, *_: object) -> None:
     g1, g2, bad = _Good(), _Good(), _Bad()
     stack: list[tuple[str, object]] = [("g1", g1), ("bad", bad), ("g2", g2)]
     with pytest.warns(SiftPytestStepDrainWarning, match="boom"):
-        _drain_step_stack(stack)
+        drain_step_stack(stack)
     assert stack == []
     assert g1.closed
     assert g2.closed
@@ -338,10 +336,8 @@ def __exit__(self, *_: object) -> None:
 
 def test_drain_step_stack_strict_drains_fully_then_raises() -> None:
     """Strict mode: drain every frame, then raise with the FIRST failure chained."""
-    from sift_client.pytest_plugin import (
-        SiftPytestStepDrainError,
-        _drain_step_stack,
-    )
+    from sift_client._internal.pytest_plugin.steps import drain_step_stack
+    from sift_client.pytest_plugin import SiftPytestStepDrainError
 
     class _Good:
         def __init__(self) -> None:
@@ -362,7 +358,7 @@ def __exit__(self, *_: object) -> None:
     # one collected and surfaces in __cause__.
     stack: list[tuple[str, object]] = [("g", g), ("b1", b1), ("b2", b2)]
     with pytest.raises(SiftPytestStepDrainError, match="2 step.*'b2'") as exc_info:
-        _drain_step_stack(stack, swallow_errors=False)
+        drain_step_stack(stack, swallow_errors=False)
     # Stack fully drained even though it raised.
     assert stack == []
     assert g.closed
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py b/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
index ba6fbf5a5..0bb46c76f 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
@@ -1,4 +1,4 @@
-"""Guard rail that pins the docs settings table to the ``_OPTIONS`` registry.
+"""Guard rail that pins the docs settings table to the ``PLUGIN_OPTIONS`` registry.
 
 If you add or change a setting in ``lib/sift_client/pytest_plugin.py`` without
 regenerating the Markdown table in ``docs/guides/pytest_plugin/configuration.md``,
@@ -25,15 +25,15 @@ def test_settings_reference_docs_in_sync(pytestconfig: pytest.Config) -> None:
         import pytest
 
         pytest.skip(f"{_DOCS_PATH} not present in this checkout")
-    from sift_client.pytest_plugin import _render_settings_reference
+    from sift_client._internal.pytest_plugin.options import render_settings_reference
 
-    rendered = _render_settings_reference()
+    rendered = render_settings_reference()
     content = _DOCS_PATH.read_text()
     if rendered not in content:
         import pytest
 
         pytest.fail(
-            "Settings reference is out of sync with the _OPTIONS registry. Replace the "
+            "Settings reference is out of sync with the PLUGIN_OPTIONS registry. Replace the "
             "table under '## Settings reference' in "
             "docs/guides/pytest_plugin/configuration.md with:\n\n" + rendered
         )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py b/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
index 76550cc22..0845f143b 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
@@ -13,10 +13,10 @@
 from typing import TYPE_CHECKING, Callable
 
 from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
-from sift_client.pytest_plugin import (
-    _measurement_segments,
-    _resolve_real_report_id,
-    _step_count_segments,
+from sift_client._internal.pytest_plugin.report import resolve_real_report_id
+from sift_client._internal.pytest_plugin.terminal import (
+    measurement_segments,
+    step_count_segments,
 )
 from sift_client.sift_types.test_report import TestStatus
 
@@ -29,7 +29,7 @@
 class TestStepCountSegments:
     def test_lists_nonzero_statuses_in_order_with_color(self) -> None:
         counts = Counter({TestStatus.PASSED: 4, TestStatus.FAILED: 2, TestStatus.SKIPPED: 1})
-        assert _step_count_segments(counts) == [
+        assert step_count_segments(counts) == [
             ("4 passed", {"green": True}),
             ("2 failed", {"red": True}),
             ("1 skipped", {"yellow": True}),
@@ -37,28 +37,28 @@ def test_lists_nonzero_statuses_in_order_with_color(self) -> None:
 
     def test_error_and_aborted_are_red(self) -> None:
         counts = Counter({TestStatus.ERROR: 1, TestStatus.ABORTED: 1})
-        assert _step_count_segments(counts) == [
+        assert step_count_segments(counts) == [
             ("1 error", {"red": True}),
             ("1 aborted", {"red": True}),
         ]
 
     def test_empty_is_empty(self) -> None:
-        assert _step_count_segments(Counter()) == []
+        assert step_count_segments(Counter()) == []
 
 
 class TestMeasurementSegments:
     def test_passed_green_failed_red(self) -> None:
-        assert _measurement_segments(Counter({True: 2, False: 1})) == [
+        assert measurement_segments(Counter({True: 2, False: 1})) == [
             ("2 passed", {"green": True}),
             ("1 failed", {"red": True}),
         ]
 
     def test_empty_is_empty(self) -> None:
-        assert _measurement_segments(Counter()) == []
+        assert measurement_segments(Counter()) == []
 
 
 class TestResolveRealReportId:
-    """``_resolve_real_report_id`` maps the footer to the real server report id."""
+    """``resolve_real_report_id`` maps the footer to the real server report id."""
 
     def test_synchronous_online_uses_report_id_directly(self) -> None:
         # No log file, non-simulated report (``--sift-log-file=false`` path).
@@ -66,7 +66,7 @@ def test_synchronous_online_uses_report_id_directly(self) -> None:
             report=SimpleNamespace(id_="real-123", is_simulated=False),
             log_file=None,
         )
-        assert _resolve_real_report_id(context) == "real-123"
+        assert resolve_real_report_id(context) == "real-123"
 
     def test_incremental_resolves_via_sidecar(self, tmp_path: Path) -> None:
         log_file = tmp_path / "run.jsonl"
@@ -76,7 +76,7 @@ def test_incremental_resolves_via_sidecar(self, tmp_path: Path) -> None:
             report=SimpleNamespace(id_="sim-1", is_simulated=True),
             log_file=log_file,
         )
-        assert _resolve_real_report_id(context) == "real-1"
+        assert resolve_real_report_id(context) == "real-1"
 
     def test_empty_report_id_returns_none(self) -> None:
         # An unset/empty id must not produce a ``/test-results/`` link.
@@ -84,7 +84,7 @@ def test_empty_report_id_returns_none(self) -> None:
             report=SimpleNamespace(id_="", is_simulated=False),
             log_file=None,
         )
-        assert _resolve_real_report_id(context) is None
+        assert resolve_real_report_id(context) is None
 
     def test_incremental_unmapped_returns_none(self, tmp_path: Path) -> None:
         # Worker died before mapping the report: no sidecar entry.
@@ -94,7 +94,7 @@ def test_incremental_unmapped_returns_none(self, tmp_path: Path) -> None:
             report=SimpleNamespace(id_="sim-1", is_simulated=True),
             log_file=log_file,
         )
-        assert _resolve_real_report_id(context) is None
+        assert resolve_real_report_id(context) is None
 
 
 class TestHeader:
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py b/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
index ed7a92dc4..435170ed5 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
@@ -2,7 +2,7 @@
 
 The plugin scans ``SIFT_*`` env vars and ``[tool.sift.pytest.*]`` keys at
 session start and emits a ``SiftPytestPluginWarning`` for anything not
-declared in the central ``_OPTIONS`` registry. A typo (`SIFT_REPORT_SERIALNUM`
+declared in the central ``PLUGIN_OPTIONS`` registry. A typo (`SIFT_REPORT_SERIALNUM`
 instead of `SIFT_REPORT_SERIAL_NUMBER`) would otherwise silently no-op.
 """
 
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 4341bf122..43f689894 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -1,1574 +1,123 @@
-from __future__ import annotations
-
-import os
-import warnings
-from dataclasses import dataclass
-from datetime import datetime, timezone
-from pathlib import Path
-from types import SimpleNamespace
-from typing import TYPE_CHECKING, Any, Generator, Tuple
-
-import pytest
-
-from sift_client import SiftClient, SiftConnectionConfig
-from sift_client._internal.pyproject_config import load_tool_sift
-from sift_client.errors import SiftWarning
-from sift_client.sift_types.test_report import ErrorInfo, TestStatus
-from sift_client.util.test_results import ReportContext
-from sift_client.util.test_results.context_manager import (
-    _git_metadata,
-    _quiet_fork_stderr,
-    format_assertion_message,
-    format_truncated_traceback,
-)
-
-
-class SiftPytestPluginWarning(SiftWarning):
-    """Base warning for issues raised by the Sift pytest plugin."""
-
-
-class SiftPytestStepDrainWarning(SiftPytestPluginWarning):
-    """A step's ``__exit__`` raised while the plugin was draining its stack.
-
-    Surfaced at module-teardown or session-end so the drain can continue and
-    pytest test outcomes stay unaffected; the underlying exception is included
-    in the message for debugging.
-    """
-
-
-class SiftPytestStepDrainError(RuntimeError):
-    """Raised when mid-session drain fails — signals a likely upstream invariant break."""
-
-
-if TYPE_CHECKING:
-    from sift_client.util.test_results.context_manager import NewStep
-
-REPORT_CONTEXT: Any = None
-
-# Set at session end with the resolved (real) report id/URL when online and
-# uploaded. Read from a project's conftest in a later hook (e.g.
-# ``pytest_unconfigure``) to post the link, write a file, etc.
-SIFT_REPORT_ID_STASH_KEY = pytest.StashKey[str]()
-SIFT_REPORT_URL_STASH_KEY = pytest.StashKey[str]()
-
-_STASH_MISSING = object()
-
-_PARAMETRIZE_PATH_KEY = pytest.StashKey[Tuple[str, ...]]()
-# Each frame: (path_key, open step). Frames are shared across sibling test items
-# and drained at session end.
-_PARAMETRIZE_STACK: list[tuple[str, Any]] = []
-
-_HIERARCHY_KEY = pytest.StashKey[Tuple[Tuple[str, str, "str | None", bool], ...]]()
-# Outer-to-inner frames for the item's collection-tree ancestors. Each chain
-# entry is ``(identity, name, doc, rendered)``:
-#   - ``identity``: a globally-unique key (``node.nodeid``) used for diff
-#     comparison. Two ancestors at the same depth with the same display name
-#     but reached via different paths (e.g., ``proj_a/utils`` and
-#     ``proj_b/utils`` in a monorepo) get distinct identities, so they never
-#     silently merge in the diff.
-#   - ``name``: the human-readable step name used when ``rendered`` opens the
-#     Sift step.
-#   - ``doc``: docstring used for the step description if rendered.
-#   - ``rendered``: True iff the corresponding ``sift_*_step`` ini flag is on.
-#     Non-rendered frames participate in the diff but do not call
-#     ``rc.new_step(...)`` — they appear with ``ns=None`` in the stack.
-#
-# Stack entries: ``(identity, name, open_step_or_None)``. Frames are shared
-# across sibling test items and drained at session end. Drained AFTER
-# _PARAMETRIZE_STACK since parametrize parents nest inside hierarchy parents.
-_HIERARCHY_STACK: list[tuple[str, str, Any]] = []
-
-
-def _drain_step_stack(stack: list, *, swallow_errors: bool = True) -> None:
-    """Pop and close every frame.
-
-    With ``swallow_errors=True`` (default, used at teardown / session end),
-    per-frame failures are surfaced as ``SiftPytestStepDrainWarning`` so a
-    single misbehaving ``__exit__`` can't block the rest of the stack from
-    cleaning up or cascade out of pytest's finalizer chain.
-
-    With ``swallow_errors=False`` (mid-session, when a class transition forces
-    parametrize parents to close), the stack is still fully drained but the
-    first per-frame exception is re-raised at the end as a
-    ``SiftPytestStepDrainError`` so a real upstream invariant violation
-    surfaces as a test error instead of a silenceable warning.
-    """
-    errors: list[tuple[str, BaseException]] = []
-    while stack:
-        entry = stack.pop()
-        # Tolerate either ``(name, ns)`` (parametrize stack) or
-        # ``(identity, name, ns)`` (hierarchy stack) entries.
-        name, ns = entry[-2], entry[-1]
-        if ns is None:
-            # Non-rendered diff-only frame (e.g. a Package frame when
-            # ``sift_package_step=false``); nothing to close.
-            continue
-        try:
-            ns.__exit__(None, None, None)
-        except Exception as exc:
-            if swallow_errors:
-                warnings.warn(
-                    f"Sift plugin: closing step {name!r} during drain raised "
-                    f"{type(exc).__name__}: {exc}",
-                    SiftPytestStepDrainWarning,
-                    stacklevel=2,
-                )
-            else:
-                errors.append((name, exc))
-    if errors:
-        first_name, first_exc = errors[0]
-        raise SiftPytestStepDrainError(
-            f"Sift plugin: {len(errors)} step(s) raised while draining mid-session; "
-            f"first failure on {first_name!r}: {type(first_exc).__name__}: {first_exc}"
-        ) from first_exc
-
-
-def _drain_parametrize_stack(*, swallow_errors: bool = True) -> None:
-    _drain_step_stack(_PARAMETRIZE_STACK, swallow_errors=swallow_errors)
-
-
-def _drain_hierarchy_stack(*, swallow_errors: bool = True) -> None:
-    _drain_step_stack(_HIERARCHY_STACK, swallow_errors=swallow_errors)
-
-
-def _close_frame(name: str, ns: Any) -> None:
-    """Close a single frame, warning on per-frame failure.
-
-    Used by the mid-session hierarchy-stack pop and the rollback paths so a
-    misbehaving ``__exit__`` neither shadows the original exception nor leaks
-    sibling frames. ``ns=None`` indicates a non-rendered diff-only frame; skip.
-    """
-    if ns is None:
-        return
-    try:
-        ns.__exit__(None, None, None)
-    except Exception as exc:
-        warnings.warn(
-            f"Sift plugin: closing step {name!r} raised {type(exc).__name__}: {exc}",
-            SiftPytestStepDrainWarning,
-            stacklevel=2,
-        )
-
-
-def _build_parametrize_path(item: pytest.Item) -> tuple[str, ...]:
-    """Outer-to-inner step display names for a parametrized item.
-
-    Pytest stores ``callspec.params`` with the BOTTOM decorator's axis first;
-    the Sift step tree treats the TOP decorator as outermost, so we reverse.
-    """
-    callspec = getattr(item, "callspec", None)
-    if callspec is None or not callspec.params:
-        return ()
-    originalname = getattr(item, "originalname", item.name)
-    frames: list[str] = [originalname]
-    for name, value in reversed(callspec.params.items()):
-        frames.append(f"{name}={value!r}")
-    return tuple(frames)
-
-
-def _build_hierarchy_chain(
-    item: pytest.Item | pytest.Collector,
-    config: pytest.Config,
-) -> tuple[tuple[str, str, str | None, bool], ...]:
-    """Outer-to-inner ``(identity, name, docstring, rendered)`` for collection ancestors.
-
-    Walks ``item.parent`` upward and ALWAYS collects every ``pytest.Package``,
-    ``pytest.Module``, and ``pytest.Class`` ancestor — they all participate in
-    the diff that keeps the report tree coherent across tests, so two
-    same-named ancestors reached via different paths (e.g., ``proj_a/utils``
-    and ``proj_b/utils`` in a monorepo where the ``proj_*`` dirs are
-    ``pytest.Dir`` nodes the walker skips) cannot silently merge.
-
-    The ``identity`` field is ``node.nodeid`` — globally unique per collected
-    node. The diff compares on identity, not the display ``name``.
-
-    The ``rendered`` flag is True iff the layer's ini flag is on
-    (``sift_package_step`` / ``sift_module_step`` / ``sift_class_step``).
-    Non-rendered frames participate in the diff for identity but don't open a
-    Sift step.
-
-    The ``node.obj`` access is a pytest property that imports the underlying
-    Python object and can raise *any* exception (ImportError, custom
-    metaclass errors, descriptor ``__doc__`` properties that throw). Guard
-    broadly so a misbehaving collector doesn't abort the whole collection
-    phase — that frame's docstring just becomes ``None``.
-    """
-    include_package = bool(_PACKAGE_STEP.resolve(config))
-    include_module = bool(_MODULE_STEP.resolve(config))
-    include_class = bool(_CLASS_STEP.resolve(config))
-
-    chain: list[tuple[str, str, str | None, bool]] = []
-    # ``node.parent`` is typed as the internal ``_pytest.nodes.Node`` which
-    # isn't part of pytest's public API; widen to ``Any`` for the walk.
-    node: Any = item
-    while node is not None:
-        if isinstance(node, pytest.Class):
-            rendered = include_class
-        elif isinstance(node, pytest.Module):
-            rendered = include_module
-        elif isinstance(node, pytest.Package):
-            rendered = include_package
-        else:
-            node = node.parent
-            continue
-        try:
-            doc = (
-                (getattr(node, "obj", None) and getattr(node.obj, "__doc__", None)) or ""
-            ).strip() or None
-        except Exception:
-            doc = None
-        chain.append((node.nodeid, node.name, doc, rendered))
-        node = node.parent
-    return tuple(reversed(chain))
-
-
-# Settings-reference categories. Each maps to a docs subsection and, in the
-# renderer, to the column subset that category actually uses.
-_CAT_BEHAVIOR = "Pytest behavior"
-_CAT_CONNECTION = "Connection"
-_CAT_REPORT = "Report content"
-_CATEGORIES = (_CAT_BEHAVIOR, _CAT_CONNECTION, _CAT_REPORT)
-
-_TOOL_SIFT_KEY = pytest.StashKey[dict]()
-
-
-def _tool_sift(config: pytest.Config | None) -> dict[str, Any]:
-    """Session-cached ``[tool.sift]`` table.
-
-    Every option that reads TOML, plus the typo detector, would otherwise
-    re-parse pyproject.toml on the session-start path — and re-emit the
-    malformed-file warning each time. Parse once per session via the config
-    stash; ``load_tool_sift`` stays the uncached parser for direct callers.
-    """
-    if config is None:
-        return {}
-    cached = config.stash.get(_TOOL_SIFT_KEY, None)
-    if cached is None:
-        cached = load_tool_sift(config)
-        config.stash[_TOOL_SIFT_KEY] = cached
-    return cached
-
-
-@dataclass(frozen=True)
-class _Option:
-    """One setting and the logic to resolve it from wherever it can be set.
-
-    A setting may be read from an env var, a CLI flag, a pytest ini key, or a
-    ``[tool.sift...]`` TOML path. :meth:`resolve` walks the declared surfaces in
-    env > cli > ini > toml order. ``metadata`` is the one exception: a free-form
-    TOML table (``merge=True``) resolved by :meth:`resolve_merged`.
-
-    One registry of these drives ``pytest_addoption``, the resolvers, the docs
-    settings-reference table, and the typo detector, so a setting is added or
-    changed in one place.
-
-    Surface fields (declare only the ones a setting uses):
-
-    - ``cli`` / ``cli_action``: CLI flag (e.g. ``"--sift-offline"``) and
-      argparse action; ``cli_dest`` is derived from the flag.
-    - ``ini`` / ``ini_type`` / ``ini_default``: pytest ini key under
-      ``[tool.pytest.ini_options]`` and its pytest type + default.
-    - ``toml``: tuple path under ``[tool.sift...]``, e.g.
-      ``("pytest", "report", "name")`` -> ``tool.sift.pytest.report.name``.
-    - ``env``: full env var name, e.g. ``"SIFT_API_KEY"``.
-
-    ``category`` groups the option in the docs settings reference (one of
-    ``_CATEGORIES``).
-    """
-
-    name: str
-    help: str
-    category: str
-    cli: str | None = None
-    cli_action: str | None = None
-    ini: str | None = None
-    ini_type: str | None = None
-    ini_default: Any = None
-    toml: tuple[str, ...] | None = None
-    env: str | None = None
-    merge: bool = False
-
-    @property
-    def cli_dest(self) -> str:
-        """Argparse ``dest`` for the option.
-
-        When the option has both a CLI flag and an ini key, the dest matches
-        the ini name so ``config.getoption(ini_name)`` returns the CLI value
-        (and falls through to ``config.getini(ini_name)`` when the flag wasn't
-        passed). Without an ini key, the dest derives from the flag name.
-        """
-        if self.ini:
-            return self.ini
-        if self.cli is None:
-            return self.name
-        return self.cli.lstrip("-").replace("-", "_")
-
-    def __post_init__(self) -> None:
-        if self.cli_action and not self.cli:
-            raise ValueError(f"_Option({self.name!r}): cli_action requires cli")
-        if self.ini_type and not self.ini:
-            raise ValueError(f"_Option({self.name!r}): ini_type requires ini")
-        if self.merge and not self.toml:
-            raise ValueError(f"_Option({self.name!r}): merge=True needs toml")
-        if not any([self.cli, self.ini, self.toml, self.env]):
-            raise ValueError(f"_Option({self.name!r}): declares no surfaces")
-        if self.category not in _CATEGORIES:
-            raise ValueError(f"_Option({self.name!r}): category must be one of {_CATEGORIES}")
-
-    def resolve(self, config: pytest.Config | None) -> Any:
-        """First set value from declared surfaces; ``None`` when unset everywhere.
-
-        Walk order is env > cli > ini > toml. No current option declares both
-        env and cli, so the chain isn't ambiguous in practice.
-        ``getini`` returns the typed default for unset bool/list keys, so this
-        only returns ini values for booleans (always meaningful), non-empty
-        strings, and non-empty lists.
-        """
-        if self.env:
-            env_value = os.getenv(self.env)
-            if env_value not in (None, ""):
-                return env_value
-        if config is None:
-            return None
-        if self.cli:
-            cli_value = config.getoption(self.cli_dest, default=None)
-            if cli_value is not None:
-                return cli_value
-        if self.ini:
-            try:
-                ini_value = config.getini(self.ini)
-            except (KeyError, ValueError):
-                ini_value = None
-            if isinstance(ini_value, bool):
-                return ini_value
-            if isinstance(ini_value, str) and ini_value:
-                return ini_value
-            if isinstance(ini_value, list) and ini_value:
-                return ini_value
-        if self.toml:
-            toml_value = _walk_toml(_tool_sift(config), self.toml)
-            if toml_value not in (None, ""):
-                return toml_value
-        return None
-
-    def resolve_merged(self, config: pytest.Config | None) -> dict[str, str | float | bool]:
-        """For ``merge=True`` dict-shape settings: the free-form TOML table.
-
-        TOML values that don't fit ``dict[str, str | float | bool]`` (nested
-        tables, lists, ``None``) are dropped with a warning so a malformed
-        entry can't crash report creation.
-        """
-        result: dict[str, str | float | bool] = {}
-        if config is not None and self.toml:
-            base = _walk_toml(_tool_sift(config), self.toml)
-            if isinstance(base, dict):
-                for key, value in base.items():
-                    if not isinstance(key, str):
-                        continue
-                    if isinstance(value, (bool, str, int, float)):
-                        # ``bool`` first since ``isinstance(True, int)`` is True.
-                        result[key] = value  # type: ignore[assignment]
-                        continue
-                    warnings.warn(
-                        f"[tool.sift.{'.'.join(self.toml)}] entry {key!r} ignored: "
-                        f"unsupported type {type(value).__name__}.",
-                        SiftPytestPluginWarning,
-                        stacklevel=2,
-                    )
-        return result
-
-
-def _walk_toml(data: dict[str, Any], path: tuple[str, ...]) -> Any:
-    """Walk a parsed TOML tree along ``path``; return None on any missing key."""
-    cur: Any = data
-    for key in path:
-        if not isinstance(cur, dict):
-            return None
-        cur = cur.get(key)
-        if cur is None:
-            return None
-    return cur
-
-
-# ---------------------------------------------------------------------------
-# Settings registry.
-#
-# Add new options here. The registry drives `pytest_addoption`, resolution,
-# the docs settings-reference table, and the unknown-key typo detector, so a
-# setting is declared once instead of wired up in several places.
-#
-# Where each setting lives follows a few principles:
-#   - Secrets (the API key) come from environment variables only, never a
-#     committed file.
-#   - Pytest behavior lives in [tool.pytest.ini_options] so it integrates with
-#     `pytest --help` / `--co` / `--trace-config`.
-#   - Sift report content lives in [tool.sift.pytest.report.*].
-#   - Non-secret endpoints take an env var plus one static home (ini or toml,
-#     not both).
-#   - A CLI flag is added only when there is a real per-run override workflow;
-#     stable project config stays in ini/toml.
-#   - Dynamic per-run values are injected via environment variables (pytest-dotenv
-#     loads .env for local dev; CI sets the same names from its secret store).
-# ---------------------------------------------------------------------------
-
-# Pytest behavior. The CLI flag survives because the per-run override is real.
-_LOG_FILE = _Option(
-    name="log_file",
-    category=_CAT_BEHAVIOR,
-    help="Path to the JSONL log of create/update calls (path | true | false | none).",
-    cli="--sift-log-file",
-    ini="sift_log_file",
-)
-_GIT_METADATA = _Option(
-    name="git_metadata",
-    category=_CAT_BEHAVIOR,
-    help="Capture git repo/branch/commit on the report.",
-    cli="--no-sift-git-metadata",
-    cli_action="store_false",
-    ini="sift_git_metadata",
-    ini_type="bool",
-    ini_default=True,
-)
-_OFFLINE = _Option(
-    name="offline",
-    category=_CAT_BEHAVIOR,
-    help="Skip the session-start ping; route create/update through the JSONL log.",
-    cli="--sift-offline",
-    cli_action="store_true",
-    ini="sift_offline",
-    ini_type="bool",
-    ini_default=False,
-)
-_DISABLED = _Option(
-    name="disabled",
-    category=_CAT_BEHAVIOR,
-    help="Disable Sift entirely (no API calls, no log file). Supersedes --sift-offline.",
-    cli="--sift-disabled",
-    cli_action="store_true",
-    ini="sift_disabled",
-    ini_type="bool",
-    ini_default=False,
-)
-
-_OPEN = _Option(
-    name="open_report",
-    category=_CAT_BEHAVIOR,
-    help="Open the resulting report in a browser at session end (online only; "
-    "no-op when the report URL can't be resolved).",
-    cli="--sift-open-report",
-    cli_action="store_true",
-    ini="sift_open_report",
-    ini_type="bool",
-    ini_default=False,
-)
-
-# Pytest behavior: set-once project defaults (no CLI flag — no per-run override).
-_AUTOUSE = _Option(
-    name="autouse",
-    category=_CAT_BEHAVIOR,
-    help="Default for the Sift autouse fixtures (report_context, step, hierarchy/parametrize parents).",
-    ini="sift_autouse",
-    ini_type="bool",
-    ini_default=True,
-)
-_PACKAGE_STEP = _Option(
-    name="package_step",
-    category=_CAT_BEHAVIOR,
-    help="Open a parent step for each Python package in the test path.",
-    ini="sift_package_step",
-    ini_type="bool",
-    ini_default=True,
-)
-_MODULE_STEP = _Option(
-    name="module_step",
-    category=_CAT_BEHAVIOR,
-    help="Open a parent step for each test module.",
-    ini="sift_module_step",
-    ini_type="bool",
-    ini_default=True,
-)
-_CLASS_STEP = _Option(
-    name="class_step",
-    category=_CAT_BEHAVIOR,
-    help="Open per-class parent steps, including nested classes.",
-    ini="sift_class_step",
-    ini_type="bool",
-    ini_default=True,
-)
-_PARAMETRIZE_NESTING = _Option(
-    name="parametrize_nesting",
-    category=_CAT_BEHAVIOR,
-    help="Cluster parametrized tests under shared parent steps (e.g. test_a -> v=1, v=2).",
-    ini="sift_parametrize_nesting",
-    ini_type="bool",
-    ini_default=True,
-)
-
-# Credentials. The API key is env-only; the URIs accept env + ini.
-_API_KEY = _Option(
-    name="api_key",
-    category=_CAT_CONNECTION,
-    help="Sift API key (secret, env-only).",
-    env="SIFT_API_KEY",
-)
-_GRPC_URI = _Option(
-    name="grpc_uri",
-    category=_CAT_CONNECTION,
-    help="Sift gRPC endpoint URI.",
-    env="SIFT_GRPC_URI",
-    ini="sift_grpc_uri",
-)
-_REST_URI = _Option(
-    name="rest_uri",
-    category=_CAT_CONNECTION,
-    help="Sift REST endpoint URI.",
-    env="SIFT_REST_URI",
-    ini="sift_rest_uri",
-)
-_APP_URL = _Option(
-    name="app_url",
-    category=_CAT_CONNECTION,
-    help="Sift web-app origin for the report link in the terminal footer (e.g. "
-    "https://app.siftstack.com). When unset, the link is derived from the REST URI "
-    "for known Sift hosts.",
-    env="SIFT_APP_URL",
-    ini="sift_app_url",
-)
-
-# Report content. Project defaults in [tool.sift.pytest.report]; CI injects
-# per-run values via SIFT_REPORT_* env vars (pytest-dotenv handles .env files
-# for local dev).
-_REPORT_NAME = _Option(
-    name="report_name",
-    category=_CAT_REPORT,
-    help="Template for the report display name. Placeholders: {target}, {command}, {args}, "
-    "{rootdir}, {timestamp}, {count}, {git_repo}, {git_branch}, {git_commit}.",
-    toml=("pytest", "report", "name"),
-)
-_TEST_CASE = _Option(
-    name="test_case",
-    category=_CAT_REPORT,
-    help="Template for the report's test_case field (same placeholders as report_name).",
-    toml=("pytest", "report", "test_case"),
-)
-_TEST_SYSTEM_NAME = _Option(
-    name="test_system_name",
-    category=_CAT_REPORT,
-    help="Name of the test system / rig. Defaults to the host's name.",
-    env="SIFT_REPORT_TEST_SYSTEM_NAME",
-    toml=("pytest", "report", "test_system_name"),
-)
-_SYSTEM_OPERATOR = _Option(
-    name="system_operator",
-    category=_CAT_REPORT,
-    help="Operator running the test. Defaults to the OS user.",
-    env="SIFT_REPORT_SYSTEM_OPERATOR",
-    toml=("pytest", "report", "system_operator"),
-)
-_SERIAL_NUMBER = _Option(
-    name="serial_number",
-    category=_CAT_REPORT,
-    help="Serial number of the unit under test.",
-    env="SIFT_REPORT_SERIAL_NUMBER",
-    toml=("pytest", "report", "serial_number"),
-)
-_PART_NUMBER = _Option(
-    name="part_number",
-    category=_CAT_REPORT,
-    help="Part number of the unit under test.",
-    env="SIFT_REPORT_PART_NUMBER",
-    toml=("pytest", "report", "part_number"),
-)
-_METADATA = _Option(
-    name="metadata",
-    category=_CAT_REPORT,
-    help="Free-form report metadata, as a TOML table of scalar values. For "
-    "dynamic per-run keys, attach them in conftest via the report_context fixture.",
-    toml=("pytest", "report", "metadata"),
-    merge=True,
-)
-
-_OPTIONS: tuple[_Option, ...] = (
-    _LOG_FILE,
-    _GIT_METADATA,
-    _OFFLINE,
-    _DISABLED,
-    _OPEN,
-    _AUTOUSE,
-    _PACKAGE_STEP,
-    _MODULE_STEP,
-    _CLASS_STEP,
-    _PARAMETRIZE_NESTING,
-    _API_KEY,
-    _GRPC_URI,
-    _REST_URI,
-    _APP_URL,
-    _REPORT_NAME,
-    _TEST_CASE,
-    _TEST_SYSTEM_NAME,
-    _SYSTEM_OPERATOR,
-    _SERIAL_NUMBER,
-    _PART_NUMBER,
-    _METADATA,
-)
-
-
-def pytest_addoption(parser: pytest.Parser) -> None:
-    """Register every CLI flag and pytest ini key declared in ``_OPTIONS``.
-
-    One loop drives both surfaces — adding a setting is one entry in the
-    registry, not three edits across this function and a docs table.
-    """
-    group = parser.getgroup("sift", description="Sift test results")
-    for opt in _OPTIONS:
-        if opt.cli is not None:
-            cli_kwargs: dict[str, Any] = {
-                "dest": opt.cli_dest,
-                "default": None,
-                "help": opt.help,
-            }
-            if opt.cli_action is not None:
-                cli_kwargs["action"] = opt.cli_action
-            group.addoption(opt.cli, **cli_kwargs)
-        if opt.ini is not None:
-            ini_kwargs: dict[str, Any] = {"help": opt.help, "default": opt.ini_default}
-            if opt.ini_type is not None:
-                ini_kwargs["type"] = opt.ini_type
-            parser.addini(opt.ini, **ini_kwargs)
-
-
-def pytest_configure(config: pytest.Config) -> None:
-    """Register the Sift gate markers and warn on unknown ``SIFT_*`` settings."""
-    config.addinivalue_line(
-        "markers",
-        "sift_include: force the Sift autouse fixtures to activate for this test "
-        "regardless of the `sift_autouse` ini default.",
-    )
-    config.addinivalue_line(
-        "markers",
-        "sift_exclude: force the Sift autouse fixtures to skip this test "
-        "regardless of the `sift_autouse` ini default.",
-    )
-    # Surface typos in env vars and [tool.sift...] keys at session start so a
-    # silent no-op (env var that doesn't match anything, table key the loader
-    # ignores) becomes visible. The registry is the source of truth for what's
-    # known.
-    _warn_on_unknown_env_vars()
-    _warn_on_unknown_toml_keys(config)
-
-
-def _render_settings_reference() -> str:
-    """Render the Markdown settings reference from ``_OPTIONS``.
-
-    One ``### <category>`` subsection per category, each table showing only the
-    columns that category uses (so no dead all-``—`` columns). The plugin docs
-    at ``docs/guides/pytest_plugin/configuration.md`` embed this output verbatim
-    so the registry and the docs can't drift;
-    ``test_settings_reference_docs_in_sync`` is the guard rail. Regenerate with::
-
-        uv run python -c "from sift_client.pytest_plugin import _render_settings_reference; print(_render_settings_reference())"
-    """
-
-    def _cli_cell(opt: _Option) -> str:
-        return f"`{opt.cli}`" if opt.cli else "—"
-
-    def _ini_cell(opt: _Option) -> str:
-        return f"`{opt.ini}`" if opt.ini else "—"
-
-    def _toml_cell(opt: _Option) -> str:
-        if not opt.toml:
-            return "—"
-        if opt.merge:
-            return f"`[tool.sift.{'.'.join(opt.toml)}]` (table)"
-        section = ".".join(opt.toml[:-1])
-        return f"`[tool.sift.{section}] {opt.toml[-1]}`"
-
-    def _env_cell(opt: _Option) -> str:
-        if opt.env:
-            return f"`{opt.env}`"
-        return "—"
-
-    # Per-category column layout: only the surfaces that category actually uses.
-    # Each column is (header, cell-renderer).
-    columns_by_category = {
-        _CAT_BEHAVIOR: [
-            ("CLI flag", _cli_cell),
-            ("Ini (`[tool.pytest.ini_options]`)", _ini_cell),
-        ],
-        _CAT_CONNECTION: [
-            ("Ini (`[tool.pytest.ini_options]`)", _ini_cell),
-            ("Env var", _env_cell),
-        ],
-        _CAT_REPORT: [
-            ("TOML (`[tool.sift...]`)", _toml_cell),
-            ("Env var", _env_cell),
-        ],
-    }
-
-    def _escape(cell: str) -> str:
-        # Literal pipes inside a Markdown table cell need backslash escaping or
-        # they'd be parsed as column separators.
-        return cell.replace("|", "\\|")
-
-    blocks: list[str] = []
-    for category in _CATEGORIES:
-        opts = [o for o in _OPTIONS if o.category == category]
-        if not opts:
-            continue
-        columns = columns_by_category[category]
-        headers = ["Setting", *(h for h, _ in columns)]
-        lines = [
-            f"### {category}",
-            "",
-            "| " + " | ".join(headers) + " |",
-            "|" + "|".join(["---"] * len(headers)) + "|",
-        ]
-        for opt in opts:
-            cells = [opt.help, *(render(opt) for _, render in columns)]
-            lines.append("| " + " | ".join(_escape(c) for c in cells) + " |")
-        blocks.append("\n".join(lines))
-    return "\n\n".join(blocks)
-
-
-def _warn_on_unknown_env_vars() -> None:
-    """Emit a warning for any ``SIFT_*`` env var not declared in the registry.
-
-    The registry declares each env var by its full name (``opt.env``); a
-    ``SIFT_*`` var that matches none of them is almost always a typo.
-    """
-    import difflib
-
-    known_full = {opt.env for opt in _OPTIONS if opt.env}
-    suggestion_pool = sorted(known_full)
-    for name in sorted(os.environ):
-        if not name.startswith("SIFT_"):
-            continue
-        if name in known_full:
-            continue
-        close = difflib.get_close_matches(name, suggestion_pool, n=1, cutoff=0.6)
-        hint = f" (did you mean `{close[0]}`?)" if close else ""
-        warnings.warn(
-            f"Unknown SIFT_* env var `{name}`{hint}; ignored.",
-            SiftPytestPluginWarning,
-            stacklevel=2,
-        )
-
-
-def _warn_on_unknown_toml_keys(config: pytest.Config) -> None:
-    """Walk ``[tool.sift.pytest.*]`` in pyproject.toml and warn on keys outside the registry.
-
-    Only the ``tool.sift.pytest`` subtree is checked. Other ``tool.sift.*``
-    subtrees are reserved for non-pytest Sift tooling (e.g. ``tool.sift.extras``
-    is consumed by this repo's extras-generation script) and aren't our
-    concern. Free-form subtrees (``merge=True`` options like ``metadata``)
-    stop the walk — their keys are user-defined and not validated.
-    """
-    import difflib
-
-    data = _tool_sift(config)
-    pytest_table = (data or {}).get("pytest")
-    if not isinstance(pytest_table, dict):
-        return
-    # Build leaf/free-form/prefix sets relative to the ``("pytest", ...)`` root
-    # the registry already uses, so the walk runs on the table we just sliced.
-    leaves = {opt.toml for opt in _OPTIONS if opt.toml and not opt.merge}
-    free_form = {opt.toml for opt in _OPTIONS if opt.toml and opt.merge}
-    prefixes: set[tuple[str, ...]] = set()
-    for full in leaves | free_form:
-        for i in range(len(full)):
-            prefixes.add(full[:i])
-
-    def _walk(node: Any, base: tuple[str, ...]) -> None:
-        if base in free_form or not isinstance(node, dict):
-            return
-        for key, value in node.items():
-            path = (*base, str(key))
-            if path in leaves or path in free_form:
-                continue
-            if path in prefixes:
-                _walk(value, path)
-                continue
-            full_name = "tool.sift." + ".".join(path)
-            same_depth = [
-                ".".join(p) for p in (leaves | free_form | prefixes) if len(p) == len(path)
-            ]
-            close = difflib.get_close_matches(".".join(path), same_depth, n=1, cutoff=0.6)
-            hint = f" (did you mean `tool.sift.{close[0]}`?)" if close else ""
-            warnings.warn(
-                f"Unknown sift config key `{full_name}`{hint}; ignored.",
-                SiftPytestPluginWarning,
-                stacklevel=2,
-            )
-
-    _walk(pytest_table, ("pytest",))
-
-
-def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
-    """Stash each item's class chain + parametrize path and cluster siblings.
-
-    Sorts by ``(file_path, hierarchy_chain, parametrize_path)`` so sibling
-    items under a shared parent (package, module, class, or parametrize axis)
-    stay contiguous — otherwise a free function sorting between two class
-    methods would tear down + re-open the class step, producing duplicate
-    parents in the report tree.
-    """
-    for item in items:
-        item.stash[_HIERARCHY_KEY] = _build_hierarchy_chain(item, config)
-        item.stash[_PARAMETRIZE_PATH_KEY] = _build_parametrize_path(item)
-    # Use ``.get(...)`` defensively: a third-party hook may inject items after
-    # our stashing loop runs, and we'd rather sort them at the tail than
-    # KeyError out of collection.
-    items.sort(
-        key=lambda i: (
-            str(i.path),
-            tuple(identity for identity, _, _, _ in i.stash.get(_HIERARCHY_KEY, ())),
-            i.stash.get(_PARAMETRIZE_PATH_KEY, ()),
-        )
-    )
-
-
-def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
-    """Drain any parent steps still open at session end (innermost first).
-
-    Wrapped so a failure in the inner drain does not prevent the outer one
-    from running. With ``module_substep`` removed, this is the sole place
-    where hierarchy parents close — they persist across all tests and only
-    drain when the session ends.
-    """
-    try:
-        _drain_parametrize_stack()
-    finally:
-        _drain_hierarchy_stack()
-
-
-def _is_offline(pytestconfig: pytest.Config | None) -> bool:
-    return bool(_OFFLINE.resolve(pytestconfig))
-
-
-def _is_disabled(pytestconfig: pytest.Config | None) -> bool:
-    return bool(_DISABLED.resolve(pytestconfig))
-
-
-def _sdk_version() -> str:
-    """Return the installed ``sift_stack_py`` version, or ``"unknown"``."""
-    from importlib.metadata import PackageNotFoundError, version
-
-    try:
-        return version("sift_stack_py")
-    except PackageNotFoundError:
-        return "unknown"
-
-
-def _mode_label(config: pytest.Config) -> str:
-    """Resolve the active mode for the terminal header: disabled > offline > online."""
-    if _is_disabled(config):
-        return "disabled"
-    if _is_offline(config):
-        return "offline"
-    return "online"
-
-
-def pytest_report_header(config: pytest.Config) -> str | None:
-    """Emit a session-start header with the SDK version and active mode.
-
-    Suppressed under ``-q`` (negative verbosity), matching how pytest hides its
-    own platform/plugin header.
-    """
-    if config.get_verbosity() < 0:
-        return None
-    return f"Sift: sift-stack-py {_sdk_version()} — {_mode_label(config)} mode"
+"""Sift pytest plugin: records each test as a step in a Sift test report.
 
+Load it from a project's ``conftest.py``::
 
-def _resolve_real_report_id(context: Any) -> str | None:
-    """Resolve the real server-side report id for the online footer link.
-
-    In synchronous online mode (``--sift-log-file=false``) the report is created
-    directly against the API, so ``report.id_`` is already the real id. In the
-    default incremental mode the report is created through the simulate path
-    (a client-side UUID) and the background worker maps it to the real id on
-    replay, recording it in the ``<log>.tracking`` sidecar's ``id_map``. By the
-    time this footer runs the session-scoped report context has torn down and
-    the worker has drained, so the sidecar is final.
-
-    Returns ``None`` when the worker never mapped the report (e.g. it died before
-    replaying the create), meaning no real report exists to link.
-    """
-    report = context.report
-    if not report.id_:
-        # No id was ever assigned (unset/empty); nothing to link.
-        return None
-    sim_id = str(report.id_)
-    if not getattr(report, "is_simulated", False):
-        return sim_id
-    log_file = getattr(context, "log_file", None)
-    if log_file is None:
-        return None
-    from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
-
-    return LogTracking.load(log_file).id_map.get(sim_id)
-
-
-_LABEL_WIDTH = 13
-
-
-def _sift_kv(terminalreporter: Any, label: str, value: str, **value_markup: bool) -> None:
-    """Write an indented ``label  value`` row, bolding the label.
-
-    ``value_markup`` (e.g. ``green=True``, ``cyan=True``) styles only the value.
-    Color is dropped automatically when the terminal has no markup (not a TTY or
-    ``--color=no``), so captured/CI output stays plain text.
-    """
-    terminalreporter.write("  ")
-    terminalreporter.write(f"{label:<{_LABEL_WIDTH}}", bold=True)
-    terminalreporter.write_line(value, **value_markup)
-
-
-# Step-count breakdown order and labels for the footer's "Steps" row.
-_STEP_COUNT_ORDER: tuple[tuple[TestStatus, str], ...] = (
-    (TestStatus.PASSED, "passed"),
-    (TestStatus.FAILED, "failed"),
-    (TestStatus.ERROR, "error"),
-    (TestStatus.ABORTED, "aborted"),
-    (TestStatus.SKIPPED, "skipped"),
-    (TestStatus.IN_PROGRESS, "in progress"),
-)
-
-
-# Per-status color for the footer's step breakdown: green pass, red
-# failure/error/abort, yellow skip; in-progress (and anything else) stays plain.
-_STEP_STATUS_MARKUP: dict[TestStatus, dict[str, bool]] = {
-    TestStatus.PASSED: {"green": True},
-    TestStatus.FAILED: {"red": True},
-    TestStatus.ERROR: {"red": True},
-    TestStatus.ABORTED: {"red": True},
-    TestStatus.SKIPPED: {"yellow": True},
-}
-
-
-def _step_count_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
-    """Build ``(text, markup)`` segments for a step tally, non-zero only."""
-    return [
-        (f"{counts.get(status, 0)} {label}", _STEP_STATUS_MARKUP.get(status, {}))
-        for status, label in _STEP_COUNT_ORDER
-        if counts.get(status, 0)
-    ]
-
-
-def _measurement_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
-    """Build ``(text, markup)`` segments for a measurement tally, non-zero only."""
-    segments: list[tuple[str, dict[str, bool]]] = []
-    if counts.get(True, 0):
-        segments.append((f"{counts[True]} passed", {"green": True}))
-    if counts.get(False, 0):
-        segments.append((f"{counts[False]} failed", {"red": True}))
-    return segments
-
-
-def _write_count_row(
-    terminalreporter: Any, label: str, segments: list[tuple[str, dict[str, bool]]]
-) -> None:
-    """Write a ``label  a · b · c`` row, applying each segment's color markup."""
-    terminalreporter.write("  ")
-    terminalreporter.write(f"{label:<{_LABEL_WIDTH}}", bold=True)
-    for index, (text, markup) in enumerate(segments):
-        if index:
-            terminalreporter.write(" · ")
-        terminalreporter.write(text, **markup)
-    terminalreporter.write_line("")
-
-
-def _report_panel_title(report: Any, terminalreporter: Any) -> str:
-    """``Sift report · <name>`` for the section rule, truncated to the terminal width.
-
-    The report name embeds a timestamp (and, for invocation-based runs, the
-    pytest args), so a long name is truncated with an ellipsis to keep the
-    separator line from wrapping.
-    """
-    base = "Sift report"
-    name = getattr(report, "name", None)
-    if not name:
-        return base
-    title = f"{base} · {name}"
-    fullwidth = getattr(getattr(terminalreporter, "_tw", None), "fullwidth", 80)
-    # Reserve room for the separator characters and spaces write_sep adds.
-    limit = max(len(base), fullwidth - 8)
-    if len(title) > limit:
-        title = title[: limit - 1] + "…"
-    return title
-
-
-def _maybe_open_report(url: str) -> None:
-    """Best-effort open the report URL in a browser (for ``--sift-open-report``).
-
-    Skipped on CI or non-interactive sessions so a committed ``sift_open_report``
-    setting can't spawn a browser on a headless agent; the flag is meant for
-    local development.
-    """
-    import sys
-    import webbrowser
-
-    if os.environ.get("CI") or not sys.stdout.isatty():
-        return
-    try:
-        # webbrowser.open forks/execs the platform opener while the gRPC client's
-        # background threads are live; redirect fd 2 across the fork to swallow
-        # gRPC's prefork notice (same treatment as the plugin's other fork sites).
-        with _quiet_fork_stderr():
-            webbrowser.open(url)
-    except Exception:
-        # Headless / no browser available: opening is a convenience, never fatal.
-        pass
-
-
-def pytest_terminal_summary(terminalreporter: Any, exitstatus: int, config: pytest.Config) -> None:
-    """Emit a session-end Sift report summary, adapting per mode.
-
-    The printed panel is suppressed under ``-q``, but programmatic side effects
-    (stashing the report ref for ``conftest.py``, ``--sift-open-report``) still run so
-    other plugins and CI steps can consume the result. The panel shows the
-    outcome (green/red), step and measurement tallies, and a per-mode action: a
-    report link (online), the upload command (offline), or a disabled note.
-    """
-    quiet = config.get_verbosity() < 0
-
-    if _is_disabled(config):
-        if not quiet:
-            terminalreporter.write_sep("=", "Sift", cyan=True, bold=True)
-            terminalreporter.write_line("Sift disabled — no test report created.")
-        return
-
-    context = REPORT_CONTEXT
-    if context is None:
-        # No gated test ran, so no report context was created. Nothing to show.
-        return
-
-    log_file = getattr(context, "log_file", None)
-    offline = _is_offline(config)
-
-    # Resolve the report link first so stashing and --sift-open-report run even under
-    # -q (programmatic consumers don't care about verbosity). Truthiness, not
-    # ``is not None``: a resolved-but-empty id (degenerate sidecar mapping, unset
-    # proto field) must fall through to the "not uploaded" path, not produce a
-    # ``/test-results/`` link.
-    report_id = None if offline else _resolve_real_report_id(context)
-    report_url = (
-        f"{context.client.app_url}/test-results/{report_id}"
-        if report_id and context.client.app_url
-        else None
-    )
-    if report_id:
-        config.stash[SIFT_REPORT_ID_STASH_KEY] = report_id
-    if report_url is not None:
-        config.stash[SIFT_REPORT_URL_STASH_KEY] = report_url
-        if _OPEN.resolve(config):
-            _maybe_open_report(report_url)
-
-    if quiet:
-        return
-
-    failed = bool(getattr(context, "any_failures", False))
-    status_word, status_markup = (
-        ("FAILED", {"red": True, "bold": True})
-        if failed
-        else ("PASSED", {"green": True, "bold": True})
-    )
-    # Offline results live only in the local log until replayed, so the status
-    # row calls that out instead of repeating the version (already in the header).
-    status_context = (
-        f"{_mode_label(config)} · not uploaded"
-        if offline
-        else f"{_mode_label(config)} · sift-stack-py {_sdk_version()}"
-    )
-
-    report = context.report
-
-    terminalreporter.write_sep(
-        "=", _report_panel_title(report, terminalreporter), cyan=True, bold=True
-    )
-
-    # Identity row: the test case (test path or pytest invocation).
-    if report.test_case:
-        _sift_kv(terminalreporter, "Test case", str(report.test_case))
-
-    # Status row: colored outcome, then compact mode context.
-    terminalreporter.write("  ")
-    terminalreporter.write(f"{'Status':<{_LABEL_WIDTH}}", bold=True)
-    terminalreporter.write(status_word, **status_markup)
-    terminalreporter.write_line(f"      {status_context}")
-
-    # Step + measurement tallies (green pass, red failure, yellow skip).
-    _write_count_row(
-        terminalreporter,
-        "Steps",
-        _step_count_segments(context.step_status_counts) or [("no steps", {})],
-    )
-    measurement_segments = _measurement_segments(context.measurement_counts)
-    if measurement_segments:
-        _write_count_row(terminalreporter, "Measurements", measurement_segments)
-
-    # Provenance row: test system and operator.
-    system = " · ".join(part for part in (report.test_system_name, report.system_operator) if part)
-    if system:
-        _sift_kv(terminalreporter, "System", system)
-
-    # Local log file (write-through backup online, sole sink offline).
-    if log_file is not None:
-        _sift_kv(terminalreporter, "Log file", str(log_file))
-
-    if offline:
-        if log_file is not None:
-            terminalreporter.write_sep("-", "to upload to Sift")
-            terminalreporter.write_line(f"  >> import-test-result-log {log_file}", cyan=True)
-        return
-
-    if not report_id:
-        # Incremental upload never mapped the report (the worker died before
-        # replaying the create), so there's no real report to link.
-        _sift_kv(
-            terminalreporter,
-            "Report",
-            f"not uploaded — replay with: import-test-result-log {log_file}",
-            yellow=True,
-        )
-    elif report_url is not None:
-        _sift_kv(terminalreporter, "Report", report_url, cyan=True)
-    else:
-        _sift_kv(
-            terminalreporter,
-            "Report",
-            f"id {report_id}  (set sift_app_url for a clickable link)",
-        )
-
-    if report_id and getattr(context, "replay_incomplete", False) and log_file is not None:
-        _sift_kv(
-            terminalreporter,
-            "",
-            f"may be incomplete — finish with: import-test-result-log {log_file}",
-            yellow=True,
-        )
-
-
-def _sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
-    """Resolve the Sift gate for a node: sift_exclude > sift_include > default.
-
-    `get_closest_marker` walks the node hierarchy upward, so markers applied
-    at any level (function, class, module, package, session) are honored.
-    """
-    if node.get_closest_marker("sift_exclude"):
-        return False
-    if node.get_closest_marker("sift_include"):
-        return True
-    return default
-
-
-def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
-    """Determine log_file value from CLI flag or ini key.
-
-    Three signal types arrive here:
-
-    * ``None`` — unset; nothing was passed on the CLI and the ini key is
-      absent. Treat as the default "use a temp file."
-    * Python ``False`` — an explicit disable, typically set in a conftest via
-      ``config.option.sift_log_file = False``. Return ``None`` so
-      the rest of the pipeline knows to skip logging entirely.
-    * A string (from CLI or ini) — interpret ``"true"`` / ``"1"`` as the temp
-      file default, ``"false"`` / ``"none"`` as disable, anything else as a
-      file path.
-
-    Rejects ``--sift-log-file=none`` combined with ``--sift-offline`` since
-    offline mode needs the log file as its sole sink.
-    """
-    raw = _LOG_FILE.resolve(pytestconfig)
-    disabled = raw is False or (isinstance(raw, str) and raw.lower() in ("false", "none"))
-    if disabled and _is_offline(pytestconfig):
-        raise pytest.UsageError(
-            "--sift-log-file=none is incompatible with --sift-offline; offline "
-            "mode requires a log file. Pin one with --sift-log-file=<path>, or "
-            "drop --sift-log-file=none to use a temp file."
-        )
-    if raw is False:
-        return None
-    if not raw:
-        return True
-    lower = str(raw).lower()
-    if lower in ("true", "1"):
-        return True
-    if lower in ("false", "none"):
-        return None
-    return Path(raw)
-
-
-def _error_info_from_longrepr(longrepr: Any) -> ErrorInfo:
-    """Fall back to the report's longrepr when no Python exception is available."""
-    return ErrorInfo(error_code=1, error_message=str(longrepr) if longrepr is not None else "")
-
-
-def _resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
-    """Resolve the function step's status from pytest's per-phase reports.
-
-    Reads ``_sift_phase_setup`` / ``_sift_phase_call`` and the test's xfail marker,
-    then mutates ``new_step.current_step`` in place and flips
-    ``new_step._sift_managed_externally`` so ``NewStep.__exit__`` emits the
-    resolved status without re-classifying.
-
-    When the call phase reports ``passed`` and no override is needed (i.e. the
-    test's own status or substep failures should drive the result), this leaves
-    the step alone so the default ``__exit__`` resolution stays in charge.
-    """
-    current_step = new_step.current_step
-    if current_step is None:
-        # The step never opened (the autouse fixture short-circuited or was
-        # disabled). Nothing to resolve.
-        return
-    setup_phase = getattr(item, "_sift_phase_setup", None)
-    call_phase = getattr(item, "_sift_phase_call", None)
-    xfail_marker = item.get_closest_marker("xfail")
-    xfail_runs = xfail_marker.kwargs.get("run", True) if xfail_marker is not None else True
-
-    status: TestStatus | None = None
-    error_info: ErrorInfo | None = None
-    keep_managed = False
-
-    if setup_phase is not None and setup_phase.report.outcome == "failed":
-        status = TestStatus.ERROR
-        excinfo = setup_phase.call.excinfo
-        if excinfo is not None:
-            error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
-        else:
-            error_info = _error_info_from_longrepr(setup_phase.report.longrepr)
-    elif setup_phase is not None and setup_phase.report.outcome == "skipped":
-        status = TestStatus.SKIPPED
-    elif call_phase is None:
-        # Setup completed but the call-phase report never fired — the inner
-        # pytester session was aborted (e.g. by KeyboardInterrupt) before the
-        # plugin could observe the outcome. Leave the step at IN_PROGRESS so
-        # the report does not lie about a clean pass.
-        keep_managed = True
-    else:
-        wasxfail = getattr(call_phase.report, "wasxfail", None)
-        if wasxfail is not None:
-            if call_phase.report.outcome == "failed":
-                # Strict xpass: pytest synthesizes a failure when an xfail(strict=True)
-                # test unexpectedly passes. The xfail mark no longer matches reality.
-                status = TestStatus.FAILED
-            elif call_phase.report.outcome == "skipped":
-                if xfail_marker is not None and xfail_runs is False:
-                    # xfail(run=False): the test body never executed.
-                    status = TestStatus.SKIPPED
-                else:
-                    # xfail + expected failure: the test fulfilled its xfail expectation.
-                    status = TestStatus.PASSED
-            else:
-                # Non-strict xpass: passes that weren't required to fail.
-                status = TestStatus.PASSED
-        elif call_phase.report.outcome == "passed":
-            # Default __exit__ resolves PASSED/FAILED from open_step_results and any
-            # status the test code may have set. Don't override it here.
-            return
-        elif call_phase.report.outcome == "skipped":
-            status = TestStatus.SKIPPED
-        elif call_phase.report.outcome == "failed":
-            excinfo = call_phase.call.excinfo
-            children_passed = new_step.report_context.open_step_results.get(
-                current_step.step_path, True
-            )
-            if excinfo is None:
-                status = TestStatus.FAILED
-            elif isinstance(excinfo.value, AssertionError):
-                status = TestStatus.FAILED
-                error_info = format_assertion_message(excinfo.type, excinfo.value)
-            elif isinstance(excinfo.value, pytest.fail.Exception):
-                status = TestStatus.FAILED
-            elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
-                # Hard exits the plugin can observe: pytest converted the
-                # raise into a call-phase report. The session-aborting variant
-                # (call_phase is None) lands earlier and stays IN_PROGRESS.
-                status = TestStatus.ABORTED
-                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
-            elif xfail_marker is not None:
-                # xfail(raises=X) with a non-matching exception: the contract failed.
-                status = TestStatus.FAILED
-                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
-            elif not children_passed:
-                # A substep already recorded the error and carries the traceback;
-                # the test step only inherits the child-failed signal.
-                status = TestStatus.FAILED
-            else:
-                status = TestStatus.ERROR
-                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
-
-    if status is None and not keep_managed:
-        return
-
-    if status is not None:
-        # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
-        current_step.__dict__["status"] = status
-        if error_info is not None:
-            current_step.__dict__["error_info"] = error_info
-    new_step._sift_managed_externally = True
-
-
-def _finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestReport) -> None:
-    """Upgrade a closed step to FAILED when the teardown phase failed.
-
-    The autouse step fixture has already exited by the time the teardown
-    makereport hook fires, so call ``step.update`` again to override the status
-    server-side and propagate the failure to the still-open parent step.
-    """
-    step: NewStep | None = getattr(item, "_sift_step", None)
-    if step is None:
-        return
-    current_step = step.current_step
-    if current_step is None:
-        return
-    if teardown_report.outcome == "failed" and current_step.status == TestStatus.PASSED:
-        current_step.update({"status": TestStatus.FAILED})
-        step.report_context.mark_step_failed_after_close(current_step)
-
-
-@pytest.hookimpl(tryfirst=True, hookwrapper=True)
-def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """Capture per-phase reports and finalize step status after teardown.
-
-    Stashes both ``rep_<when>`` (the ``CallInfo``, kept for pytest plugins that
-    expect that conventional attribute) and ``_sift_phase_<when>`` (a
-    ``SimpleNamespace(call, report)`` used by ``_resolve_initial_status``). The
-    collection-time skip path is strictly gated on ``_sift_step`` being unset
-    so it does not duplicate steps the fixture already created.
-    """
-    outcome = yield
-    report = outcome.get_result()
-    setattr(item, "rep_" + report.when, call)
-    setattr(item, "_sift_phase_" + report.when, SimpleNamespace(call=call, report=report))
+    pytest_plugins = ["sift_client.pytest_plugin"]
 
-    # Collection-time skip (``@pytest.mark.skip`` / ``skipif``): the autouse
-    # ``step`` fixture never runs, so the hook is the only place that can
-    # record a step. Presence of ``_sift_step`` is the "fixture ran" signal.
-    if (
-        REPORT_CONTEXT
-        and report.when == "setup"
-        and report.outcome == "skipped"
-        and getattr(item, "_sift_step", None) is None
-    ):
-        with REPORT_CONTEXT.new_step(name=item.name) as inline_step:
-            inline_step.current_step.update({"status": TestStatus.SKIPPED})
+This module holds only the plugin's public surface: the catchable warnings,
+the session-state globals a conftest may read, the fixtures a project can
+request or override, and pytest's hook entry points. The implementation
+(settings registry, step stacks, report construction, terminal formatting)
+lives under ``sift_client._internal.pytest_plugin``.
+"""
 
-    if report.when == "teardown":
-        _finalize_after_teardown(item, report)
+from __future__ import annotations
 
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Any, Generator
 
-def _relativize(path: Path, rootpath: Path) -> str:
-    """Path relative to rootdir, or the basename when it sits outside the tree."""
-    try:
-        rel = str(path.relative_to(rootpath))
-    except ValueError:
-        return path.name
-    return "" if rel == "." else rel
+import pytest
 
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.pytest_plugin.modes import (
+    gate_enabled,
+    is_disabled,
+    is_offline,
+    mode_label,
+    sdk_version,
+)
+from sift_client._internal.pytest_plugin.options import (
+    API_KEY_OPTION,
+    APP_URL_OPTION,
+    GRPC_URI_OPTION,
+    OPEN_OPTION,
+    REST_URI_OPTION,
+    register_options,
+    warn_on_unknown_env_vars,
+    warn_on_unknown_toml_keys,
+)
+from sift_client._internal.pytest_plugin.report import (
+    OFFLINE_DEFAULTS,
+    build_disabled_client,
+    finalize_after_teardown,
+    report_context_impl,
+    resolve_report_link,
+    step_impl,
+)
+from sift_client._internal.pytest_plugin.steps import (
+    build_hierarchy_chain,
+    build_parametrize_path,
+    drain_hierarchy_stack,
+    drain_parametrize_stack,
+    hierarchy_key,
+    parametrize_path_key,
+    reconcile_hierarchy,
+    reconcile_parametrize,
+)
+from sift_client._internal.pytest_plugin.terminal import (
+    maybe_open_report,
+    write_disabled_summary,
+    write_report_summary,
+)
+from sift_client.errors import SiftWarning
+from sift_client.sift_types.test_report import TestStatus
+
+if TYPE_CHECKING:
+    from sift_client.util.test_results import ReportContext
+    from sift_client.util.test_results.context_manager import NewStep
 
-def _strip_param(nodeid: str) -> str:
-    """Drop the trailing ``[param]`` from a nodeid, keeping ``file::Class::func``.
+__all__ = [
+    "REPORT_CONTEXT",
+    "SIFT_REPORT_ID_STASH_KEY",
+    "SIFT_REPORT_URL_STASH_KEY",
+    "SiftPytestPluginWarning",
+    "SiftPytestStepDrainError",
+    "SiftPytestStepDrainWarning",
+    "client_has_connection",
+    "report_context",
+    "sift_client",
+    "step",
+]
 
-    The parametrize id is a variation of the test, not its identity — leaving it
-    in would make a re-parametrization silently shift the grouping key. Splits on
-    the last ``::`` segment and cuts at its first ``[``; class/function names
-    never contain ``[``, so nested brackets in a param value can't confuse it.
-    """
-    head, sep, leaf = nodeid.rpartition("::")
-    leaf = leaf.split("[", 1)[0]
-    return f"{head}{sep}{leaf}"
+
+# ---------------------------------------------------------------------------
+# Public warnings.
+# ---------------------------------------------------------------------------
 
 
-def _derive_target(request: pytest.FixtureRequest, args: tuple[str, ...]) -> str:
-    """Describe what was run, from the collected items rather than the command line.
+class SiftPytestPluginWarning(SiftWarning):
+    """Base warning for issues raised by the Sift pytest plugin."""
 
-    Collection is the ground truth of selection — independent of flag order,
-    ``-k`` / ``-m`` filters, or which path form was typed. Every value is
-    anchored to the rootdir (project) name so the shape is uniform; granularity
-    narrows with the selection:
 
-    * a single test -> ``project/tests/test_motor.py::test_spin`` (param stripped)
-    * a single file -> ``project/tests/test_motor.py``
-    * many files    -> their common directory, ``project/tests/motor``
-    * whole tree / nothing collected / paths outside rootdir -> ``project``
+class SiftPytestStepDrainWarning(SiftPytestPluginWarning):
+    """A step's ``__exit__`` raised while the plugin was draining its stack.
 
-    The report is session-level and individual tests are its steps, so the
-    file/directory grain is the natural unit of "what ran" for the report
-    itself. The verbatim invocation stays available via ``{command}`` and the
-    ``pytest_command`` metadata key.
+    Surfaced at module-teardown or session-end so the drain can continue and
+    pytest test outcomes stay unaffected; the underlying exception is included
+    in the message for debugging.
     """
-    rootpath = request.config.rootpath
-    root = rootpath.name
-
-    def _anchor(rel: str) -> str:
-        return f"{root}/{rel}" if rel else root
-
-    items = list(getattr(request.session, "items", ()) or ())
-    if not items:
-        return root
-    if len(items) == 1:
-        return _anchor(_strip_param(items[0].nodeid))
-    paths = {p for p in (getattr(i, "path", None) for i in items) if p is not None}
-    if not paths:
-        return root
-    if len(paths) == 1:
-        return _anchor(_relativize(next(iter(paths)), rootpath))
-    try:
-        common = Path(os.path.commonpath([str(p) for p in paths]))
-    except ValueError:
-        # e.g. paths on different drives (Windows); fall back to the project.
-        return root
-    return _anchor(_relativize(common, rootpath))
 
 
-def _build_template_fields(
-    target: str,
-    command: str,
-    args: tuple[str, ...],
-    request: pytest.FixtureRequest,
-) -> dict[str, Any]:
-    """Build the placeholder mapping shared by the name and test_case templates."""
-    items = getattr(request.session, "items", ()) or ()
-    git = _git_metadata() or {}
-    return {
-        "target": target,
-        "command": command,
-        "args": " ".join(args),
-        "rootdir": request.config.rootpath.name,
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "count": len(items),
-        "git_repo": git.get("git_repo", ""),
-        "git_branch": git.get("git_branch", ""),
-        "git_commit": git.get("git_commit", ""),
-    }
+class SiftPytestStepDrainError(RuntimeError):
+    """Raised when mid-session drain fails, signaling a likely upstream invariant break."""
 
 
-def _format_template(
-    template: str,
-    fields: dict[str, Any],
-    *,
-    fallback: str,
-    option_label: str,
-) -> str:
-    """Format ``template`` with ``fields``; on bad input, warn and return ``fallback``.
+# ---------------------------------------------------------------------------
+# Public session state and stash keys.
+# ---------------------------------------------------------------------------
 
-    A bad template should never block test results from being recorded, so the
-    rendering errors collapse to a warning + fallback rather than aborting the
-    session.
-    """
-    try:
-        return template.format(**fields)
-    except (KeyError, IndexError, ValueError) as exc:
-        warnings.warn(
-            f"Invalid {option_label} template {template!r} ({exc}); using fallback.",
-            SiftPytestPluginWarning,
-            stacklevel=2,
-        )
-        return fallback
+REPORT_CONTEXT: Any = None
 
+# Set at session end with the resolved (real) report id/URL when online and
+# uploaded. Read from a project's conftest in a later hook (e.g.
+# ``pytest_unconfigure``) to post the link, write a file, etc.
+SIFT_REPORT_ID_STASH_KEY = pytest.StashKey[str]()
+SIFT_REPORT_URL_STASH_KEY = pytest.StashKey[str]()
 
-def _report_context_impl(
-    sift_client: SiftClient,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config | None = None,
-) -> Generator[ReportContext, None, None]:
-    args = request.config.invocation_params.args
-    # ``target`` is "what ran", derived from the collected items (see
-    # _derive_target) — invocation-independent, unlike parsing the command
-    # line. Both the display name and test_case default to it; the verbatim
-    # command stays available via {command} and the pytest_command metadata.
-    target = _derive_target(request, args)
-    command = "pytest " + " ".join(args) if args else "pytest"
-    fields = _build_template_fields(target, command, args, request)
-    name_template = _REPORT_NAME.resolve(pytestconfig) or "{target} {timestamp}"
-    name = _format_template(
-        name_template,
-        fields,
-        fallback=f"{target} {fields['timestamp']}",
-        option_label="sift_report_name",
-    )
-    test_case_template = _TEST_CASE.resolve(pytestconfig)
-    test_case = (
-        _format_template(
-            test_case_template,
-            fields,
-            fallback=target,
-            option_label="sift_test_case",
-        )
-        if test_case_template
-        else target
-    )
-    # Metadata starts from the [tool.sift.pytest.report.metadata] TOML table, and
-    # the auto-recorded pytest_command layers in last so the user can't
-    # accidentally overwrite it.
-    report_metadata: dict[str, str | float | bool] = {
-        **_METADATA.resolve_merged(pytestconfig),
-        "pytest_command": command,
-    }
-    # Mode → ReportContext flags:
-    #   online (default): log_file=<temp or user path>, replay_log_file=True
-    #   --sift-offline:   log_file=<temp or user path>, replay_log_file=False
-    #   --sift-disabled:  log_file=False,               replay_log_file=False
-    disabled = sift_client._simulate
-    offline = False if disabled else _is_offline(pytestconfig)
-    log_file: str | Path | bool | None = False if disabled else _resolve_log_file(pytestconfig)
-    include_git_metadata = bool(_GIT_METADATA.resolve(pytestconfig))
-    with ReportContext(
-        sift_client,
-        name=name,
-        test_case=test_case,
-        test_system_name=_TEST_SYSTEM_NAME.resolve(pytestconfig) or None,
-        system_operator=_SYSTEM_OPERATOR.resolve(pytestconfig) or None,
-        serial_number=_SERIAL_NUMBER.resolve(pytestconfig) or None,
-        part_number=_PART_NUMBER.resolve(pytestconfig) or None,
-        log_file=log_file,
-        include_git_metadata=include_git_metadata,
-        replay_log_file=not (disabled or offline),
-        metadata=report_metadata,
-    ) as context:
-        global REPORT_CONTEXT
-        REPORT_CONTEXT = context
-        try:
-            yield context
-        finally:
-            # Drain the hierarchy + parametrize stacks INSIDE the
-            # ReportContext's ``with`` block, so the final ``__exit__``
-            # update calls for those parent steps are written to the log
-            # file BEFORE the import worker drains. Without this, the
-            # worker exits with a partial backlog and the parent steps
-            # are stuck IN_PROGRESS in the Sift report.
-            try:
-                _drain_parametrize_stack()
-            finally:
-                _drain_hierarchy_stack()
-
-
-# Placeholder credentials used in --sift-offline mode when env/ini values
-# are missing. Offline mode never makes network calls, so the values are
-# only syntactically required by SiftConnectionConfig.
-_OFFLINE_DEFAULTS = {
-    "SIFT_API_KEY": "offline",
-    "SIFT_GRPC_URI": "offline.invalid:0",
-    "SIFT_REST_URI": "http://offline.invalid",
-}
-
-
-def _build_disabled_client() -> SiftClient:
-    """Construct a SiftClient for ``--sift-disabled`` mode.
-
-    Tagged with ``_simulate=True`` so test-results writes short-circuit through
-    the existing low-level simulate path without contacting Sift. The URLs are
-    syntactically valid but unreachable; nothing dials them.
-    """
-    client = SiftClient(
-        connection_config=SiftConnectionConfig(
-            api_key="disabled",
-            grpc_url="disabled.invalid:0",
-            rest_url="http://disabled.invalid",
-        )
-    )
-    client._simulate = True
-    return client
+
+# ---------------------------------------------------------------------------
+# Fixtures.
+# ---------------------------------------------------------------------------
 
 
 @pytest.fixture(scope="session")
@@ -1579,7 +128,7 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
     (``SIFT_GRPC_URI``, ``SIFT_REST_URI``) additionally fall back to the
     ``sift_grpc_uri`` / ``sift_rest_uri`` ini keys, since they are stable
     per-org values that are safe to commit. ``SIFT_API_KEY`` is intentionally
-    env-only — use ``pytest-dotenv`` (already a project dependency) to load
+    env-only; use ``pytest-dotenv`` (already a project dependency) to load
     it from a ``.env`` file kept out of version control.
 
     Projects that need custom construction (TLS toggles, custom timeouts,
@@ -1594,30 +143,30 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
     mode the credential resolution is skipped entirely and placeholders are
     always used.
     """
-    if _is_disabled(pytestconfig):
-        return _build_disabled_client()
+    if is_disabled(pytestconfig):
+        return build_disabled_client()
     resolved = {
-        "SIFT_API_KEY": _API_KEY.resolve(pytestconfig),
-        "SIFT_GRPC_URI": _GRPC_URI.resolve(pytestconfig),
-        "SIFT_REST_URI": _REST_URI.resolve(pytestconfig),
+        "SIFT_API_KEY": API_KEY_OPTION.resolve(pytestconfig),
+        "SIFT_GRPC_URI": GRPC_URI_OPTION.resolve(pytestconfig),
+        "SIFT_REST_URI": REST_URI_OPTION.resolve(pytestconfig),
     }
     missing = [env for env, value in resolved.items() if not value]
-    if missing and not _is_offline(pytestconfig):
+    if missing and not is_offline(pytestconfig):
         raise pytest.UsageError(
             "Sift credentials missing: "
             + ", ".join(missing)
-            + ". Set the environment variable(s) — pytest-dotenv loads them "
-            "from a `.env` file automatically — or set the URIs under "
+            + ". Set the environment variable(s) (pytest-dotenv loads them "
+            "from a `.env` file automatically), or set the URIs under "
             "`sift_grpc_uri` / `sift_rest_uri` in `[tool.pytest.ini_options]` "
             "in pyproject.toml, or override the sift_client fixture in your "
             "conftest.py, or pass --sift-offline / --sift-disabled to run "
             "without contacting Sift."
         )
     for env in missing:
-        resolved[env] = _OFFLINE_DEFAULTS[env]
+        resolved[env] = OFFLINE_DEFAULTS[env]
     # Web-app origin for the report link: the SIFT_APP_URL env var wins, then the
     # sift_app_url ini key, else host-based derivation in SiftClient.app_url.
-    app_url = _APP_URL.resolve(pytestconfig)
+    app_url = APP_URL_OPTION.resolve(pytestconfig)
     return SiftClient(
         connection_config=SiftConnectionConfig(
             api_key=resolved["SIFT_API_KEY"] or "",
@@ -1628,6 +177,41 @@ def sift_client(pytestconfig: pytest.Config) -> SiftClient:
     )
 
 
+@pytest.fixture(scope="session")
+def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRequest) -> bool:
+    """Verify the ``SiftClient`` can reach Sift via ``/ping``.
+
+    Consulted at session start by ``report_context`` in online mode. A failed
+    ping aborts the session via ``pytest.exit``. Override this fixture in your
+    conftest to use a
+    different reachability signal (e.g. a cached auth token) for environments
+    where pinging is the wrong check. Returns ``False`` in ``--sift-disabled``
+    mode without constructing a client.
+    """
+    if is_disabled(pytestconfig):
+        return False
+    sift_client = request.getfixturevalue("sift_client")
+    sift_client.ping.ping()
+    return True
+
+
+def _set_report_context(
+    contexts: Generator[ReportContext, None, None],
+) -> Generator[ReportContext, None, None]:
+    """Publish each yielded ReportContext to the module-level ``REPORT_CONTEXT``.
+
+    ``report_context_impl`` stays pure: it builds and yields the context.
+    Ownership of the reassignable global lives here so the terminal-summary and
+    makereport hooks (which read ``REPORT_CONTEXT``) see it. The global is set
+    after the context opens and before tests run, then the impl's ``finally``
+    still drains the step stacks before the context exits.
+    """
+    global REPORT_CONTEXT
+    for context in contexts:
+        REPORT_CONTEXT = context
+        yield context
+
+
 @pytest.fixture(scope="session")
 def report_context(
     request: pytest.FixtureRequest, pytestconfig: pytest.Config
@@ -1661,13 +245,13 @@ def report_context(
     The log-file destination is controlled by
     ``--sift-log-file``; defaults to a temp file when unset.
     """
-    if _is_disabled(pytestconfig):
-        yield from _report_context_impl(
-            _build_disabled_client(), request, pytestconfig=pytestconfig
+    if is_disabled(pytestconfig):
+        yield from _set_report_context(
+            report_context_impl(build_disabled_client(), request, pytestconfig=pytestconfig)
         )
         return
     sift_client = request.getfixturevalue("sift_client")
-    if not _is_offline(pytestconfig):
+    if not is_offline(pytestconfig):
         try:
             request.getfixturevalue("client_has_connection")
         except pytest.UsageError:
@@ -1681,39 +265,32 @@ def report_context(
                 "--sift-disabled to skip Sift entirely.",
                 returncode=4,
             )
-    yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
-
-
-def _step_impl(
-    report_context: ReportContext, request: pytest.FixtureRequest
-) -> Generator[NewStep, None, None]:
-    node = request.node
-    # Items get a parametrize path stashed in ``pytest_collection_modifyitems``;
-    # modules/other nodes fall back to their node name. The leaf frame
-    # (``path[-1]``) is the test-specific display name — parents are opened
-    # by ``_parametrize_parents``. When parametrize-nesting is disabled, fall
-    # back to the bracket-mangled pytest name (e.g. ``test_a[1]``) so the leaf
-    # remains uniquely identifiable.
-    if _PARAMETRIZE_NESTING.resolve(request.config):
-        path = node.stash.get(_PARAMETRIZE_PATH_KEY, ())
-        name = path[-1] if path else str(node.name)
-    else:
-        name = str(node.name)
-    # ``node.obj`` may not exist (e.g., ``pytest.DoctestItem``) or may raise
-    # when accessed — fall back to no description in those cases rather than
-    # erroring out a perfectly valid test. ``getattr``'s default only
-    # suppresses ``AttributeError``; the try/except catches everything else
-    # (RuntimeError from a misbehaving ``__doc__`` descriptor, etc.).
-    try:
-        existing_docstring = getattr(getattr(node, "obj", None), "__doc__", None) or None
-    except Exception:
-        existing_docstring = None
-    with report_context.new_step(
-        name=name, description=existing_docstring, assertion_as_fail_not_error=False
-    ) as new_step:
-        node._sift_step = new_step
-        yield new_step
-        _resolve_initial_status(new_step, node)
+    yield from _set_report_context(
+        report_context_impl(sift_client, request, pytestconfig=pytestconfig)
+    )
+
+
+@pytest.fixture(autouse=True)
+def step(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+    _parametrize_parents: None,
+) -> Generator[NewStep | None, None, None]:
+    """Create an outer step for the function when the Sift gate is on.
+
+    Resolves the gate via ``gate_enabled``: the ``sift_exclude`` marker forces off,
+    ``sift_include`` forces on, otherwise the ``sift_autouse`` ini default applies.
+    When on, requests the session ``report_context`` lazily; the first gated test
+    in the session triggers its creation, subsequent gated tests reuse it. In
+    ``--sift-disabled`` mode the report context is backed by a
+    ``SiftClient(_simulate=True)`` placeholder, so every write returns a
+    synthesized response without contacting Sift.
+    """
+    if not gate_enabled(request.node, pytestconfig):
+        yield None
+        return
+    rc = request.getfixturevalue("report_context")
+    yield from step_impl(rc, request)
 
 
 @pytest.fixture(autouse=True)
@@ -1723,75 +300,13 @@ def _hierarchy_parents(
 ) -> None:
     """Open/close hierarchy parent steps (packages, modules, classes) for the current item.
 
-    Same diff-stack pattern as ``_parametrize_parents`` but operates on
-    ``_HIERARCHY_KEY``. The chain is built outer-to-inner from the item's
-    collection-tree ancestors; which node types are included is decided at
-    build time by ``sift_package_step`` / ``sift_module_step`` /
-    ``sift_class_step``. When the chain changes (pop or push), the parametrize
-    stack is drained first since parametrize parents nest INSIDE these.
-
-    Gated off when the item is excluded (avoids eager ``report_context`` setup).
+    Gated off when the item is excluded (avoids eager ``report_context`` setup);
+    otherwise delegates to ``reconcile_hierarchy``, which diffs the item's
+    ancestor chain against the open stack and opens/closes parents to match.
     """
-    default = bool(_AUTOUSE.resolve(pytestconfig))
-    if not _sift_enabled_for(request.node, default):
-        return None
-    # Fall back to computing the chain on-demand for items that bypassed
-    # ``pytest_collection_modifyitems`` (e.g., dynamically inserted by another
-    # plugin's later hook). Defaulting to ``()`` would incorrectly drain the
-    # entire open hierarchy stack for those items.
-    desired = request.node.stash.get(_HIERARCHY_KEY, _STASH_MISSING)
-    if desired is _STASH_MISSING:
-        desired = _build_hierarchy_chain(request.node, pytestconfig)
-    common = 0
-    # Compare on identity (nodeid) — same-named ancestors at different paths
-    # MUST stay distinct.
-    while (
-        common < len(_HIERARCHY_STACK)
-        and common < len(desired)
-        and _HIERARCHY_STACK[common][0] == desired[common][0]
-    ):
-        common += 1
-    # Any change to the hierarchy chain orphans parametrize parents from the
-    # previous test — drain them before mutating the hierarchy stack so
-    # ReportContext's top-of-stack invariant holds. Strict mode: a per-frame
-    # ``__exit__`` failure here signals a real upstream drift between the
-    # plugin stacks and ReportContext; raise it as a test error instead of a
-    # silenceable warning.
-    if common < len(_HIERARCHY_STACK) or common < len(desired):
-        _drain_parametrize_stack(swallow_errors=False)
-    # Symmetric per-frame guard for the hierarchy pop so one bad ``__exit__``
-    # doesn't leave _HIERARCHY_STACK partially drained for every subsequent test.
-    while len(_HIERARCHY_STACK) > common:
-        _identity, name, ns = _HIERARCHY_STACK.pop()
-        _close_frame(name, ns)
-    if not desired[common:]:
-        return None
-    # Fetch ``report_context`` lazily — but only when there's at least one
-    # rendered frame to push. Pure diff-only frames (e.g. a Package frame when
-    # ``sift_package_step=false``) just update _HIERARCHY_STACK with ns=None.
-    rc = None
-    # Roll back any partial push so a mid-loop exception doesn't leave half
-    # the chain orphaned on the stack. Per-frame guard inside the rollback so
-    # a failing ``__exit__`` doesn't shadow the original exception or leak
-    # the remaining opened frames.
-    opened: list[tuple[str, str, Any]] = []
-    try:
-        for identity, name, doc, rendered in desired[common:]:
-            if rendered:
-                if rc is None:
-                    rc = request.getfixturevalue("report_context")
-                ns = rc.new_step(name=name, description=doc, assertion_as_fail_not_error=False)
-                ns.__enter__()
-                opened.append((identity, name, ns))
-            else:
-                opened.append((identity, name, None))
-    except BaseException:
-        while opened:
-            _identity, name, ns = opened.pop()
-            _close_frame(name, ns)
-        raise
-    _HIERARCHY_STACK.extend(opened)
-    return None
+    if not gate_enabled(request.node, pytestconfig):
+        return
+    reconcile_hierarchy(request, pytestconfig)
 
 
 @pytest.fixture(autouse=True)
@@ -1802,97 +317,160 @@ def _parametrize_parents(
 ) -> None:
     """Open/close shared parametrize parent steps for the current item.
 
-    Diffs the item's desired parametrize path against the open stack: pops the
-    stale tail, then opens new parents (everything except the innermost frame —
-    the ``step`` fixture creates that as the leaf). Parents persist across
-    sibling items so a tree like ``test_x[a=1]`` / ``test_x[a=2]`` shares one
-    ``test_x`` container.
-
-    Gated off when the current item is excluded so that excluded items don't
-    eagerly request ``report_context`` (which would defeat its lazy creation),
-    or when ``sift_parametrize_nesting=false``. Parents persist until the
-    diff against a subsequent test's chain pops them, or until
-    ``pytest_sessionfinish`` drains anything left at session end.
+    Ordered after ``_hierarchy_parents`` so parametrize parents nest inside the
+    hierarchy ones. Gated off when the item is excluded (so excluded items don't
+    eagerly request ``report_context``); otherwise delegates to
+    ``reconcile_parametrize``, which also no-ops when
+    ``sift_parametrize_nesting=false``. Parents persist until a later test's
+    chain pops them, or until ``pytest_sessionfinish`` drains the rest.
     """
-    default = bool(_AUTOUSE.resolve(pytestconfig))
-    if not _sift_enabled_for(request.node, default):
-        return None
-    if not _PARAMETRIZE_NESTING.resolve(pytestconfig):
-        return None
-    # Fall back to on-demand computation for dynamically-inserted items;
-    # see _hierarchy_parents for the same rationale.
-    desired = request.node.stash.get(_PARAMETRIZE_PATH_KEY, _STASH_MISSING)
-    if desired is _STASH_MISSING:
-        desired = _build_parametrize_path(request.node)
-    parents = desired[:-1]
-    common = 0
-    while (
-        common < len(_PARAMETRIZE_STACK)
-        and common < len(parents)
-        and _PARAMETRIZE_STACK[common][0] == parents[common]
+    if not gate_enabled(request.node, pytestconfig):
+        return
+    reconcile_parametrize(request, pytestconfig)
+
+
+# ---------------------------------------------------------------------------
+# Hooks (in lifecycle fire order).
+# ---------------------------------------------------------------------------
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Register every CLI flag and pytest ini key declared in ``PLUGIN_OPTIONS``."""
+    register_options(parser)
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register the Sift gate markers and warn on unknown ``SIFT_*`` settings."""
+    config.addinivalue_line(
+        "markers",
+        "sift_include: force the Sift autouse fixtures to activate for this test "
+        "regardless of the `sift_autouse` ini default.",
+    )
+    config.addinivalue_line(
+        "markers",
+        "sift_exclude: force the Sift autouse fixtures to skip this test "
+        "regardless of the `sift_autouse` ini default.",
+    )
+    # Surface typos in env vars and [tool.sift...] keys at session start so a
+    # silent no-op (env var that doesn't match anything, table key the loader
+    # ignores) becomes visible. The registry is the source of truth for what's
+    # known.
+    warn_on_unknown_env_vars()
+    warn_on_unknown_toml_keys(config)
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
+    """Stash each item's hierarchy chain + parametrize path and cluster siblings.
+
+    Sorts by ``(file_path, hierarchy_chain, parametrize_path)`` so sibling
+    items under a shared parent (package, module, class, or parametrize axis)
+    stay contiguous; otherwise a free function sorting between two class
+    methods would tear down + re-open the class step, producing duplicate
+    parents in the report tree.
+    """
+    for item in items:
+        item.stash[hierarchy_key] = build_hierarchy_chain(item, config)
+        item.stash[parametrize_path_key] = build_parametrize_path(item)
+    # Use ``.get(...)`` defensively: a third-party hook may inject items after
+    # our stashing loop runs, and we'd rather sort them at the tail than
+    # KeyError out of collection.
+    items.sort(
+        key=lambda i: (
+            str(i.path),
+            tuple(identity for identity, _, _, _ in i.stash.get(hierarchy_key, ())),
+            i.stash.get(parametrize_path_key, ()),
+        )
+    )
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
+    """Capture per-phase reports and finalize step status after teardown.
+
+    Stashes both ``rep_<when>`` (the ``CallInfo``, kept for pytest plugins that
+    expect that conventional attribute) and ``_sift_phase_<when>`` (a
+    ``SimpleNamespace(call, report)`` used by ``resolve_initial_status``). The
+    collection-time skip path is strictly gated on ``_sift_step`` being unset
+    so it does not duplicate steps the fixture already created.
+    """
+    outcome = yield
+    report = outcome.get_result()
+    setattr(item, "rep_" + report.when, call)
+    setattr(item, "_sift_phase_" + report.when, SimpleNamespace(call=call, report=report))
+
+    # Collection-time skip (``@pytest.mark.skip`` / ``skipif``): the autouse
+    # ``step`` fixture never runs, so the hook is the only place that can
+    # record a step. Presence of ``_sift_step`` is the "fixture ran" signal.
+    if (
+        REPORT_CONTEXT
+        and report.when == "setup"
+        and report.outcome == "skipped"
+        and getattr(item, "_sift_step", None) is None
     ):
-        common += 1
-    # Per-frame guard so one bad ``__exit__`` doesn't leave _PARAMETRIZE_STACK
-    # partially drained for every subsequent test.
-    while len(_PARAMETRIZE_STACK) > common:
-        name, ns = _PARAMETRIZE_STACK.pop()
-        _close_frame(name, ns)
-    if not parents[common:]:
-        return None
-    rc = request.getfixturevalue("report_context")
-    opened: list[tuple[str, Any]] = []
+        with REPORT_CONTEXT.new_step(name=item.name) as inline_step:
+            inline_step.current_step.update({"status": TestStatus.SKIPPED})
+
+    if report.when == "teardown":
+        finalize_after_teardown(item, report)
+
+
+def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
+    """Drain any parent steps still open at session end (innermost first).
+
+    Wrapped so a failure in the inner drain does not prevent the outer one
+    from running. With ``module_substep`` removed, parents stay open across
+    tests until a later test's chain pops them; this hook drains whatever
+    is still open when the session ends.
+    """
     try:
-        for display in parents[common:]:
-            ns = rc.new_step(name=display, assertion_as_fail_not_error=False)
-            ns.__enter__()
-            opened.append((display, ns))
-    except BaseException:
-        while opened:
-            name, ns = opened.pop()
-            _close_frame(name, ns)
-        raise
-    _PARAMETRIZE_STACK.extend(opened)
-    return None
+        drain_parametrize_stack()
+    finally:
+        drain_hierarchy_stack()
 
 
-@pytest.fixture(autouse=True)
-def step(
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-    _parametrize_parents: None,
-) -> Generator[NewStep | None, None, None]:
-    """Create an outer step for the function when the Sift gate is on.
+def pytest_report_header(config: pytest.Config) -> str | None:
+    """Emit a session-start header with the SDK version and active mode.
 
-    Resolves the gate via `_sift_enabled_for(request.node, ini_default)`:
-    `sift_exclude` marker forces off, `sift_include` forces on, otherwise the
-    `sift_autouse` ini default applies. When on, requests the
-    session `report_context` lazily — the first gated test in the session
-    triggers its creation, subsequent gated tests reuse it. In
-    ``--sift-disabled`` mode the report context is backed by a
-    ``SiftClient(_simulate=True)`` placeholder, so every write returns a
-    synthesized response without contacting Sift.
+    Suppressed under ``-q`` (negative verbosity), matching how pytest hides its
+    own platform/plugin header.
     """
-    default = bool(_AUTOUSE.resolve(pytestconfig))
-    if not _sift_enabled_for(request.node, default):
-        yield None
-        return
-    rc = request.getfixturevalue("report_context")
-    yield from _step_impl(rc, request)
+    if config.get_verbosity() < 0:
+        return None
+    return f"Sift: sift-stack-py {sdk_version()} — {mode_label(config)} mode"
 
 
-@pytest.fixture(scope="session")
-def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRequest) -> bool:
-    """Verify the ``SiftClient`` can reach Sift via ``/ping``.
+def pytest_terminal_summary(terminalreporter: Any, exitstatus: int, config: pytest.Config) -> None:
+    """Emit a session-end Sift report summary, adapting per mode.
 
-    Consulted at session start by ``report_context`` in online mode. A failed
-    ping aborts the session via ``pytest.exit``. Override this fixture in your
-    conftest to use a
-    different reachability signal (e.g. a cached auth token) for environments
-    where pinging is the wrong check. Returns ``False`` in ``--sift-disabled``
-    mode without constructing a client.
+    The printed panel is suppressed under ``-q``, but programmatic side effects
+    (stashing the report ref for ``conftest.py``, ``--sift-open-report``) still run so
+    other plugins and CI steps can consume the result. The panel itself is
+    rendered by ``write_report_summary``; this hook handles the side effects.
     """
-    if _is_disabled(pytestconfig):
-        return False
-    sift_client = request.getfixturevalue("sift_client")
-    sift_client.ping.ping()
-    return True
+    quiet = config.get_verbosity() < 0
+
+    if is_disabled(config):
+        if not quiet:
+            write_disabled_summary(terminalreporter)
+        return
+
+    context = REPORT_CONTEXT
+    if context is None:
+        # No gated test ran, so no report context was created. Nothing to show.
+        return
+
+    offline = is_offline(config)
+    # Resolve the link first so stashing and --sift-open-report run even under -q;
+    # programmatic consumers don't care about verbosity.
+    report_id, report_url = resolve_report_link(context, offline)
+    if report_id:
+        config.stash[SIFT_REPORT_ID_STASH_KEY] = report_id
+    if report_url is not None:
+        config.stash[SIFT_REPORT_URL_STASH_KEY] = report_url
+        if OPEN_OPTION.resolve(config):
+            maybe_open_report(report_url)
+
+    if quiet:
+        return
+
+    write_report_summary(terminalreporter, context, config, report_id, report_url, offline)

From 56f711df815adbe34f3cb3bdb10494e4f3b62bb8 Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Mon, 8 Jun 2026 09:04:29 -0700
Subject: [PATCH 18/19] Python(feat): broaden step failure check to cover
 substeps (#619)

---
 python/CHANGELOG.md                           |  2 +-
 .../docs/examples/pytest_plugin_quickstart.md |  6 +-
 .../guides/pytest_plugin/configuration.md     |  2 +-
 python/docs/guides/pytest_plugin/index.md     |  4 +-
 .../guides/pytest_plugin/report_structure.md  |  4 +-
 python/examples/pytest_plugin/README.md       |  8 +--
 .../tests/with_sift/test_with_sift_demo.py    | 33 +++++++----
 .../_tests/pytest_plugin/test_pass_fail.py    | 25 ++++++--
 python/lib/sift_client/pytest_plugin.py       | 10 ++--
 .../util/test_results/context_manager.py      | 58 ++++++++++++-------
 10 files changed, 97 insertions(+), 55 deletions(-)

diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index 2ae4b6a88..01cd82631 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -13,7 +13,7 @@ Highlights:
 - **Hierarchical report tree.** Packages, modules, classes, and parametrize axes above a test each become a parent step, so the report mirrors your test layout. Arbitrary substeps can be opened inside a test.
 - **Three running modes.** Online (default) pings Sift at session start and streams create/update calls during the run; offline records to a JSONL log for later replay; disabled evaluates bounds locally without contacting Sift. Select with `--sift-offline` or `--sift-disabled`.
 - **Graceful connection handling.** Online mode aborts at session start if Sift is unreachable or credentials are invalid, so a misconfigured job fails fast. If the connection drops mid-run, tests keep running and the log keeps writing locally; remaining entries upload afterward via the import command the plugin prints on exit.
-- **Pass/fail mapping.** Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard exit) maps to a `TestStatus` and propagates to parent steps and the report. `step.measure(...)` returns a pass/fail boolean without raising, so all measurements land in the report even when one fails; `step.fail_if_measurements_failed()` fails the test at the end without adding assertion noise to `error_info`.
+- **Pass/fail mapping.** Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard exit) maps to a `TestStatus` and propagates to parent steps and the report. `step.measure(...)` returns a pass/fail boolean without raising, so all measurements land in the report even when one fails; `step.pytest_fail_if_step_failed()` fails the test at the end if the step or any descendant failed (out-of-bounds measurements, failed substeps, `report_outcome` failures) without adding assertion noise to `error_info`.
 - **Assertion messages as error info.** Assertion failure messages are reported as the step's error info.
 - **Git metadata.** Repo, branch, and commit are captured on the report automatically.
 - **Terminal output.** The plugin prints a session header with the SDK version and active mode, and an end-of-run `Sift report` panel showing the test case, outcome, step and measurement breakdowns (color-coded), test system/operator, plus a link to the report (online), the saved log and upload command (offline), or a disabled note. Both suppress under `-q`. `SiftClient.app_url` exposes the web-app origin; set `sift_app_url` for on-prem or custom deployments. `--sift-open-report` opens the report in a browser at session end.
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
index 30012f9b4..cf19c11fb 100644
--- a/python/docs/examples/pytest_plugin_quickstart.md
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -140,7 +140,7 @@ TestReport (FAILED, since failures propagate up from leaves)
         │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
         ├── test_measure_series                                      PASSED
         ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
-        ├── test_fail_if_measurements_failed_at_end                               FAILED  (pytest FAILED)
+        ├── test_pytest_fail_if_step_failed_at_end                   FAILED  (pytest FAILED)
         ├── test_report_level_metadata                               PASSED
         └── TestClassStep
             ├── test_parametrize
@@ -162,8 +162,8 @@ The `with_sift` module shows two patterns for handling measurement results:
 `test_failed_measurement_marks_sift_step_failed` lets the test keep passing
 in pytest while the Sift step is `FAILED` (useful when measurements are
 diagnostic data you want to collect regardless of outcome); and
-`test_fail_if_measurements_failed_at_end` takes every measurement first and
-then calls `step.fail_if_measurements_failed()` once at the end, so every
+`test_pytest_fail_if_step_failed_at_end` takes every measurement first and
+then calls `step.pytest_fail_if_step_failed()` once at the end, so every
 measurement still lands in the report even when one fails. The end-of-test
 call is the recommended pattern: it fails via `pytest.fail` (no assertion
 noise in `error_info`), and unlike asserting on an individual
diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
index a05897cd4..47427055b 100644
--- a/python/docs/guides/pytest_plugin/configuration.md
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -117,7 +117,7 @@ def sift_client() -> SiftClient:
 | Name | Kind | Scope | Purpose |
 |---|---|---|---|
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
-| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `fail_if_measurements_failed`, and `current_step`. |
+| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `pytest_fail_if_step_failed`, and `current_step`. |
 | `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently; see [settings reference](#settings-reference). |
 | `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
 | `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
diff --git a/python/docs/guides/pytest_plugin/index.md b/python/docs/guides/pytest_plugin/index.md
index a649204a4..93879692c 100644
--- a/python/docs/guides/pytest_plugin/index.md
+++ b/python/docs/guides/pytest_plugin/index.md
@@ -45,7 +45,7 @@ def test_battery_voltage(step):
         bounds={"min": 4.8, "max": 5.2},
         unit="V",
     )
-    step.fail_if_measurements_failed()
+    step.pytest_fail_if_step_failed()
 ```
 
 Run it:
@@ -59,7 +59,7 @@ A `TestReport` shows up in Sift once the session finishes.
 !!! tip "Fail at the end, not per measurement"
     `step.measure(...)` returns a pass/fail boolean and never raises, so a
     failing measurement marks the step failed without aborting the test. Take
-    every measurement first, then call `step.fail_if_measurements_failed()` once
+    every measurement first, then call `step.pytest_fail_if_step_failed()` once
     at the end, so every measurement still lands in the report even when one
     fails. It fails the test via `pytest.fail` (no assertion noise in
     `error_info`), and unlike asserting on an individual `step.measure(...)` call
diff --git a/python/docs/guides/pytest_plugin/report_structure.md b/python/docs/guides/pytest_plugin/report_structure.md
index 811fd7cf0..dd0d8ed54 100644
--- a/python/docs/guides/pytest_plugin/report_structure.md
+++ b/python/docs/guides/pytest_plugin/report_structure.md
@@ -29,7 +29,7 @@ def test_measure_a_single_value(step):
     )
     # An out-of-bounds measurement already marks the step FAILED. Call this at
     # the end to also fail pytest, without an assertion message in error_info.
-    step.fail_if_measurements_failed()
+    step.pytest_fail_if_step_failed()
 
 
 def test_measure_strings_and_booleans(step):
@@ -354,7 +354,7 @@ def test_only_outliers_recorded(step):
     )
     # Returns False because 99.9 is out of bounds. The step is already
     # marked failed; call this only if you also want pytest to fail.
-    step.fail_if_measurements_failed()
+    step.pytest_fail_if_step_failed()
 ```
 
 !!! note "`measure_all` requires at least one bound"
diff --git a/python/examples/pytest_plugin/README.md b/python/examples/pytest_plugin/README.md
index 0a94b7f97..fcc60fd5f 100644
--- a/python/examples/pytest_plugin/README.md
+++ b/python/examples/pytest_plugin/README.md
@@ -76,7 +76,7 @@ TestReport (FAILED, since failures propagate up from leaves)
         │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
         ├── test_measure_series                                      PASSED
         ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
-        ├── test_fail_if_measurements_failed_at_end                               FAILED  (pytest FAILED)
+        ├── test_pytest_fail_if_step_failed_at_end                   FAILED  (pytest FAILED)
         ├── test_report_level_metadata                               PASSED
         └── TestClassStep
             ├── test_parametrize
@@ -98,8 +98,8 @@ The `with_sift` module shows two patterns for handling measurement results:
 `test_failed_measurement_marks_sift_step_failed` lets the test keep passing
 in pytest while the Sift step is `FAILED` (useful when measurements are
 diagnostic data you want to collect regardless of outcome); and
-`test_fail_if_measurements_failed_at_end` takes every measurement first and
-then calls `step.fail_if_measurements_failed()` once at the end, so every
+`test_pytest_fail_if_step_failed_at_end` takes every measurement first and
+then calls `step.pytest_fail_if_step_failed()` once at the end, so every
 measurement still lands in the report even when one fails. The end-of-test
 call is the recommended pattern: it fails via `pytest.fail` (no assertion
 noise in `error_info`), and unlike asserting on an individual
@@ -117,5 +117,5 @@ Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
 | `conftest.py` | Plugin registration via `pytest_plugins` (a single line) |
 | `pyproject.toml` | Pytest nesting/git-metadata knobs at their defaults; report `name`, `test_case`, and `metadata` under `[tool.sift.pytest.report]` |
 | `tests/pytest_only/test_pytest_only_demo.py` | Plain pytest tests with no Sift APIs. The plugin captures pass/fail automatically; covers functions, fixtures, parametrize, classes, plus one each of `AssertionError` (FAILED), `pytest.skip` (SKIPPED), and a raised `ValueError` (ERROR) |
-| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `step.fail_if_measurements_failed()` end-of-test call that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
+| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `step.pytest_fail_if_step_failed()` end-of-test call that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
 | `tests/{pytest_only,with_sift}/__init__.py` | Each Python package (directory with `__init__.py`) becomes a parent step in the report tree |
diff --git a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
index 7cbe8f8ce..c25c605c5 100644
--- a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
+++ b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
@@ -40,6 +40,11 @@ def test_substeps(step) -> None:
     Metadata can be attached at the step level by passing ``metadata=...`` to
     ``substep``; the same keyword is accepted by ``report_context.new_step``
     and propagates to the resulting ``TestStep``.
+
+    A failed substep marks this step FAILED in the report without raising, so
+    the end-of-test ``step.pytest_fail_if_step_failed()`` call is needed here
+    too: it folds substep failures (not just direct measurements) into the
+    pytest outcome.
     """
     with step.substep(name="phase_1", metadata={"phase_index": 1}) as s1:
         s1.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
@@ -48,6 +53,9 @@ def test_substeps(step) -> None:
         with s2.substep(name="phase_2a") as s2a:
             s2a.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
 
+    # Fails pytest if any substep above failed; no-op when they all passed.
+    step.pytest_fail_if_step_failed()
+
 
 def test_measure_series(step) -> None:
     """``measure_avg`` and ``measure_all`` are the series variants of ``measure``.
@@ -94,25 +102,28 @@ def test_failed_measurement_marks_sift_step_failed(step) -> None:
     )
 
 
-def test_fail_if_measurements_failed_at_end(step) -> None:
-    """Recommended pattern: take every measurement first, then call
-    ``step.fail_if_measurements_failed()`` once at the end.
+def test_pytest_fail_if_step_failed_at_end(step) -> None:
+    """Recommended pattern: do every measurement and substep first, then call
+    ``step.pytest_fail_if_step_failed()`` once at the end.
 
     Asserting on individual ``step.measure(...)`` calls raises
     ``AssertionError`` on the first failure, so any measurements after the
     failing one never run and never land in the Sift report. The end-of-test
-    call is strictly better for diagnostic completeness: every measurement is
-    recorded, including the failures, and the aggregate result is then folded
-    into the pytest outcome. It fails via ``pytest.fail`` rather than an
-    assertion, so the failed step carries no assertion noise in ``error_info``.
-
-    The ``b`` measurement below is deliberately out of bounds. ``c`` still
-    runs and is recorded; only the final call fails the test.
+    call is strictly better for diagnostic completeness: every measurement and
+    substep is recorded, including the failures, and the aggregate result is
+    then folded into the pytest outcome. It fails via ``pytest.fail`` rather
+    than an assertion, so the failed step carries no assertion noise in
+    ``error_info``.
+
+    It fails on any failure the report would record: out-of-bounds
+    measurements, failed substeps, and ``report_outcome`` failures. The ``b``
+    measurement below is deliberately out of bounds. ``c`` still runs and is
+    recorded; only the final call fails the test.
     """
     step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
     step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})  # out of bounds
     step.measure(name="c", value=1.5, bounds={"min": 0.0, "max": 2.0})  # still recorded
-    step.fail_if_measurements_failed()
+    step.pytest_fail_if_step_failed()
 
 
 def test_report_level_metadata(step, report_context) -> None:
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index d5f9674ce..52aa6f23c 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -137,8 +137,8 @@ def test_x():
     assert capture.final_status("test_x") == TestStatus.FAILED
 
 
-def test_fail_if_measurements_failed_fails_without_error_info(inner):
-    # An out-of-bounds measurement plus step.fail_if_measurements_failed()
+def test_pytest_fail_if_step_failed_fails_without_error_info(inner):
+    # An out-of-bounds measurement plus step.pytest_fail_if_step_failed()
     # fails the test via pytest.fail, so the step is FAILED with no assertion
     # message in error_info (the reason this helper exists over `assert`).
     _run(
@@ -146,20 +146,35 @@ def test_fail_if_measurements_failed_fails_without_error_info(inner):
         """
         def test_x(step):
             step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})
-            step.fail_if_measurements_failed()
+            step.pytest_fail_if_step_failed()
         """,
     )
     assert capture.final_status("test_x") == TestStatus.FAILED
     assert capture.final_error_message("test_x") is None
 
 
-def test_fail_if_measurements_failed_passes_when_in_bounds(inner):
+def test_pytest_fail_if_step_failed_fails_on_failed_substep(inner):
+    # A failed substep (here via report_outcome) leaves no out-of-bounds
+    # measurement on the step, but the report still marks the step FAILED.
+    # pytest_fail_if_step_failed must fail the test so the verdict matches.
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.report_outcome("check", False, "deliberately failing")
+            step.pytest_fail_if_step_failed()
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_pytest_fail_if_step_failed_passes_when_in_bounds(inner):
     _run(
         inner,
         """
         def test_x(step):
             step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
-            step.fail_if_measurements_failed()
+            step.pytest_fail_if_step_failed()
         """,
     )
     assert capture.final_status("test_x") == TestStatus.PASSED
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index 43f689894..a381d78cd 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 from types import SimpleNamespace
-from typing import TYPE_CHECKING, Any, Generator
+from typing import Any, Generator
 
 import pytest
 
@@ -61,15 +61,15 @@
 )
 from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import TestStatus
-
-if TYPE_CHECKING:
-    from sift_client.util.test_results import ReportContext
-    from sift_client.util.test_results.context_manager import NewStep
+from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import NewStep
 
 __all__ = [
     "REPORT_CONTEXT",
     "SIFT_REPORT_ID_STASH_KEY",
     "SIFT_REPORT_URL_STASH_KEY",
+    "NewStep",
+    "ReportContext",
     "SiftPytestPluginWarning",
     "SiftPytestStepDrainError",
     "SiftPytestStepDrainWarning",
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 497404c45..5cd2c6729 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -521,10 +521,10 @@ def __init__(
         # Per-step measurement-failure count for ``measurements_passed``.
         # Tracks only direct ``measure*`` calls on this NewStep instance;
         # substep / ``report_outcome`` failures are intentionally not folded
-        # in here (see ``measurements_passed`` vs ``passed``).
+        # in here. ``pytest_fail_if_step_failed`` covers the broader case.
         self._failed_measurement_count = 0
         # Out-of-bounds measurements recorded on this step, retained so
-        # ``fail_if_measurements_failed`` can name them in the failure message.
+        # ``pytest_fail_if_step_failed`` can name them in the failure message.
         self._failed_measurements: list[TestMeasurement] = []
 
     def __enter__(self):
@@ -539,34 +539,50 @@ def measurements_passed(self) -> bool:
         """True if every measurement recorded directly on this step has passed.
 
         Counts only ``step.measure``, ``step.measure_avg``, and
-        ``step.measure_all`` calls on this ``NewStep`` instance. Pair it with
-        ``fail_if_measurements_failed()`` at the end of a test to fail pytest on
-        any out-of-bounds measurement without short-circuiting on the first
-        failure (asserting on individual ``measure(...)`` return values skips
-        every measurement after the failing one).
+        ``step.measure_all`` calls on this ``NewStep`` instance; substep and
+        ``report_outcome`` failures are not folded in. For the end-of-test
+        failure that mirrors the report, use ``pytest_fail_if_step_failed()``,
+        which also covers failed substeps.
         """
         return self._failed_measurement_count == 0
 
-    def fail_if_measurements_failed(self, message: str = "measurements out of bounds") -> None:
-        """Fail the pytest test if any measurement on this step was out of bounds.
+    def pytest_fail_if_step_failed(self, message: str = "step failed") -> None:
+        """Fail the running pytest test if this step or any descendant failed.
 
-        Use instead of ``assert step.measurements_passed``: it fails via
-        ``pytest.fail`` so the step resolves to FAILED without attaching an
-        assertion message to ``error_info``. No-op when every measurement
-        passed. Call once at the end of the test so every measurement is still
-        recorded before the failure fires.
+        Covers every signal that resolves the step to FAILED in the report:
+        out-of-bounds measurements recorded directly on the step, failed
+        substeps, and ``report_outcome`` failures. Call it once at the end of a
+        test so the pytest verdict matches the report instead of passing green
+        while the report shows a failure.
 
-        The failure message names each out-of-bounds measurement with its
-        recorded value and bounds. ``message`` is used as the header line.
+        It fails via ``pytest.fail(pytrace=False)`` so the step resolves to
+        FAILED without an assertion traceback in ``error_info``. No-op when the
+        step and all of its descendants passed. Call after the work is done so
+        every measurement and substep is recorded before the failure fires.
+
+        The failure message names each out-of-bounds measurement and each
+        failed substep. ``message`` is used as the header line.
         """
-        if self.measurements_passed:
+        step = self.current_step
+        # ``open_step_results[step_path]`` is the same signal ``__exit__`` reads
+        # to resolve status: it is flipped False by a direct measurement failure
+        # (record_step_outcome) and by any failed child as it propagates upward
+        # (propagate_step_result). Default True covers a step that never opened.
+        if step is None or self.report_context.open_step_results.get(step.step_path, True):
             return
         import pytest
 
-        failed = self._failed_measurements
-        header = f"{message} ({len(failed)}):" if failed else message
-        body = [f"  - {m}" for m in failed]
-        pytest.fail("\n".join([header, *body]), pytrace=False)
+        prefix = f"{step.step_path}."
+        failed_substeps = [
+            s
+            for s in self.report_context.created_steps
+            if s.step_path.startswith(prefix)
+            and s.status not in (TestStatus.PASSED, TestStatus.SKIPPED, TestStatus.IN_PROGRESS)
+        ]
+        details = [f"  - measurement {m}" for m in self._failed_measurements]
+        details += [f"  - substep {s.step_path!r}: {s.status.name}" for s in failed_substeps]
+        header = f"{message} ({len(details)}):" if details else message
+        pytest.fail("\n".join([header, *details]), pytrace=False)
 
     def update_step_from_result(
         self,

From 46ca0d409fd8f254569408ea19bfbd87c7c9808b Mon Sep 17 00:00:00 2001
From: Alex Luck <luck@siftstack.com>
Date: Mon, 8 Jun 2026 09:52:20 -0700
Subject: [PATCH 19/19] =?UTF-8?q?Python(feat):=20order-independent=20pytes?=
 =?UTF-8?q?t=20report=20tree=20with=20early=20parent=20=E2=80=A6=20(#616)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../guides/pytest_plugin/configuration.md     |   3 +-
 .../pytest_plugin/pass_fail_behavior.md       |  34 +-
 .../guides/pytest_plugin/report_structure.md  |   6 +
 .../_internal/pytest_plugin/report.py         |  83 ++-
 .../_internal/pytest_plugin/steps.py          | 597 ++++++++++-------
 .../pytest_plugin/_step_status_capture.py     |  37 ++
 .../pytest_plugin/step_status_states.md       |  13 +-
 .../_tests/pytest_plugin/test_hierarchy.py    | 605 ++++++++++++++++--
 .../_tests/pytest_plugin/test_pass_fail.py    |  18 +-
 python/lib/sift_client/pytest_plugin.py       | 145 +++--
 .../util/test_results/context_manager.py      | 152 ++++-
 python/pyproject.toml                         |   7 +
 python/uv.lock                                |  22 +
 13 files changed, 1278 insertions(+), 444 deletions(-)

diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
index 47427055b..a8e291006 100644
--- a/python/docs/guides/pytest_plugin/configuration.md
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -118,8 +118,7 @@ def sift_client() -> SiftClient:
 |---|---|---|---|
 | `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
 | `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `pytest_fail_if_step_failed`, and `current_step`. |
-| `_hierarchy_parents` | internal fixture (autouse) | function | Opens a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor of the current test. Each layer is gated independently; see [settings reference](#settings-reference). |
-| `_parametrize_parents` | internal fixture (autouse) | function | Opens a parent step for each `@pytest.mark.parametrize` axis (and fixture parametrization), nested inside the hierarchy parents. |
+| `_sift_parents` | internal fixture (autouse) | function | Resolves the report-tree parents for the current test: a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor, then one per `@pytest.mark.parametrize` axis (and fixture parametrization) nested inside them. Parents are created once and reused across tests in any order, so test execution order is never changed. Each layer is gated independently; see [settings reference](#settings-reference). |
 | `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
 
 ## Settings reference
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
index 2ce3d0697..d0862778c 100644
--- a/python/docs/guides/pytest_plugin/pass_fail_behavior.md
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -13,9 +13,9 @@ The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
 | `PASSED`      | The step completed and every check it owns succeeded.                                                                  |
 | `FAILED`      | An assertion, a `pytest.fail(...)`, a failed `report_outcome`, or a failing measurement marked it.                     |
 | `ERROR`       | An unexpected exception escaped the test body or a fixture (setup or teardown).                                        |
-| `ABORTED`     | A hard exit (`SystemExit`, observed `KeyboardInterrupt`) interrupted the test.                                         |
+| `ABORTED`     | A hard exit (`SystemExit` or `KeyboardInterrupt`) cut the test off; resolved during fixture teardown.                  |
 | `SKIPPED`     | The test was skipped at collection time, at runtime, or from a fixture.                                                |
-| `IN_PROGRESS` | Test in progress or the plugin never observed a final outcome (e.g. a session-aborting interrupt killed pytest first). |
+| `IN_PROGRESS` | A transient creation state. It survives into the report only if the process is killed so abruptly that teardown never runs. |
 
 ## Normal test outcomes
 
@@ -34,15 +34,27 @@ mapping to `FAILED`. A non-assertion exception gets its formatted traceback
 
 ## Hard exits
 
-Hard exits the plugin can observe map to `ABORTED`. If pytest tears the
-session down before the plugin sees the exit, the step stays at
-`IN_PROGRESS` instead of resolving.
-
-| Scenario                                       | Trigger                   | Outcome                                                              |
-| ---------------------------------------------- | ------------------------- | -------------------------------------------------------------------- |
-| `SystemExit` from the test body                | `sys.exit(1)`             | `ABORTED`                                                            |
-| `KeyboardInterrupt` the plugin observes        | `raise KeyboardInterrupt` | `ABORTED`                                                            |
-| Session-aborting `KeyboardInterrupt`           | Ctrl-C terminates pytest  | `IN_PROGRESS` (session ends before the plugin's hooks fire)          |
+Hard exits map to `ABORTED`. The step is resolved during fixture teardown, not
+at the instant of the exit:
+
+- When the exit produces a call-phase report (`sys.exit(1)`, `SystemExit`), the
+  plugin reads the status off that report.
+- When a `KeyboardInterrupt` aborts the session before any call-phase report
+  (Ctrl-C, or `raise KeyboardInterrupt` in the body), pytest still runs fixture
+  finalizers as it unwinds. The plugin sees setup completed with no call outcome
+  and resolves the cut-off step to `ABORTED` there.
+
+The status only reaches the report if those finalizers run. If the process is
+killed before they do (`SIGKILL`, the OOM killer, power loss), nothing is written
+and the step keeps the `IN_PROGRESS` it was created with. That is the only path
+that leaves a step `IN_PROGRESS` in a finalized report.
+
+| Scenario                                       | Trigger                            | Outcome                                          |
+| ---------------------------------------------- | ---------------------------------- | ------------------------------------------------ |
+| `SystemExit` from the test body                | `sys.exit(1)`                      | `ABORTED` (read from the call-phase report)      |
+| `KeyboardInterrupt` from the test body         | `raise KeyboardInterrupt`          | `ABORTED` (resolved during teardown)             |
+| Session-aborting `KeyboardInterrupt`           | Ctrl-C terminates pytest           | `ABORTED` (resolved during teardown)             |
+| Process killed before finalizers run           | `SIGKILL` / OOM / power loss       | `IN_PROGRESS` (nothing written after creation)   |
 
 ### Abort propagation through nested substeps
 
diff --git a/python/docs/guides/pytest_plugin/report_structure.md b/python/docs/guides/pytest_plugin/report_structure.md
index dd0d8ed54..188bee4ca 100644
--- a/python/docs/guides/pytest_plugin/report_structure.md
+++ b/python/docs/guides/pytest_plugin/report_structure.md
@@ -98,6 +98,12 @@ individually opt-out via ini flags (`sift_package_step`, `sift_module_step`,
 `sift_class_step`, `sift_parametrize_nesting`). Class/module/package docstrings
 become the matching step's description.
 
+A parent step is created `IN_PROGRESS` and resolves to its final status as soon
+as the last test in its subtree finishes — independent of test execution order,
+so with incremental upload the report tree fills in progressively rather than
+all at once at the end. Its time window spans from its first test starting to its
+last test finishing.
+
 ### Linking a Run to the report
 
 `report_context` is the session-scoped fixture; mutating it in one test affects
diff --git a/python/lib/sift_client/_internal/pytest_plugin/report.py b/python/lib/sift_client/_internal/pytest_plugin/report.py
index 5ce0590f1..e125c3e03 100644
--- a/python/lib/sift_client/_internal/pytest_plugin/report.py
+++ b/python/lib/sift_client/_internal/pytest_plugin/report.py
@@ -32,9 +32,9 @@
     TEST_SYSTEM_NAME_OPTION,
 )
 from sift_client._internal.pytest_plugin.steps import (
-    drain_hierarchy_stack,
-    drain_parametrize_stack,
+    finalize_parents,
     parametrize_path_key,
+    strip_param,
 )
 from sift_client.sift_types.test_report import ErrorInfo, TestStatus
 from sift_client.util.test_results import ReportContext
@@ -124,7 +124,6 @@ def resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
 
     status: TestStatus | None = None
     error_info: ErrorInfo | None = None
-    keep_managed = False
 
     if setup_phase is not None and setup_phase.report.outcome == "failed":
         status = TestStatus.ERROR
@@ -136,11 +135,13 @@ def resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
     elif setup_phase is not None and setup_phase.report.outcome == "skipped":
         status = TestStatus.SKIPPED
     elif call_phase is None:
-        # Setup completed but the call-phase report never fired; the inner
-        # pytester session was aborted (e.g. by KeyboardInterrupt) before the
-        # plugin could observe the outcome. Leave the step at IN_PROGRESS so
-        # the report does not lie about a clean pass.
-        keep_managed = True
+        # Setup completed but the call-phase report never fired; the session was
+        # aborted (e.g. by KeyboardInterrupt) before the plugin could observe the
+        # outcome. Resolve to ABORTED rather than leaving it IN_PROGRESS, since the
+        # test was cut off and a finalized report should not carry a step that
+        # still reads as in-progress. No call ``excinfo`` exists here, so there is
+        # no traceback to attach.
+        status = TestStatus.ABORTED
     else:
         wasxfail = getattr(call_phase.report, "wasxfail", None)
         if wasxfail is not None:
@@ -179,7 +180,7 @@ def resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
             elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
                 # Hard exits the plugin can observe: pytest converted the
                 # raise into a call-phase report. The session-aborting variant
-                # (call_phase is None) lands earlier and stays IN_PROGRESS.
+                # (call_phase is None) lands in the branch above, also ABORTED.
                 status = TestStatus.ABORTED
                 error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
             elif xfail_marker is not None:
@@ -194,14 +195,13 @@ def resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
                 status = TestStatus.ERROR
                 error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
 
-    if status is None and not keep_managed:
+    if status is None:
         return
 
-    if status is not None:
-        # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
-        current_step.__dict__["status"] = status
-        if error_info is not None:
-            current_step.__dict__["error_info"] = error_info
+    # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
+    current_step.__dict__["status"] = status
+    if error_info is not None:
+        current_step.__dict__["error_info"] = error_info
     new_step._sift_managed_externally = True
 
 
@@ -232,19 +232,6 @@ def _relativize(path: Path, rootpath: Path) -> str:
     return "" if rel == "." else rel
 
 
-def _strip_param(nodeid: str) -> str:
-    """Drop the trailing ``[param]`` from a nodeid, keeping ``file::Class::func``.
-
-    The parametrize id is a variation of the test, not its identity; leaving it
-    in would make a re-parametrization silently shift the grouping key. Splits on
-    the last ``::`` segment and cuts at its first ``[``; class/function names
-    never contain ``[``, so nested brackets in a param value can't confuse it.
-    """
-    head, sep, leaf = nodeid.rpartition("::")
-    leaf = leaf.split("[", 1)[0]
-    return f"{head}{sep}{leaf}"
-
-
 def derive_target(request: pytest.FixtureRequest, args: tuple[str, ...]) -> str:
     """Describe what was run, from the collected items rather than the command line.
 
@@ -273,7 +260,7 @@ def _anchor(rel: str) -> str:
     if not items:
         return root
     if len(items) == 1:
-        return _anchor(_strip_param(items[0].nodeid))
+        return _anchor(strip_param(items[0].nodeid))
     paths = {p for p in (getattr(i, "path", None) for i in items) if p is not None}
     if not paths:
         return root
@@ -434,16 +421,13 @@ def report_context_impl(
         try:
             yield context
         finally:
-            # Drain the hierarchy + parametrize stacks INSIDE the
-            # ReportContext's ``with`` block, so the final ``__exit__``
-            # update calls for those parent steps are written to the log
-            # file BEFORE the import worker drains. Without this, the
-            # worker exits with a partial backlog and the parent steps
-            # are stuck IN_PROGRESS in the Sift report.
-            try:
-                drain_parametrize_stack()
-            finally:
-                drain_hierarchy_stack()
+            # Close any report-tree parents still open INSIDE the ReportContext's
+            # ``with`` block, so their final ``__exit__`` update calls are written
+            # to the log file BEFORE the import worker drains. Without this, the
+            # worker exits with a partial backlog and the parent steps are stuck
+            # IN_PROGRESS in the Sift report. Most parents already closed early as
+            # their subtrees finished; this is the backstop for the rest.
+            finalize_parents()
 
 
 # Placeholder credentials used in --sift-offline mode when env/ini values
@@ -478,12 +462,12 @@ def step_impl(
     report_context: ReportContext, request: pytest.FixtureRequest
 ) -> Generator[NewStep, None, None]:
     node = request.node
-    # Items get a parametrize path stashed in ``pytest_collection_modifyitems``;
+    # Items get a parametrize path stashed in ``pytest_itemcollected``;
     # modules/other nodes fall back to their node name. The leaf frame
-    # (``path[-1]``) is the test-specific display name; parents are opened
-    # by ``_parametrize_parents``. When parametrize-nesting is disabled, fall
-    # back to the bracket-mangled pytest name (e.g. ``test_a[1]``) so the leaf
-    # remains uniquely identifiable.
+    # (``path[-1]``) is the test-specific display name; parents are opened by
+    # ``_sift_parents``. When parametrize-nesting is disabled, fall back to the
+    # bracket-mangled pytest name (e.g. ``test_a[1]``) so the leaf remains
+    # uniquely identifiable.
     if PARAMETRIZE_NESTING_OPTION.resolve(request.config):
         path = node.stash.get(parametrize_path_key, ())
         name = path[-1] if path else str(node.name)
@@ -498,8 +482,17 @@ def step_impl(
         existing_docstring = getattr(getattr(node, "obj", None), "__doc__", None) or None
     except Exception:
         existing_docstring = None
+    # Attach the leaf under the parent ``_sift_parents`` resolved for this item
+    # (None -> a report-root step). ``push=True`` keeps the leaf on the step stack
+    # so any in-test ``substep`` nests under it.
+    parent_ns: NewStep | None = getattr(node, "_sift_parent", None)
+    parent_step = parent_ns.current_step if parent_ns is not None else None
     with report_context.new_step(
-        name=name, description=existing_docstring, assertion_as_fail_not_error=False
+        name=name,
+        description=existing_docstring,
+        assertion_as_fail_not_error=False,
+        parent=parent_step,
+        push=True,
     ) as new_step:
         node._sift_step = new_step
         yield new_step
diff --git a/python/lib/sift_client/_internal/pytest_plugin/steps.py b/python/lib/sift_client/_internal/pytest_plugin/steps.py
index 9904ceecb..26779cb73 100644
--- a/python/lib/sift_client/_internal/pytest_plugin/steps.py
+++ b/python/lib/sift_client/_internal/pytest_plugin/steps.py
@@ -1,19 +1,22 @@
-"""Parent-step stacks: the parametrize and hierarchy frames shared across items.
-
-Holds the collection-phase stash keys and the two module-level frame stacks
-(``parametrize_stack`` / ``hierarchy_stack``), the helpers that build a chain
-for an item and drain the stacks, and the per-item reconcilers the autouse
-fixtures delegate to. Frames are shared across sibling test items and drained
-innermost-first at session end.
+"""Report-tree parent steps: an identity-keyed registry built without reordering.
+
+Each test's package/module/class ancestors ("hierarchy" parents) and each
+``@pytest.mark.parametrize`` axis ("parametrize" parents) become parent steps the
+leaf nests under. Parents are kept in identity-keyed registries — created once and
+reused by every descendant regardless of execution order — so the plugin never
+reorders test items. A parent is closed as soon as the last leaf in its subtree
+finishes (``release_finished_leaf``), with ``finalize_parents`` as the session-end
+backstop for anything still open.
 """
 
 from __future__ import annotations
 
 import warnings
-from typing import Any, Tuple
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple
 
 import pytest
 
+from sift_client._internal.pytest_plugin.modes import gate_enabled
 from sift_client._internal.pytest_plugin.options import (
     CLASS_STEP_OPTION,
     MODULE_STEP_OPTION,
@@ -21,110 +24,75 @@
     PARAMETRIZE_NESTING_OPTION,
 )
 
-STASH_MISSING = object()
-
-parametrize_path_key = pytest.StashKey[Tuple[str, ...]]()
-# Each frame: (path_key, open step). Frames are shared across sibling test items
-# and drained at session end.
-parametrize_stack: list[tuple[str, Any]] = []
-
-hierarchy_key = pytest.StashKey[Tuple[Tuple[str, str, "str | None", bool], ...]]()
-# Outer-to-inner frames for the item's collection-tree ancestors. Each chain
-# entry is ``(identity, name, doc, rendered)``:
-#   - ``identity``: a globally-unique key (``node.nodeid``) used for diff
-#     comparison. Two ancestors at the same depth with the same display name
-#     but reached via different paths (e.g., ``proj_a/utils`` and
-#     ``proj_b/utils`` in a monorepo) get distinct identities, so they never
-#     silently merge in the diff.
-#   - ``name``: the human-readable step name used when ``rendered`` opens the
-#     Sift step.
-#   - ``doc``: docstring used for the step description if rendered.
-#   - ``rendered``: True iff the corresponding ``sift_*_step`` ini flag is on.
-#     Non-rendered frames participate in the diff but do not call
-#     ``rc.new_step(...)``; they appear with ``ns=None`` in the stack.
-#
-# Stack entries: ``(identity, name, open_step_or_None)``. Frames are shared
-# across sibling test items and drained at session end. Drained AFTER
-# parametrize_stack since parametrize parents nest inside hierarchy parents.
-hierarchy_stack: list[tuple[str, str, Any]] = []
-
-
-def drain_step_stack(stack: list, *, swallow_errors: bool = True) -> None:
-    """Pop and close every frame.
-
-    With ``swallow_errors=True`` (default, used at teardown / session end),
-    per-frame failures are surfaced as ``SiftPytestStepDrainWarning`` so a
-    single misbehaving ``__exit__`` can't block the rest of the stack from
-    cleaning up or cascade out of pytest's finalizer chain.
+if TYPE_CHECKING:
+    from typing import Callable
 
-    With ``swallow_errors=False`` (mid-session, when a class transition forces
-    parametrize parents to close), the stack is still fully drained but the
-    first per-frame exception is re-raised at the end as a
-    ``SiftPytestStepDrainError`` so a real upstream invariant violation
-    surfaces as a test error instead of a silenceable warning.
-    """
-    from sift_client.pytest_plugin import SiftPytestStepDrainError, SiftPytestStepDrainWarning
-
-    errors: list[tuple[str, BaseException]] = []
-    while stack:
-        entry = stack.pop()
-        # Tolerate either ``(name, ns)`` (parametrize stack) or
-        # ``(identity, name, ns)`` (hierarchy stack) entries.
-        name, ns = entry[-2], entry[-1]
-        if ns is None:
-            # Non-rendered diff-only frame (e.g. a Package frame when
-            # ``sift_package_step=false``); nothing to close.
-            continue
-        try:
-            ns.__exit__(None, None, None)
-        except Exception as exc:
-            if swallow_errors:
-                warnings.warn(
-                    f"Sift plugin: closing step {name!r} during drain raised "
-                    f"{type(exc).__name__}: {exc}",
-                    SiftPytestStepDrainWarning,
-                    stacklevel=2,
-                )
-            else:
-                errors.append((name, exc))
-    if errors:
-        first_name, first_exc = errors[0]
-        raise SiftPytestStepDrainError(
-            f"Sift plugin: {len(errors)} step(s) raised while draining mid-session; "
-            f"first failure on {first_name!r}: {type(first_exc).__name__}: {first_exc}"
-        ) from first_exc
-
-
-def drain_parametrize_stack(*, swallow_errors: bool = True) -> None:
-    drain_step_stack(parametrize_stack, swallow_errors=swallow_errors)
-
-
-def drain_hierarchy_stack(*, swallow_errors: bool = True) -> None:
-    drain_step_stack(hierarchy_stack, swallow_errors=swallow_errors)
-
-
-def close_frame(name: str, ns: Any) -> None:
-    """Close a single frame, warning on per-frame failure.
-
-    Used by the mid-session hierarchy-stack pop and the rollback paths so a
-    misbehaving ``__exit__`` neither shadows the original exception nor leaks
-    sibling frames. ``ns=None`` indicates a non-rendered diff-only frame; skip.
-    """
-    from sift_client.pytest_plugin import SiftPytestStepDrainWarning
+    from sift_client.util.test_results import ReportContext
+    from sift_client.util.test_results.context_manager import NewStep
 
-    if ns is None:
-        return
-    try:
-        ns.__exit__(None, None, None)
-    except Exception as exc:
-        warnings.warn(
-            f"Sift plugin: closing step {name!r} raised {type(exc).__name__}: {exc}",
-            SiftPytestStepDrainWarning,
-            stacklevel=2,
-        )
-
-
-def build_parametrize_path(item: pytest.Item) -> tuple[str, ...]:
+# --- Report-tree type aliases ---------------------------------------------
+# The plugin juggles a few small tuple/dict shapes for the parent step tree;
+# naming them keeps the signatures below readable. Defined with ``typing``
+# generics (not ``list``/``tuple``) because some are used in runtime
+# ``StashKey[...]`` subscriptions, which must stay importable on Python 3.8.
+#
+# A hierarchy parent's identity is just a ``str`` (the ancestor node's
+# ``nodeid``); a parametrize parent's identity is a ``ParametrizeKey``: the
+# test's param-stripped node id followed by its outer-to-inner axis frames
+# (e.g. ``("pkg/test_m.py::TestC::test_a", "v=1")``).
+ParametrizeKey = Tuple[str, ...]
+# Outer-to-inner display-name axis path stashed per parametrized item
+# (``(originalname, "v=1", ...)``); the leaf is its last frame.
+ParametrizePath = Tuple[str, ...]  # same type as ParametrizeKey; separate name for intent
+# One collection-tree ancestor: ``(identity, display name, docstring, rendered)``.
+# ``rendered`` is True iff that layer's ``sift_*_step`` ini flag opens a step.
+HierarchyFrame = Tuple[str, str, Optional[str], bool]
+# Outer-to-inner ancestor frames stashed per item.
+HierarchyChain = Tuple[HierarchyFrame, ...]
+# A rendered parent to open, as returned by ``resolved_parents``.
+HierarchyParent = Tuple[str, str, Optional[str]]  # (identity, name, docstring)
+ParametrizeParent = Tuple[ParametrizeKey, str]  # (registry key, frame name)
+# A gated-in leaf's parents: its rendered hierarchy identities and parametrize keys.
+LeafParents = Tuple[List[str], List[ParametrizeKey]]
+
+parametrize_path_key = pytest.StashKey[ParametrizePath]()
+
+hierarchy_key = pytest.StashKey[HierarchyChain]()
+# See ``HierarchyFrame`` above for the chain entry shape. ``identity`` is the
+# node's ``nodeid``: two ancestors at the same depth with the same display name
+# but reached via different paths (e.g., ``proj_a/utils`` and ``proj_b/utils`` in
+# a monorepo) get distinct identities, so they never silently merge. Non-rendered
+# frames open no step; the next rendered descendant attaches to the nearest
+# rendered ancestor instead.
+
+# Open report-tree parent steps, keyed by identity so they are created once and
+# reused by every descendant regardless of test execution order. The leaf step
+# for each test is created under its resolved parent (see ``report.step_impl``),
+# so no global ordering of test items is required. Parents live OUTSIDE
+# ``ReportContext.step_stack`` (created with ``push=False``) and are closed early
+# by ``release_finished_leaf``, or at session end by ``finalize_parents``.
+#
+# Hierarchy parents (packages / modules / classes) keyed by the ancestor node's
+# ``nodeid``:
+hierarchy_parents: dict[str, NewStep] = {}
+# Parametrize parents keyed by ``ParametrizeKey``, so sibling parametrizations of
+# one test share a parent while parametrizations under different
+# tests/classes/modules never collide:
+parametrize_parents: dict[ParametrizeKey, NewStep] = {}
+
+# Remaining descendant leaves per open-able parent, keyed exactly like the
+# registries above. Populated from the collected (and selected) items in
+# ``tally_expected_parents`` and decremented as each test finishes; when a count
+# reaches zero the parent's whole subtree is done and it is closed early (see
+# ``release_finished_leaf``) instead of waiting for session end.
+expected_hierarchy: dict[str, int] = {}
+expected_parametrize: dict[ParametrizeKey, int] = {}
+# Each gated-in leaf's parent identities, so ``release_finished_leaf`` — which
+# only receives a nodeid — knows which counters to decrement.
+leaf_parents: dict[str, LeafParents] = {}  # keyed by the leaf item's nodeid
+
+
+def build_parametrize_path(item: pytest.Item) -> ParametrizePath:
     """Outer-to-inner step display names for a parametrized item.
 
     Pytest stores ``callspec.params`` with the BOTTOM decorator's axis first;
@@ -143,23 +111,21 @@ def build_parametrize_path(item: pytest.Item) -> tuple[str, ...]:
 def build_hierarchy_chain(
     item: pytest.Item | pytest.Collector,
     config: pytest.Config,
-) -> tuple[tuple[str, str, str | None, bool], ...]:
+) -> HierarchyChain:
     """Outer-to-inner ``(identity, name, docstring, rendered)`` for collection ancestors.
 
     Walks ``item.parent`` upward and ALWAYS collects every ``pytest.Package``,
-    ``pytest.Module``, and ``pytest.Class`` ancestor; they all participate in
-    the diff that keeps the report tree coherent across tests, so two
-    same-named ancestors reached via different paths (e.g., ``proj_a/utils``
-    and ``proj_b/utils`` in a monorepo where the ``proj_*`` dirs are
-    ``pytest.Dir`` nodes the walker skips) cannot silently merge.
+    ``pytest.Module``, and ``pytest.Class`` ancestor; they all carry the identity
+    that keeps the report tree coherent across tests, so two same-named ancestors
+    reached via different paths (e.g., ``proj_a/utils`` and ``proj_b/utils`` in a
+    monorepo where the ``proj_*`` dirs are ``pytest.Dir`` nodes the walker skips)
+    cannot silently merge.
 
-    The ``identity`` field is ``node.nodeid``, globally unique per collected
-    node. The diff compares on identity, not the display ``name``.
+    The ``identity`` field is ``node.nodeid``, globally unique per collected node.
 
     The ``rendered`` flag is True iff the layer's ini flag is on
     (``sift_package_step`` / ``sift_module_step`` / ``sift_class_step``).
-    Non-rendered frames participate in the diff for identity but don't open a
-    Sift step.
+    Non-rendered frames carry identity but don't open a Sift step.
 
     The ``node.obj`` access is a pytest property that imports the underlying
     Python object and can raise *any* exception (ImportError, custom
@@ -171,7 +137,7 @@ def build_hierarchy_chain(
     include_module = bool(MODULE_STEP_OPTION.resolve(config))
     include_class = bool(CLASS_STEP_OPTION.resolve(config))
 
-    chain: list[tuple[str, str, str | None, bool]] = []
+    chain: list[HierarchyFrame] = []
     # ``node.parent`` is typed as the internal ``_pytest.nodes.Node`` which
     # isn't part of pytest's public API; widen to ``Any`` for the walk.
     node: Any = item
@@ -196,115 +162,292 @@ def build_hierarchy_chain(
     return tuple(reversed(chain))
 
 
-def reconcile_hierarchy(request: pytest.FixtureRequest, config: pytest.Config) -> None:
-    """Open/close hierarchy parents so the open stack matches the item's chain.
+def resolved_parents(
+    node: pytest.Item,
+    config: pytest.Config,
+) -> tuple[list[HierarchyParent], list[ParametrizeParent]]:
+    """Compute the rendered parents of ``node`` as ``(hierarchy, parametrize)``.
+
+    This is the one place parent identities are derived: both the opener
+    (``_resolve_parent_chain``) and the early-close tally
+    (``tally_expected_parents``) call it, so their keys always agree. Each list
+    runs outer-to-inner:
+
+    * hierarchy: one ``(identity, name, doc)`` per rendered package/module/class
+      ancestor, where ``identity`` is the ancestor's ``nodeid`` (registry key).
+    * parametrize: one ``(registry key, frame name)`` per parametrize axis other
+      than the innermost, which is the leaf the ``step`` fixture creates. Empty
+      if ``sift_parametrize_nesting`` is off or there are no outer axis frames.
+
+    The per-item stash written in ``pytest_itemcollected`` is used when present;
+    an item without it (e.g. injected by a later hook) is computed on the spot.
+    """
+    if hierarchy_key not in node.stash:
+        chain = build_hierarchy_chain(node, config)
+    else:
+        chain = node.stash[hierarchy_key]
+    # Only rendered frames become steps; a non-rendered frame is dropped, so its
+    # rendered descendants end up under the nearest rendered ancestor.
+    hierarchy: list[HierarchyParent] = [frame[:3] for frame in chain if frame[3]]
+
+    parametrize: list[ParametrizeParent] = []
+    if not PARAMETRIZE_NESTING_OPTION.resolve(config):
+        return hierarchy, parametrize
+    if parametrize_path_key not in node.stash:
+        path = build_parametrize_path(node)
+    else:
+        path = node.stash[parametrize_path_key]
+    if path:
+        # The key starts at the param-stripped test id and gains one outer frame
+        # per level: sibling params share parents, other tests never collide.
+        key: ParametrizeKey = (strip_param(node.nodeid),)
+        for frame in path[:-1]:
+            key += (frame,)
+            parametrize.append((key, frame))
+    return hierarchy, parametrize
+
+
+def strip_param(nodeid: str) -> str:
+    """Drop the trailing ``[param]`` from a nodeid, keeping ``file::Class::func``.
+
+    The parametrize id is a variation of the test, not its identity — leaving it
+    in would make a re-parametrization silently shift the grouping key. Cuts at
+    the first ``[`` after the file path (the first ``::``): class/function names
+    never contain ``[``, but a param id may itself contain ``::`` or ``[``.
+    """
+    path, sep, rest = nodeid.partition("::")
+    rest = (rest if sep else path).split("[", 1)[0]
+    return f"{path}{sep}{rest}" if sep else rest
+
+
+def get_or_create_parent_chain(
+    node: pytest.Item,
+    config: pytest.Config,
+    request: pytest.FixtureRequest,
+) -> NewStep | None:
+    """Return the innermost parent step for ``node``, opening missing ancestors.
+
+    Fixture-side entry point to ``_resolve_parent_chain``: rendered hierarchy
+    ancestors are visited outer-to-inner, then the parametrize axes (see
+    ``resolved_parents``), and each identity is looked up in — or added to — the
+    registries. New parents are opened with ``push=False`` beneath the parent
+    resolved so far and are reused by later descendants in any execution order.
+
+    The result is ``None`` when no rendered parent applies, in which case the
+    leaf is a report-root step. ``report_context`` is requested lazily, only
+    once a parent has to be created, so excluded items never set it up eagerly.
+    """
+    fetched: list[ReportContext] = []
+
+    def lazy_context() -> ReportContext:
+        # First call resolves the fixture; later calls reuse the stored value.
+        if fetched:
+            return fetched[0]
+        fetched.append(request.getfixturevalue("report_context"))
+        return fetched[0]
+
+    return _resolve_parent_chain(node, config, lazy_context)
+
+
+def resolve_parent_chain_in_context(
+    node: pytest.Item,
+    config: pytest.Config,
+    context: ReportContext,
+) -> NewStep | None:
+    """Variant of ``get_or_create_parent_chain`` that takes the ``ReportContext``.
+
+    Serves the collection-skip path in ``pytest_runtest_makereport``. A
+    marker-skipped item never runs the autouse fixtures, so that path holds only
+    the session ``ReportContext`` and no ``FixtureRequest`` to resolve
+    ``report_context`` through. Going through the same create-once logic nests
+    the skipped item's step under the registry parents its running siblings use.
+    """
+    return _resolve_parent_chain(node=node, config=config, rc=lambda: context)
+
+
+def _resolve_parent_chain(
+    node: pytest.Item,
+    config: pytest.Config,
+    rc: Callable[[], ReportContext],
+) -> NewStep | None:
+    """Get-or-create ``node``'s parents outer-to-inner and return the innermost.
+
+    ``rc`` provides the ``ReportContext`` and is invoked only when a parent is
+    missing from its registry, so a lazy getter never sets up the context early.
+    """
+    hierarchy, parametrize = resolved_parents(node, config)
+    # One ``(registry, key, new_step kwargs)`` entry per parent, outer-to-inner:
+    # hierarchy ancestors first, then the parametrize axes nested inside them.
+    # Only hierarchy parents pass a description (the ancestor's docstring).
+    plan: list[tuple[dict[Any, NewStep], Any, dict[str, Any]]] = [
+        (hierarchy_parents, identity, {"name": name, "description": doc})
+        for identity, name, doc in hierarchy
+    ]
+    plan += [(parametrize_parents, key, {"name": frame}) for key, frame in parametrize]
+
+    # TestStep of the parent resolved so far; ``None`` attaches at the report root.
+    attach_to: Any = None
+    innermost: NewStep | None = None
+    for registry, key, step_kwargs in plan:
+        opened = registry.get(key)
+        if opened is None:
+            # First descendant to need this parent: open it with ``push=False``
+            # and register it, so every later descendant reuses the same step.
+            opened = rc().new_step(
+                **step_kwargs,
+                assertion_as_fail_not_error=False,
+                parent=attach_to,
+                push=False,
+            )
+            # Entered by hand: the step must outlive this call and is exited
+            # later by ``close_parent``.
+            opened.__enter__()
+            registry[key] = opened
+        # Whether reused or just opened, this parent is what the next level
+        # (and ultimately the leaf) nests under.
+        attach_to = opened.current_step
+        innermost = opened
+
+    return innermost
+
 
-    Diffs the item's desired ``(package, module, class)`` chain against
-    ``hierarchy_stack`` on identity (nodeid), pops the stale tail, and pushes
-    new rendered frames. Which node types render is decided at build time by
-    ``sift_package_step`` / ``sift_module_step`` / ``sift_class_step``; when the
-    chain changes, the parametrize stack is drained first since parametrize
-    parents nest INSIDE these.
+def close_parent(ns: NewStep) -> None:
+    """Close one open report-tree parent, stamping its last-descendant finish time.
+
+    Shared by mid-session early close (``release_finished_leaf``) and the
+    session-end drain (``finalize_parents``). The ``end_time`` override comes from
+    ``ReportContext.parent_end_times`` so the parent's window ends at its latest
+    descendant rather than wall-clock at close. A misbehaving ``__exit__`` is
+    surfaced as a warning so it never blocks the remaining parents or cascades out
+    of pytest's finalizer chain.
     """
-    # Fall back to computing the chain on-demand for items that bypassed
-    # ``pytest_collection_modifyitems`` (e.g., dynamically inserted by another
-    # plugin's later hook). Defaulting to ``()`` would incorrectly drain the
-    # entire open hierarchy stack for those items.
-    desired = request.node.stash.get(hierarchy_key, STASH_MISSING)
-    if desired is STASH_MISSING:
-        desired = build_hierarchy_chain(request.node, config)
-    common = 0
-    # Compare on identity (nodeid); same-named ancestors at different paths
-    # MUST stay distinct.
-    while (
-        common < len(hierarchy_stack)
-        and common < len(desired)
-        and hierarchy_stack[common][0] == desired[common][0]
-    ):
-        common += 1
-    # Any change to the hierarchy chain orphans parametrize parents from the
-    # previous test. Drain them before mutating the hierarchy stack so
-    # ReportContext's top-of-stack invariant holds. Strict mode: a per-frame
-    # ``__exit__`` failure here signals a real upstream drift between the
-    # plugin stacks and ReportContext; raise it as a test error instead of a
-    # silenceable warning.
-    if common < len(hierarchy_stack) or common < len(desired):
-        drain_parametrize_stack(swallow_errors=False)
-    # Symmetric per-frame guard for the hierarchy pop so one bad ``__exit__``
-    # doesn't leave hierarchy_stack partially drained for every subsequent test.
-    while len(hierarchy_stack) > common:
-        _identity, name, ns = hierarchy_stack.pop()
-        close_frame(name, ns)
-    if not desired[common:]:
+    from sift_client.pytest_plugin import REPORT_CONTEXT, SiftPytestStepDrainWarning
+
+    step = ns.current_step
+    if step is None:  # NOTE(review): presumably never entered or already closed — confirm
         return
-    # Fetch ``report_context`` lazily, but only when there's at least one
-    # rendered frame to push. Pure diff-only frames (e.g. a Package frame when
-    # ``sift_package_step=false``) just update hierarchy_stack with ns=None.
-    rc = None
-    # Roll back any partial push so a mid-loop exception doesn't leave half
-    # the chain orphaned on the stack. Per-frame guard inside the rollback so
-    # a failing ``__exit__`` doesn't shadow the original exception or leak
-    # the remaining opened frames.
-    opened: list[tuple[str, str, Any]] = []
+    if REPORT_CONTEXT is not None:  # unset context: leave any end-time override untouched
+        ns._sift_end_time_override = REPORT_CONTEXT.parent_end_times.get(step.step_path)
     try:
-        for identity, name, doc, rendered in desired[common:]:
-            if rendered:
-                if rc is None:
-                    rc = request.getfixturevalue("report_context")
-                ns = rc.new_step(name=name, description=doc, assertion_as_fail_not_error=False)
-                ns.__enter__()
-                opened.append((identity, name, ns))
-            else:
-                opened.append((identity, name, None))
-    except BaseException:
-        while opened:
-            _identity, name, ns = opened.pop()
-            close_frame(name, ns)
-        raise
-    hierarchy_stack.extend(opened)
-
-
-def reconcile_parametrize(request: pytest.FixtureRequest, config: pytest.Config) -> None:
-    """Open/close shared parametrize parents so the open stack matches the item.
-
-    Diffs the item's desired parametrize path against ``parametrize_stack``:
-    pops the stale tail, then opens new parents (everything except the innermost
-    frame, which the ``step`` fixture creates as the leaf). Parents persist
-    across sibling items so a tree like ``test_x[a=1]`` / ``test_x[a=2]`` shares
-    one ``test_x`` container. No-op when ``sift_parametrize_nesting=false``.
+        ns.__exit__(None, None, None)  # exit as a clean (no-exception) context manager
+    except Exception as exc:
+        warnings.warn(
+            f"Sift plugin: closing parent step {step.name!r} raised {type(exc).__name__}: {exc}",
+            SiftPytestStepDrainWarning,
+            stacklevel=2,  # attribute the warning to the caller of close_parent
+        )
+
+
+def close_parents_innermost_first(parents: list[NewStep]) -> None:
+    """Close the given open parents deepest-``step_path`` first.
+
+    Innermost-first means a child parent's ``propagate_step_result`` (status) and
+    ``note_close`` (finish time) reach its parent's bookkeeping before that parent
+    resolves — so a failing/late subtree rolls up correctly whether parents close
+    mid-session or at session end.
     """
-    if not PARAMETRIZE_NESTING_OPTION.resolve(config):
-        return
-    # Fall back to on-demand computation for dynamically-inserted items;
-    # see reconcile_hierarchy for the same rationale.
-    desired = request.node.stash.get(parametrize_path_key, STASH_MISSING)
-    if desired is STASH_MISSING:
-        desired = build_parametrize_path(request.node)
-    parents = desired[:-1]
-    common = 0
-    while (
-        common < len(parametrize_stack)
-        and common < len(parents)
-        and parametrize_stack[common][0] == parents[common]
-    ):
-        common += 1
-    # Per-frame guard so one bad ``__exit__`` doesn't leave parametrize_stack
-    # partially drained for every subsequent test.
-    while len(parametrize_stack) > common:
-        name, ns = parametrize_stack.pop()
-        close_frame(name, ns)
-    if not parents[common:]:
+    parents.sort(  # in place: the caller's list is reordered
+        key=lambda ns: ns.current_step.step_path.count(".") if ns.current_step else -1,
+        reverse=True,  # most "." in step_path (deepest) first; step-less entries (-1) last
+    )
+    for ns in parents:
+        close_parent(ns)
+
+
+def finalize_parents() -> None:
+    """Session-end backstop: close whatever parents remain open, innermost-first.
+
+    Covers parents ``release_finished_leaf`` never closed early, e.g. because an
+    aborted session left part of a subtree unrun. The registries and counters
+    are emptied before anything is closed, which makes a repeat call (the second
+    drain site, ``pytest_sessionfinish`` after ``report_context_impl``) a no-op.
+    """
+    still_open = list(parametrize_parents.values())
+    still_open += hierarchy_parents.values()
+    # Reset all module state before closing, so a second call finds nothing.
+    for registry in (parametrize_parents, hierarchy_parents):
+        registry.clear()
+    for tally in (expected_hierarchy, expected_parametrize, leaf_parents):
+        tally.clear()
+    close_parents_innermost_first(still_open)
+
+
+def tally_expected_parents(session: pytest.Session) -> None:
+    """Record how many leaves sit under each open-able parent (for early close).
+
+    Called from ``pytest_collection_finish``, i.e. after every ``modifyitems``
+    hook and deselection, so ``session.items`` is already the final selection.
+    Items that fail ``gate_enabled`` are left out: ``sift_exclude``-d siblings
+    (or a wholly gated-off session, such as the dev suite's own outer run) then
+    cannot keep a partially-excluded class open after its included tests finish.
+    All three maps are reset first, because pytester runs inner sessions
+    in-process and those sessions share this module's state.
+    """
+    for state in (expected_hierarchy, expected_parametrize, leaf_parents):
+        state.clear()
+    config = session.config
+    for item in session.items:
+        if not gate_enabled(item, config):
+            continue
+        hierarchy, parametrize = resolved_parents(item, config)
+        h_ids = [frame[0] for frame in hierarchy]
+        p_keys = [pair[0] for pair in parametrize]
+        if not (h_ids or p_keys):
+            continue  # leaf is a report-root step; no parent to close
+        leaf_parents[item.nodeid] = (h_ids, p_keys)
+        for identity in h_ids:
+            expected_hierarchy[identity] = 1 + expected_hierarchy.get(identity, 0)
+        for key in p_keys:
+            expected_parametrize[key] = 1 + expected_parametrize.get(key, 0)
+
+
+def _decrement_parent_counts(
+    keys: list[Any],
+    expected: dict[Any, int],
+    registry: dict[Any, NewStep],
+    ready: list[NewStep],
+) -> None:
+    """Take one finished leaf off the remaining-descendant count of each key.
+
+    A key whose count drops to zero has a complete subtree: it is removed from
+    both ``expected`` and ``registry``, and its step — when one was actually
+    opened — is appended to ``ready`` for the caller to close. Keys absent from
+    ``expected`` are skipped. ``release_finished_leaf`` calls this once for the
+    hierarchy (count, registry) pair and once for the parametrize pair.
+    """
+    for key in keys:
+        if key not in expected:
+            continue
+        left = expected[key] - 1
+        if left > 0:
+            expected[key] = left
+            continue
+        del expected[key]
+        finished = registry.pop(key, None)
+        if finished is not None:
+            ready.append(finished)
+
+
+def release_finished_leaf(nodeid: str) -> None:
+    """Decrement the finished item's parents; close any whose subtree is now done.
+
+    Called from ``pytest_runtest_logfinish``, which fires once per item for every
+    outcome (pass / fail / skip / error). When a parent's remaining-leaf count
+    reaches zero its whole subtree has finished, so it is closed now rather than
+    at session end — giving incremental uploads a progressively-resolving report
+    under any execution order. Closes innermost-first so a child parent rolls its
+    result and finish time up before its own parent resolves; several levels can
+    complete on the same leaf (e.g. the last param variant closes its parametrize
+    parent, class, and module at once). Items not in ``leaf_parents`` (gated-off,
+    or injected after collection) are ignored; anything left open is handled by
+    ``finalize_parents``.
+    """
+    entry = leaf_parents.pop(nodeid, None)  # popped, so a repeat call for this nodeid no-ops
+    if entry is None:
         return
-    rc = request.getfixturevalue("report_context")
-    opened: list[tuple[str, Any]] = []
-    try:
-        for display in parents[common:]:
-            ns = rc.new_step(name=display, assertion_as_fail_not_error=False)
-            ns.__enter__()
-            opened.append((display, ns))
-    except BaseException:
-        while opened:
-            name, ns = opened.pop()
-            close_frame(name, ns)
-        raise
-    parametrize_stack.extend(opened)
+    h_ids, p_keys = entry
+    ready: list[NewStep] = []  # parents whose count just reached zero, in any order
+    _decrement_parent_counts(h_ids, expected_hierarchy, hierarchy_parents, ready)
+    _decrement_parent_counts(p_keys, expected_parametrize, parametrize_parents, ready)
+    if ready:  # nothing completed on this leaf -> nothing to close
+        close_parents_innermost_first(ready)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
index 77e09bdf5..74c498fd1 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -28,6 +28,10 @@ class CapturedStep:
     parent_step_id: str | None
     statuses: list[TestStatus] = field(default_factory=list)
     error_messages: list[str] = field(default_factory=list)
+    # ``startTime`` from the create entry; ``endTime`` is the latest seen across
+    # create/update entries. Both are RFC3339 strings.
+    start_time: str | None = None
+    end_time: str | None = None
 
 
 _PROTO_STATUS_NAMES = {
@@ -68,6 +72,8 @@ def parse_log(log_path: Path) -> dict[str, CapturedStep]:
                 parent_step_id=test_step.get("parentStepId") or None,
                 statuses=[_status(test_step.get("status"))],
                 error_messages=[error_message] if error_message else [],
+                start_time=test_step.get("startTime"),
+                end_time=test_step.get("endTime"),
             )
         elif request_type == "UpdateTestStep":
             step_id = test_step.get("testStepId")
@@ -76,6 +82,8 @@ def parse_log(log_path: Path) -> dict[str, CapturedStep]:
                 steps[step_id].statuses.append(_status(new_status))
                 if error_message:
                     steps[step_id].error_messages.append(error_message)
+                if test_step.get("endTime") is not None:
+                    steps[step_id].end_time = test_step.get("endTime")
     return steps
 
 
@@ -127,6 +135,32 @@ def final_error_message(name: str) -> str | None:
     return step.error_messages[-1] if step and step.error_messages else None
 
 
+def log_events(log_path: Path) -> list[tuple[str, str, TestStatus]]:
+    """Ordered ``(request_type, step_name, status)`` tuples as they appear in the log.
+
+    Unlike ``load_steps`` (which collapses each step into one record), this
+    preserves write order, so tests can assert *when* a step resolved relative to
+    other entries — e.g. that a container's terminal ``UpdateTestStep`` precedes a
+    later sibling's ``CreateTestStep`` (proof it closed mid-session, not at the
+    end). ``UpdateTestStep`` entries carry only an id, so the name is resolved
+    from the preceding ``CreateTestStep``.
+    """
+    if not log_path.exists():
+        return []
+    id_to_name: dict[str, str] = {}
+    events: list[tuple[str, str, TestStatus]] = []
+    for request_type, response_id, json_str in iter_log_data_lines(log_path):
+        test_step = json.loads(json_str).get("testStep", {})
+        status = _status(test_step.get("status"))
+        if request_type == "CreateTestStep" and response_id:
+            name = test_step.get("name", "")
+            id_to_name[response_id] = name
+            events.append((request_type, name, status))
+        elif request_type == "UpdateTestStep":
+            events.append((request_type, id_to_name.get(test_step.get("testStepId"), ""), status))
+    return events
+
+
 def load_steps(log_path: Path) -> list[dict]:
     """Load the offline log as a list of step records keyed by hierarchy fields.
 
@@ -144,6 +178,9 @@ def load_steps(log_path: Path) -> list[dict]:
             "name": s.name,
             "parent_step_id": s.parent_step_id,
             "step_path": s.step_path,
+            "statuses": s.statuses,
+            "start_time": s.start_time,
+            "end_time": s.end_time,
         }
         for s in parse_log(log_path).values()
     ]
diff --git a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
index 7e366a512..cbd748c53 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
+++ b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
@@ -6,10 +6,13 @@ Reference for the pass/fail scenarios covered by
 [`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
 
 `TestStatus` values come from `sift_client.sift_types.test_report.TestStatus`:
-`PASSED`, `FAILED`, `ERROR`, `SKIPPED`, `ABORTED`, `IN_PROGRESS`. Hard process
-exits the plugin can observe (`SystemExit`, `KeyboardInterrupt` when pytest
-delivers a call-phase report) map to `ABORTED`. A session-aborting interrupt
-that fires before the plugin sees it leaves the step in `IN_PROGRESS`.
+`PASSED`, `FAILED`, `ERROR`, `SKIPPED`, `ABORTED`, `IN_PROGRESS`. Hard exits map
+to `ABORTED`, resolved during fixture teardown: from the call-phase report when
+there is one (`SystemExit`), or, when a `KeyboardInterrupt` aborts the session
+before that report, inferred from setup having completed with no call outcome.
+The status reaches the report only because pytest runs finalizers as it unwinds;
+a step keeps the `IN_PROGRESS` it was created with only if the process is killed
+before those finalizers run.
 
 ## Case ID scheme
 
@@ -36,7 +39,7 @@ be traced back to its row here without rereading the scenario:
 | `CALL-03` | Generic exception in call phase | `raise ValueError("boom")`           | `ERROR`                                                                                                  |
 | `CALL-04` | `pytest.fail("...")` from body  | `pytest.fail("intentional failure")` | `FAILED`                                                                                                 |
 | `CALL-05` | `SystemExit` from the test body | `sys.exit(1)`                        | `ABORTED`                                                                                                |
-| `CALL-06` | `KeyboardInterrupt` in body     | `raise KeyboardInterrupt`            | `IN_PROGRESS` — session aborts before the plugin sees the interrupt; `ABORTED` if the plugin does see it |
+| `CALL-06` | `KeyboardInterrupt` in body     | `raise KeyboardInterrupt`            | `ABORTED` — the session aborts before a call-phase report, but fixture teardown still runs, so the cut-off step resolves to `ABORTED` rather than staying `IN_PROGRESS` |
 | `CALL-07` | Substep raises non-Assertion exception | `with step.substep(...): raise ValueError("boom")` | Substep `ERROR`, test step `FAILED` (child-failed signal outranks the propagating exception) |
 
 ## Skip paths
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
index 39ee0ccf6..18b03c194 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -12,17 +12,36 @@
 
 from __future__ import annotations
 
+from datetime import datetime, timezone
 from textwrap import dedent
+from types import SimpleNamespace
 from typing import TYPE_CHECKING
 
 import pytest
 
 from sift_client._tests.pytest_plugin import _step_status_capture as capture
+from sift_client.sift_types.test_report import TestStatus
 
 if TYPE_CHECKING:
     from pathlib import Path
 
 
+def _parse_ts(ts: str) -> datetime:
+    """Parse a protobuf-JSON RFC3339 timestamp across Python 3.8-3.14.
+
+    ``datetime.fromisoformat`` only accepts ``Z`` / arbitrary fractional digits
+    on 3.11+, so parse the second-precision base with ``strptime`` and apply the
+    fractional part by hand (protobuf emits 0/3/6/9 digits).
+    """
+    body = ts.rstrip("Z").split("+", 1)[0]
+    base, _, frac = body.partition(".")
+    # All Sift timestamps are UTC; tag it so comparisons stay unambiguous.
+    parsed = datetime.strptime(base, "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
+    if frac:
+        parsed = parsed.replace(microsecond=int(frac.ljust(6, "0")[:6]))
+    return parsed
+
+
 _INNER_CONFTEST = 'pytest_plugins = ["sift_client.pytest_plugin"]\n'
 
 
@@ -86,6 +105,42 @@ def test_b(self):
     assert by_name["test_b"][0]["parent_step_id"] == class_id
 
 
+def test_collection_skipped_method_nests_under_its_class(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A collection-time skipped method nests under its class parent.
+
+    ``@pytest.mark.skip`` is evaluated before the autouse fixtures run, so the
+    skipped item's step comes from the makereport hook rather than the ``step``
+    fixture. The report-tree parents are kept off the step stack, so that inline step
+    must still resolve and attach to the class parent rather than the report root.
+    Order is pinned so the non-skipped sibling opens the class first.
+    """
+    pytester.makepyfile(
+        test_skip_nest=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                def test_run(self):
+                    pass
+
+                @pytest.mark.skip(reason="x")
+                def test_skipped(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=1, skipped=1)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert len(by_name["TestFoo"]) == 1
+    class_id = by_name["TestFoo"][0]["id"]
+    assert by_name["test_run"][0]["parent_step_id"] == class_id
+    assert by_name["test_skipped"][0]["parent_step_id"] == class_id
+    assert by_name["test_skipped"][0]["statuses"][-1] == TestStatus.SKIPPED
+
+
 def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, log_file: Path) -> None:
     pytester.makepyfile(
         test_nested=dedent(
@@ -264,7 +319,7 @@ def test_a(self):
     assert _ancestor_names(steps, leaf)[:3] == ["test_a", "TestFoo", "test_doc.py"]
 
 
-def test_transition_between_class_chains_drains_parametrize(
+def test_two_class_chains_keep_parametrize_isolated(
     pytester: pytest.Pytester, log_file: Path
 ) -> None:
     pytester.makepyfile(
@@ -309,62 +364,67 @@ def test_y(self, w):
 # ---------------------------------------------------------------------------
 
 
-def test_drain_step_stack_continues_past_failing_exit() -> None:
-    """Lenient mode: a misbehaving ``__exit__`` must not block the rest of the stack."""
-    from sift_client._internal.pytest_plugin.steps import drain_step_stack
-    from sift_client.pytest_plugin import SiftPytestStepDrainWarning
+class _FakeParent:
+    """Minimal stand-in for an open ``NewStep`` parent in the plugin registries."""
 
-    class _Good:
-        def __init__(self) -> None:
-            self.closed = False
+    def __init__(self, name: str, step_path: str, *, raises: str | None = None) -> None:
+        self.current_step = SimpleNamespace(name=name, step_path=step_path)
+        self._raises = raises
+        self.closed = False
 
-        def __exit__(self, *_: object) -> None:
-            self.closed = True
+    def __exit__(self, *_: object) -> None:
+        if self._raises is not None:
+            raise RuntimeError(self._raises)
+        self.closed = True
 
-    class _Bad:
-        def __exit__(self, *_: object) -> None:
-            raise RuntimeError("boom")
 
-    g1, g2, bad = _Good(), _Good(), _Bad()
-    stack: list[tuple[str, object]] = [("g1", g1), ("bad", bad), ("g2", g2)]
-    with pytest.warns(SiftPytestStepDrainWarning, match="boom"):
-        drain_step_stack(stack)
-    assert stack == []
-    assert g1.closed
-    assert g2.closed
-
-
-def test_drain_step_stack_strict_drains_fully_then_raises() -> None:
-    """Strict mode: drain every frame, then raise with the FIRST failure chained."""
-    from sift_client._internal.pytest_plugin.steps import drain_step_stack
-    from sift_client.pytest_plugin import SiftPytestStepDrainError
+@pytest.fixture
+def clean_parent_registries():
+    """Save/restore the module-level parent registries and REPORT_CONTEXT.
 
-    class _Good:
-        def __init__(self) -> None:
-            self.closed = False
+    The ``finalize_parents`` resilience test pokes the globals directly, so
+    isolate them from any real session state. Registries and ``finalize_parents``
+    live in ``_internal.pytest_plugin.steps``; ``REPORT_CONTEXT`` is the public
+    session global on ``sift_client.pytest_plugin``.
+    """
+    from sift_client import pytest_plugin
+    from sift_client._internal.pytest_plugin import steps
 
-        def __exit__(self, *_: object) -> None:
-            self.closed = True
+    saved = (
+        dict(steps.hierarchy_parents),
+        dict(steps.parametrize_parents),
+        pytest_plugin.REPORT_CONTEXT,
+    )
+    steps.hierarchy_parents.clear()
+    steps.parametrize_parents.clear()
+    pytest_plugin.REPORT_CONTEXT = None  # skip the end_time override lookup
+    try:
+        yield steps
+    finally:
+        steps.hierarchy_parents.clear()
+        steps.hierarchy_parents.update(saved[0])
+        steps.parametrize_parents.clear()
+        steps.parametrize_parents.update(saved[1])
+        pytest_plugin.REPORT_CONTEXT = saved[2]
+
+
+def test_finalize_parents_continues_past_failing_exit(clean_parent_registries) -> None:
+    """Lenient mode: a misbehaving parent ``__exit__`` must not block the others."""
+    from sift_client.pytest_plugin import SiftPytestStepDrainWarning
 
-    class _Bad:
-        def __init__(self, label: str) -> None:
-            self.label = label
+    steps = clean_parent_registries
+    good = _FakeParent("good", "1")
+    bad = _FakeParent("bad", "1.1", raises="boom")
+    steps.hierarchy_parents["good"] = good
+    steps.parametrize_parents[("t", "bad")] = bad
 
-        def __exit__(self, *_: object) -> None:
-            raise RuntimeError(f"boom-{self.label}")
+    with pytest.warns(SiftPytestStepDrainWarning, match="boom"):
+        steps.finalize_parents()
 
-    g, b1, b2 = _Good(), _Bad("first"), _Bad("second")
-    # Stack drains LIFO: pop order is b2, b1, g. So b2's failure is the first
-    # one collected and surfaces in __cause__.
-    stack: list[tuple[str, object]] = [("g", g), ("b1", b1), ("b2", b2)]
-    with pytest.raises(SiftPytestStepDrainError, match="2 step.*'b2'") as exc_info:
-        drain_step_stack(stack, swallow_errors=False)
-    # Stack fully drained even though it raised.
-    assert stack == []
-    assert g.closed
-    # Original exception chained for debuggability.
-    assert isinstance(exc_info.value.__cause__, RuntimeError)
-    assert "boom-second" in str(exc_info.value.__cause__)
+    assert good.closed
+    # Registries cleared regardless of the per-parent failure.
+    assert steps.hierarchy_parents == {}
+    assert steps.parametrize_parents == {}
 
 
 def test_failing_test_in_class_does_not_orphan_class_step(
@@ -858,3 +918,452 @@ def test_chain(a, b):
     chain = _ancestor_names(steps, leaf)
     # leaf b=… → a=… → test_chain → test_chain.py (module step) → root
     assert chain == ["b='x'", "a=1", "test_chain", "test_chain.py"]
+
+
+# ---------------------------------------------------------------------------
+# Order independence
+# ---------------------------------------------------------------------------
+
+
+def test_interleaved_execution_does_not_duplicate_parents(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Sibling methods need not run contiguously to share one class parent.
+
+    A conftest hook interleaves the two classes' methods
+    (``A::a1, B::b1, A::a2, B::b2``) — the order the removed sort used to
+    forbid, and the order pytest's own fixture-scope reordering can produce.
+    Each class must still open exactly once and every method parent to the
+    right class.
+    """
+    # Overwrite the conftest with one that registers the plugin AND reorders
+    # items so the two classes interleave. The log_file fixture's pytest.ini
+    # (offline + log path) still applies.
+    pytester.makeconftest(
+        dedent(
+            """
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+            def pytest_collection_modifyitems(config, items):
+                a = [i for i in items if "TestA::" in i.nodeid]
+                b = [i for i in items if "TestB::" in i.nodeid]
+                interleaved = []
+                for x, y in zip(a, b):
+                    interleaved.append(x)
+                    interleaved.append(y)
+                items[:] = interleaved
+            """
+        )
+    )
+    pytester.makepyfile(
+        test_inter=dedent(
+            """
+            class TestA:
+                def test_a1(self):
+                    pass
+
+                def test_a2(self):
+                    pass
+
+            class TestB:
+                def test_b1(self):
+                    pass
+
+                def test_b2(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=4)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # Each class opens exactly once despite the interleaved run order.
+    assert len(by_name["TestA"]) == 1
+    assert len(by_name["TestB"]) == 1
+    a_id = by_name["TestA"][0]["id"]
+    b_id = by_name["TestB"][0]["id"]
+    assert by_name["test_a1"][0]["parent_step_id"] == a_id
+    assert by_name["test_a2"][0]["parent_step_id"] == a_id
+    assert by_name["test_b1"][0]["parent_step_id"] == b_id
+    assert by_name["test_b2"][0]["parent_step_id"] == b_id
+
+
+# ---------------------------------------------------------------------------
+# Parent status resolution
+# ---------------------------------------------------------------------------
+
+
+def test_parent_status_passed_when_all_children_pass(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_ok=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert by_name["TestFoo"][0]["statuses"][-1] == TestStatus.PASSED
+    assert by_name["test_ok.py"][0]["statuses"][-1] == TestStatus.PASSED
+
+
+def test_parent_status_failed_propagates_up_and_isolates_siblings(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A failing leaf marks its class and the module FAILED, but a sibling class
+    whose tests all pass stays PASSED.
+    """
+    pytester.makepyfile(
+        test_fail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2, failed=1)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert by_name["TestFoo"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["test_fail.py"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["TestBar"][0]["statuses"][-1] == TestStatus.PASSED
+
+
+def test_parent_status_failure_propagates_through_parametrize(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """One failing parametrization fails the whole chain: parametrize parent →
+    class → module.
+    """
+    pytester.makepyfile(
+        test_pfail=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    if v == 1:
+                        raise AssertionError("boom")
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1, failed=1)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert by_name["test_a"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["TestFoo"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["test_pfail.py"][0]["statuses"][-1] == TestStatus.FAILED
+
+
+def test_parent_opens_in_progress_and_resolves_exactly_once(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A parent is created IN_PROGRESS and gets exactly one terminal status at
+    its last child's finish — never resolved early and reopened as later children run.
+
+    This locks in the "stay in-progress until every child is done, then resolve
+    once" behavior: a parent emits a CreateTestStep (IN_PROGRESS) and a single
+    UpdateTestStep (terminal), so its status timeline is exactly two entries.
+    """
+    pytester.makepyfile(
+        test_once=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    # Created in-progress, resolved once — no intermediate churn, no reopen.
+    assert by_name["TestFoo"][0]["statuses"] == [TestStatus.IN_PROGRESS, TestStatus.PASSED]
+    assert by_name["test_once.py"][0]["statuses"] == [TestStatus.IN_PROGRESS, TestStatus.PASSED]
+
+
+# ---------------------------------------------------------------------------
+# Parent timing
+# ---------------------------------------------------------------------------
+
+
+def test_parent_timing_spans_its_children(pytester: pytest.Pytester, log_file: Path) -> None:
+    """A parent's [start, end] window covers its whole subtree: it starts no
+    later than its first child and ends exactly at its last child's finish.
+    """
+    pytester.makepyfile(
+        test_span=dedent(
+            """
+            import time
+
+            class TestFoo:
+                def test_a(self):
+                    time.sleep(0.02)
+
+                def test_b(self):
+                    time.sleep(0.02)
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    klass = by_name["TestFoo"][0]
+    module = by_name["test_span.py"][0]
+    leaves = [by_name["test_a"][0], by_name["test_b"][0]]
+    leaf_starts = [_parse_ts(leaf["start_time"]) for leaf in leaves]
+    leaf_ends = [_parse_ts(leaf["end_time"]) for leaf in leaves]
+
+    # Parent opened before (or with) its earliest child, and start precedes end.
+    assert _parse_ts(klass["start_time"]) <= min(leaf_starts)
+    assert _parse_ts(klass["start_time"]) <= _parse_ts(klass["end_time"])
+    # Parent end is exactly the latest descendant finish — not a session-end stamp.
+    assert _parse_ts(klass["end_time"]) == max(leaf_ends)
+    # The module parent spans the class and rolls the same finish up a level.
+    assert _parse_ts(module["start_time"]) <= _parse_ts(klass["start_time"])
+    assert _parse_ts(module["end_time"]) == max(leaf_ends)
+
+
+def test_parent_end_time_reflects_a_later_child_under_interleaving(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """When a parent's children run non-contiguously, its end_time tracks the
+    LAST child to finish — even one that runs after a different parent's child.
+
+    Execution order is pinned to ``a1, b1, a2`` via a conftest hook, so
+    ``TestA``'s second child (``a2``) closes after ``TestB``'s child. ``TestA``
+    must end at ``a2``'s finish, not ``a1``'s.
+    """
+    pytester.makeconftest(
+        dedent(
+            """
+            pytest_plugins = ["sift_client.pytest_plugin"]
+            import pytest
+
+            _ORDER = ["test_a1", "test_b1", "test_a2"]
+
+            @pytest.hookimpl(trylast=True)
+            def pytest_collection_modifyitems(config, items):
+                # trylast so this runs after any reordering plugin and wins.
+                items.sort(key=lambda i: _ORDER.index(i.name) if i.name in _ORDER else 99)
+            """
+        )
+    )
+    pytester.makepyfile(
+        test_il=dedent(
+            """
+            import time
+
+            class TestA:
+                def test_a1(self):
+                    pass
+
+                def test_a2(self):
+                    time.sleep(0.02)
+
+            class TestB:
+                def test_b1(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=3)
+    by_name = _by_name(capture.load_steps(log_file))
+    a_end = by_name["TestA"][0]["end_time"]
+    a1_end = by_name["test_a1"][0]["end_time"]
+    a2_end = by_name["test_a2"][0]["end_time"]
+    # TestA ends at its later child (a2), not the one that happened to run first.
+    assert a_end == a2_end
+    assert a_end != a1_end
+
+
+# ---------------------------------------------------------------------------
+# Early close — parents resolve as soon as their descendants finish
+# ---------------------------------------------------------------------------
+
+
+def _index(
+    events: list[tuple],
+    request_type: str,
+    name: str,
+    *,
+    terminal: bool = False,
+    status: TestStatus | None = None,
+) -> int:
+    """Index of the first matching log event.
+
+    ``status`` matches that exact status; ``terminal`` matches any resolved
+    (non-``IN_PROGRESS``) status.
+    """
+
+    def matches(rt: str, nm: str, st: TestStatus) -> bool:
+        if rt != request_type or nm != name:
+            return False
+        if status is not None:
+            return st == status
+        return not terminal or st != TestStatus.IN_PROGRESS
+
+    return next(i for i, (rt, nm, st) in enumerate(events) if matches(rt, nm, st))
+
+
+_INTERLEAVE_CONFTEST = """
+pytest_plugins = ["sift_client.pytest_plugin"]
+import pytest
+
+_ORDER = ["test_a1", "test_b1", "test_a2"]
+
+@pytest.hookimpl(trylast=True)
+def pytest_collection_modifyitems(config, items):
+    # trylast so this wins over any reordering plugin; pins A::a1, B::b1, A::a2.
+    items.sort(key=lambda i: _ORDER.index(i.name) if i.name in _ORDER else 99)
+"""
+
+
+def test_parent_closes_mid_session_not_at_end(pytester: pytest.Pytester, log_file: Path) -> None:
+    """A container resolves as soon as its last child finishes — before the next
+    container even opens — rather than all flipping at session end.
+    """
+    pytester.makepyfile(
+        test_mid=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=3)
+    events = capture.log_events(log_file)
+    # TestFoo reaches a terminal status before TestBar is even created.
+    assert _index(events, "UpdateTestStep", "TestFoo", terminal=True) < _index(
+        events, "CreateTestStep", "TestBar"
+    )
+
+
+def test_failing_parent_resolves_failed_mid_session(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Early close carries status too: a class with a failing test resolves FAILED
+    as soon as its subtree finishes, before the next class opens.
+    """
+    pytester.makepyfile(
+        test_midfail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=1, failed=1)
+    events = capture.log_events(log_file)
+    foo_failed = _index(events, "UpdateTestStep", "TestFoo", status=TestStatus.FAILED)
+    assert foo_failed < _index(events, "CreateTestStep", "TestBar")
+
+
+def test_close_is_completion_driven_not_order_driven(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A single-child container closes the moment that child finishes, even though
+    a sibling container's test (collected earlier) runs afterward.
+
+    Order is pinned to ``a1, b1, a2``: ``TestB`` (only child ``b1``) must resolve
+    before ``test_a2`` runs, proving close is driven by descendant completion, not
+    by reaching some position in the item list.
+    """
+    pytester.makeconftest(_INTERLEAVE_CONFTEST)
+    pytester.makepyfile(
+        test_cd=dedent(
+            """
+            class TestA:
+                def test_a1(self):
+                    pass
+
+                def test_a2(self):
+                    pass
+
+            class TestB:
+                def test_b1(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=3)
+    events = capture.log_events(log_file)
+    # TestB resolves before test_a2 is even created.
+    assert _index(events, "UpdateTestStep", "TestB", terminal=True) < _index(
+        events, "CreateTestStep", "test_a2"
+    )
+
+
+def test_excluded_sibling_does_not_stall_parent_close(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A ``sift_exclude``-d method is not counted toward its class's descendants,
+    so the class still closes promptly once its included tests finish.
+
+    If the excluded test inflated the count, ``TestFoo`` could never reach zero
+    and would only resolve at the session-end drain — i.e. after ``TestBar`` is
+    created. Asserting it resolves *before* ``TestBar`` proves the gate filter.
+    """
+    pytester.makepyfile(
+        test_excl_close=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.sift_exclude
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=3)
+    events = capture.log_events(log_file)
+    assert _index(events, "UpdateTestStep", "TestFoo", terminal=True) < _index(
+        events, "CreateTestStep", "TestBar"
+    )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
index 52aa6f23c..112ef4055 100644
--- a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -61,6 +61,12 @@ def _run(pytester, body: str) -> None:
         "--sift-offline",
         f"--sift-log-file={log_path}",
         "--no-sift-git-metadata",
+        # Pin the inner session to definition order so ``test_sift_warmup`` runs
+        # before a marker-skipped ``test_x`` (see ``_WARMUP``). ``-p no:randomly``
+        # is a no-op when pytest-randomly isn't installed, and keeps these tests
+        # deterministic when it is.
+        "-p",
+        "no:randomly",
     )
 
 
@@ -180,13 +186,13 @@ def test_x(step):
     assert capture.final_status("test_x") == TestStatus.PASSED
 
 
-def test_keyboard_interrupt_leaves_step_in_progress(inner):
+def test_keyboard_interrupt_resolves_step_to_aborted(inner):
     # Case: CALL-06
     # KeyboardInterrupt aborts the session before the call-phase makereport
-    # fires; the plugin can't observe the interrupt. The contract is that
-    # the step is left in IN_PROGRESS rather than being silently resolved
-    # to PASSED — a session-aborting interrupt should not look like a clean
-    # pass in the report.
+    # fires; the plugin can't observe the interrupt directly. Setup completed
+    # but no call outcome was seen, so the step resolves to ABORTED rather than
+    # being left IN_PROGRESS (a finalized report should not carry a step that
+    # still reads as in-progress) or coerced to PASSED.
     try:
         _run(
             inner,
@@ -199,7 +205,7 @@ def test_x():
         pass
     outer = capture.test_step("test_x")
     assert outer is not None
-    assert outer.statuses[-1] == TestStatus.IN_PROGRESS
+    assert outer.statuses[-1] == TestStatus.ABORTED
 
 
 def test_substep_exception_records_error_with_failed_parent(inner):
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
index a381d78cd..7e4c3c120 100644
--- a/python/lib/sift_client/pytest_plugin.py
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -47,12 +47,13 @@
 from sift_client._internal.pytest_plugin.steps import (
     build_hierarchy_chain,
     build_parametrize_path,
-    drain_hierarchy_stack,
-    drain_parametrize_stack,
+    finalize_parents,
+    get_or_create_parent_chain,
     hierarchy_key,
     parametrize_path_key,
-    reconcile_hierarchy,
-    reconcile_parametrize,
+    release_finished_leaf,
+    resolve_parent_chain_in_context,
+    tally_expected_parents,
 )
 from sift_client._internal.pytest_plugin.terminal import (
     maybe_open_report,
@@ -71,7 +72,6 @@
     "NewStep",
     "ReportContext",
     "SiftPytestPluginWarning",
-    "SiftPytestStepDrainError",
     "SiftPytestStepDrainWarning",
     "client_has_connection",
     "report_context",
@@ -90,18 +90,14 @@ class SiftPytestPluginWarning(SiftWarning):
 
 
 class SiftPytestStepDrainWarning(SiftPytestPluginWarning):
-    """A step's ``__exit__`` raised while the plugin was draining its stack.
+    """A parent step's ``__exit__`` raised while the plugin was closing it.
 
-    Surfaced at module-teardown or session-end so the drain can continue and
-    pytest test outcomes stay unaffected; the underlying exception is included
-    in the message for debugging.
+    Surfaced when a parent step is closed (early as its subtree finishes, or at
+    session end) so the close can continue and pytest test outcomes stay
+    unaffected; the underlying exception is included in the message for debugging.
     """
 
 
-class SiftPytestStepDrainError(RuntimeError):
-    """Raised when mid-session drain fails, signaling a likely upstream invariant break."""
-
-
 # ---------------------------------------------------------------------------
 # Public session state and stash keys.
 # ---------------------------------------------------------------------------
@@ -220,10 +216,9 @@ def report_context(
 
     The fixture is no longer autouse; it's instantiated on the first call
     to ``request.getfixturevalue("report_context")``, which today happens
-    inside the gated ``step``, ``_hierarchy_parents``, and
-    ``_parametrize_parents`` fixtures. If every test in the session is
-    excluded via the marker gate, this fixture is never resolved and no
-    ReportContext (or teardown subprocess) is created.
+    inside the gated ``step`` and ``_sift_parents`` fixtures. If every test in
+    the session is excluded via the marker gate, this fixture is never resolved
+    and no ReportContext (or teardown subprocess) is created.
 
     What gets yielded depends on the mode:
 
@@ -274,7 +269,7 @@ def report_context(
 def step(
     request: pytest.FixtureRequest,
     pytestconfig: pytest.Config,
-    _parametrize_parents: None,
+    _sift_parents: None,
 ) -> Generator[NewStep | None, None, None]:
     """Create an outer step for the function when the Sift gate is on.
 
@@ -294,39 +289,24 @@ def step(
 
 
 @pytest.fixture(autouse=True)
-def _hierarchy_parents(
+def _sift_parents(
     request: pytest.FixtureRequest,
     pytestconfig: pytest.Config,
 ) -> None:
-    """Open/close hierarchy parent steps (packages, modules, classes) for the current item.
-
-    Gated off when the item is excluded (avoids eager ``report_context`` setup);
-    otherwise delegates to ``reconcile_hierarchy``, which diffs the item's
-    ancestor chain against the open stack and opens/closes parents to match.
-    """
-    if not gate_enabled(request.node, pytestconfig):
-        return
-    reconcile_hierarchy(request, pytestconfig)
+    """Resolve (get-or-create) the report-tree parent for the current item.
 
+    Builds the item's hierarchy (packages / modules / classes) and parametrize
+    parents via ``get_or_create_parent_chain`` and stashes the innermost one on
+    the node as ``_sift_parent`` for the ``step`` fixture to nest the leaf under.
+    Parents are keyed by identity and reused across sibling items in any order, so
+    no reordering of test items is needed.
 
-@pytest.fixture(autouse=True)
-def _parametrize_parents(
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-    _hierarchy_parents: None,
-) -> None:
-    """Open/close shared parametrize parent steps for the current item.
-
-    Ordered after ``_hierarchy_parents`` so parametrize parents nest inside the
-    hierarchy ones. Gated off when the item is excluded (so excluded items don't
-    eagerly request ``report_context``); otherwise delegates to
-    ``reconcile_parametrize``, which also no-ops when
-    ``sift_parametrize_nesting=false``. Parents persist until a later test's
-    chain pops them, or until ``pytest_sessionfinish`` drains the rest.
+    Gated off when the item is excluded so excluded items never eagerly create
+    ``report_context`` (preserving its lazy, first-gated-test creation).
     """
     if not gate_enabled(request.node, pytestconfig):
         return
-    reconcile_parametrize(request, pytestconfig)
+    request.node._sift_parent = get_or_create_parent_chain(request.node, pytestconfig, request)
 
 
 # ---------------------------------------------------------------------------
@@ -359,28 +339,32 @@ def pytest_configure(config: pytest.Config) -> None:
     warn_on_unknown_toml_keys(config)
 
 
-def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
-    """Stash each item's class chain + parametrize path and cluster siblings.
+def pytest_itemcollected(item: pytest.Item) -> None:
+    """Cache each test item's hierarchy chain and parametrize path at collection.
+
+    This is a per-item hook, not ``pytest_collection_modifyitems`` — the plugin
+    never touches the ``items`` list or its order, so it cannot conflict with a
+    user's (or another plugin's) collection-ordering hook. The report tree is
+    built from an identity-keyed registry (see ``get_or_create_parent_chain``),
+    so item order is irrelevant to nesting; ``pytest-randomly``,
+    ``pytest-ordering``, and pytest's own fixture-scope reordering are all
+    preserved untouched.
 
-    Sorts by ``(file_path, hierarchy_chain, parametrize_path)`` so sibling
-    items under a shared parent (package, module, class, or parametrize axis)
-    stay contiguous; otherwise a free function sorting between two class
-    methods would tear down + re-open the class step, producing duplicate
-    parents in the report tree.
+    The stash is a cache the autouse fixtures read back; both keys have an
+    on-demand recompute fallback, so an item a later hook injects without going
+    through this hook still resolves correctly.
     """
-    for item in items:
-        item.stash[hierarchy_key] = build_hierarchy_chain(item, config)
-        item.stash[parametrize_path_key] = build_parametrize_path(item)
-    # Use ``.get(...)`` defensively: a third-party hook may inject items after
-    # our stashing loop runs, and we'd rather sort them at the tail than
-    # KeyError out of collection.
-    items.sort(
-        key=lambda i: (
-            str(i.path),
-            tuple(identity for identity, _, _, _ in i.stash.get(hierarchy_key, ())),
-            i.stash.get(parametrize_path_key, ()),
-        )
-    )
+    item.stash[hierarchy_key] = build_hierarchy_chain(item, item.config)
+    item.stash[parametrize_path_key] = build_parametrize_path(item)
+
+
+def pytest_collection_finish(session: pytest.Session) -> None:
+    """Tally each parent's descendant leaves so parents can close mid-session.
+
+    Delegates to ``tally_expected_parents``; runs after deselection so the counts
+    reflect only the selected, gated-in items. See ``release_finished_leaf``.
+    """
+    tally_expected_parents(session)
 
 
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
@@ -407,25 +391,40 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
         and report.outcome == "skipped"
         and getattr(item, "_sift_step", None) is None
     ):
-        with REPORT_CONTEXT.new_step(name=item.name) as inline_step:
+        # Nest the inline step under the same registry parents a running sibling
+        # would use. The autouse ``_sift_parents`` fixture never ran for a
+        # marker-skipped item, and the report-tree parents live off the step
+        # stack, so without resolving the parent here the step lands at the
+        # report root instead of under its module/class.
+        parent_ns = resolve_parent_chain_in_context(item, item.config, REPORT_CONTEXT)
+        parent_step = parent_ns.current_step if parent_ns is not None else None
+        with REPORT_CONTEXT.new_step(name=item.name, parent=parent_step) as inline_step:
             inline_step.current_step.update({"status": TestStatus.SKIPPED})
 
     if report.when == "teardown":
         finalize_after_teardown(item, report)
 
 
+def pytest_runtest_logfinish(nodeid: str, location: tuple[str, int | None, str]) -> None:
+    """Close report-tree parents whose subtree finished with this item.
+
+    Fires once per item (pass / fail / skip / error); delegates to
+    ``release_finished_leaf``, which decrements the item's parents' remaining-leaf
+    counts and closes any that reach zero — so containers resolve progressively
+    rather than all at session end.
+    """
+    release_finished_leaf(nodeid)
+
+
 def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
-    """Drain any parent steps still open at session end (innermost first).
+    """Close any report-tree parents still open at session end (innermost first).
 
-    Wrapped so a failure in the inner drain does not prevent the outer one
-    from running. With ``module_substep`` removed, this is the sole place
-    where hierarchy parents close; they persist across all tests and only
-    drain when the session ends.
+    Normally a no-op: ``report_context_impl`` finalizes the parents inside the
+    ``ReportContext`` block so their updates reach the log before the import
+    worker drains, and most parents already closed early as their subtrees
+    finished. This is the idempotent backstop for anything still open.
     """
-    try:
-        drain_parametrize_stack()
-    finally:
-        drain_hierarchy_stack()
+    finalize_parents()
 
 
 def pytest_report_header(config: pytest.Config) -> str | None:
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 5cd2c6729..84b97dab8 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -44,6 +44,13 @@
 
 logger = logging.getLogger(__name__)
 
+# Sentinel for ``create_step``/``new_step``'s ``parent`` argument. Distinguishes
+# "parent omitted -> use the top of the step stack" (the default, linear
+# behavior) from an explicit ``parent=None`` (create at the report root). The
+# pytest plugin passes an explicit parent so its report tree does not depend on
+# execution order; everyday ``new_step``/``substep`` callers omit it.
+_USE_STACK_TOP = object()
+
 
 def format_truncated_traceback(
     exc: type[BaseException] | None,
@@ -139,8 +146,18 @@ class ReportContext(AbstractContextManager):
     log_file: Path | None
     step_is_open: bool
     step_stack: list[TestStep]
-    step_number_at_depth: dict[int, int]
+    # Per-parent child counter keyed by the parent's ``step_path`` (``""`` is the
+    # root bucket). Drives parent-relative path numbering so two parents at the
+    # same depth never collide: a step's number depends only on how many children
+    # its own parent already has, not on steps created under other parents.
+    child_counts: dict[str, int]
     open_step_results: dict[str, bool]
+    # Latest child ``end_time`` seen for each parent, keyed by the parent's
+    # ``step_path``. A parent that stays open across the whole run (e.g. a
+    # hierarchy/parametrize parent the pytest plugin holds in its registry) is
+    # closed with this time, so its duration spans first-child-start to
+    # last-descendant-finish rather than wall-clock at session end.
+    parent_end_times: dict[str, datetime]
     any_failures: bool
     # Every step created in this report (including hierarchy/parametrize
     # parents), retained after close so end-of-run summaries can tally final
@@ -204,8 +221,9 @@ def __init__(
         self.replay_log_file = replay_log_file
         self.step_is_open = False
         self.step_stack = []
-        self.step_number_at_depth = {}
+        self.child_counts = {}
         self.open_step_results = {}
+        self.parent_end_times = {}
         self.any_failures = False
         self.created_steps = []
         self.created_measurements = []
@@ -368,29 +386,53 @@ def new_step(
         description: str | None = None,
         assertion_as_fail_not_error: bool = True,
         metadata: dict[str, str | float | bool] | None = None,
+        *,
+        parent: TestStep | None | object = _USE_STACK_TOP,
+        push: bool = True,
     ) -> NewStep:
-        """Alias to return a new step context manager from this report context. Use create_step for actually creating a TestStep in the current context."""
+        """Alias to return a new step context manager from this report context. Use create_step for actually creating a TestStep in the current context.
+
+        ``parent`` and ``push`` default to the linear, stack-based behavior used
+        by everyday callers. The pytest plugin passes an explicit ``parent`` with
+        ``push=False`` to open report-tree parents that persist outside the stack;
+        see :meth:`create_step`.
+        """
         return NewStep(
             self,
             name=name,
             description=description,
             assertion_as_fail_not_error=assertion_as_fail_not_error,
             metadata=metadata,
+            parent=parent,
+            push=push,
         )
 
-    def get_next_step_path(self) -> str:
-        """Get the next step path for the current depth."""
-        top_step = self.step_stack[-1] if self.step_stack else None
-        step_path = top_step.step_path if top_step else ""
-        next_step_number = self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        prefix = f"{step_path}." if step_path else ""
-        return f"{prefix}{next_step_number}"
+    def _resolve_parent(self, parent: TestStep | None | object) -> TestStep | None:
+        """Resolve a ``parent`` argument to a concrete parent step (or None for root)."""
+        if parent is _USE_STACK_TOP:
+            return self.step_stack[-1] if self.step_stack else None
+        return parent  # type: ignore[return-value]
+
+    def get_next_step_path(self, parent: TestStep | None | object = _USE_STACK_TOP) -> str:
+        """Preview the path the next step under ``parent`` would get (no side effects).
+
+        Parent-relative: a child's path is ``<parent path>.<nth child>``, or
+        ``<n>`` at the root. Defaults to the top of the step stack so existing
+        callers see the same value the next stacked ``create_step`` will assign.
+        """
+        parent_step = self._resolve_parent(parent)
+        parent_path = parent_step.step_path if parent_step else ""
+        next_number = self.child_counts.get(parent_path, 0) + 1
+        return f"{parent_path}.{next_number}" if parent_path else str(next_number)
 
     def create_step(
         self,
         name: str,
         description: str | None = None,
         metadata: dict[str, str | float | bool] | None = None,
+        *,
+        parent: TestStep | None | object = _USE_STACK_TOP,
+        push: bool = True,
     ) -> TestStep:
         """Create a new step in the report context.
 
@@ -400,12 +442,23 @@ def create_step(
             metadata: [Optional] Structured key/value metadata to attach to the step. For
                 metadata shared across every step in a report, prefer the `metadata` attribute
                 of the enclosing `TestReport`.
+            parent: The parent step to nest under. ``_USE_STACK_TOP`` (the
+                default) parents to the current top of the step stack — the
+                linear behavior. An explicit ``TestStep`` parents under that step
+                regardless of stack state; explicit ``None`` creates a root step.
+            push: Whether to push the new step onto the step stack. True (the
+                default) for leaf/in-test steps so their substeps nest under
+                them. The pytest plugin passes False for hierarchy/parametrize
+                parents, which live in its own registry and would otherwise
+                trap unrelated steps beneath them.
 
         Returns:
             The created step.
         """
-        step_path = self.get_next_step_path()
-        parent_step = self.step_stack[-1] if self.step_stack else None
+        parent_step = self._resolve_parent(parent)
+        parent_path = parent_step.step_path if parent_step else ""
+        next_number = self.child_counts.get(parent_path, 0) + 1
+        step_path = f"{parent_path}.{next_number}" if parent_path else str(next_number)
 
         step = self.client.test_results.create_step(
             TestStepCreate(
@@ -424,10 +477,9 @@ def create_step(
         )
 
         # Update the step tracking structures.
-        self.step_number_at_depth[len(self.step_stack)] = (
-            self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        )
-        self.step_stack.append(step)
+        self.child_counts[parent_path] = next_number
+        if push:
+            self.step_stack.append(step)
         self.open_step_results[step.step_path] = True
         # Retained for end-of-run tallies; never popped (unlike step_stack).
         self.created_steps.append(step)
@@ -473,15 +525,41 @@ def propagate_step_result(self, step: TestStep, status: TestStatus) -> bool:
                 self.open_step_results[".".join(path_parts[:-1])] = False
         return succeeded
 
-    def exit_step(self, step: TestStep):
-        """Exit a step and update the report context."""
-        self.step_number_at_depth[len(self.step_stack)] = 0
-        stack_top = self.step_stack.pop()
-        self.open_step_results.pop(step.step_path)
+    def note_close(self, step: TestStep) -> None:
+        """Record a just-closed step's ``end_time`` against its parent.
 
-        if stack_top.id_ != step.id_:
+        Lets a long-lived parent (one closed later, out of band) adopt the finish
+        time of its latest child instead of wall-clock at its own close. Keyed by
+        the parent's ``step_path`` (the child path minus its last segment).
+        """
+        end_time = step.end_time
+        if end_time is None:
+            return
+        path_parts = step.step_path.split(".")
+        if len(path_parts) <= 1:
+            return
+        parent_path = ".".join(path_parts[:-1])
+        previous = self.parent_end_times.get(parent_path)
+        if previous is None or end_time > previous:
+            self.parent_end_times[parent_path] = end_time
+
+    def exit_step(self, step: TestStep):
+        """Exit a step and update the report context.
+
+        Stacked steps (leaves and their in-test substeps) close in strict LIFO
+        order, so a step that isn't the current top of the stack is a real
+        invariant break. Steps created with an explicit parent and ``push=False``
+        (the pytest plugin's hierarchy/parametrize parents) never sit on the
+        stack and may close in any order — clearing ``open_step_results`` is all
+        that's needed; their result was already propagated to their own parent.
+        """
+        self.open_step_results.pop(step.step_path, None)
+        if self.step_stack and self.step_stack[-1].id_ == step.id_:
+            self.step_stack.pop()
+            return
+        if any(s.id_ == step.id_ for s in self.step_stack):
             raise ValueError(
-                "The popped step was not the top of the stack. This should never happen."
+                "exit_step called out of LIFO order for a stacked step. This should never happen."
             )
 
 
@@ -496,6 +574,9 @@ class NewStep(AbstractContextManager):
     # status was already resolved upstream and ``__exit__`` should skip
     # re-classifying. Read via ``getattr`` so unset is treated as False.
     _sift_managed_externally: bool = False
+    # Set by the pytest plugin when finalizing a long-lived parent so ``__exit__``
+    # stamps its last-descendant finish time instead of wall-clock at close.
+    _sift_end_time_override: datetime | None = None
 
     def __init__(
         self,
@@ -504,6 +585,9 @@ def __init__(
         description: str | None = None,
         assertion_as_fail_not_error: bool = True,
         metadata: dict[str, str | float | bool] | None = None,
+        *,
+        parent: TestStep | None | object = _USE_STACK_TOP,
+        push: bool = True,
     ):
         """Initialize a new step context.
 
@@ -513,10 +597,14 @@ def __init__(
             description: The description of the step.
             assertion_as_fail_not_error: Mark steps with assertion errors as failed instead of error+traceback (some users want assertions to work as simple failures especially when using pytest).
             metadata: [Optional] Structured key/value metadata to attach to the step.
+            parent: Parent step to nest under; see :meth:`ReportContext.create_step`.
+            push: Whether the step joins the step stack; see :meth:`ReportContext.create_step`.
         """
         self.report_context = report_context
         self.client = report_context.client
-        self.current_step = self.report_context.create_step(name, description, metadata=metadata)
+        self.current_step = self.report_context.create_step(
+            name, description, metadata=metadata, parent=parent, push=push
+        )
         self.assertion_as_fail_not_error = assertion_as_fail_not_error
         # Per-step measurement-failure count for ``measurements_passed``.
         # Tracks only direct ``measure*`` calls on this NewStep instance;
@@ -589,6 +677,7 @@ def update_step_from_result(
         exc: type[Exception] | None,
         exc_value: Exception | None,
         tb: traceback.TracebackException | None,
+        end_time: datetime | None = None,
     ) -> bool:
         """Update the step based on its substeps and if there was an exception while executing the step.
 
@@ -596,6 +685,10 @@ def update_step_from_result(
             exc: The class of Exception that was raised.
             exc_value: The exception value.
             tb: The traceback object.
+            end_time: Explicit end_time to stamp. Defaults to now(); the pytest
+                plugin passes the last-child finish time when closing a long-lived
+                parent so its duration reflects its subtree rather than its own
+                late close.
 
         returns: The false if step failed or errored, true otherwise.
         """
@@ -653,10 +746,11 @@ def update_step_from_result(
         current_step.update(
             {
                 "status": status,
-                "end_time": datetime.now(timezone.utc),
+                "end_time": end_time if end_time is not None else datetime.now(timezone.utc),
                 "error_info": error_info,
             },
         )
+        self.report_context.note_close(current_step)
 
         return result
 
@@ -670,20 +764,24 @@ def __exit__(self, exc, exc_value, tb):
             if current_step is None:
                 # The step was never opened; nothing to propagate.
                 return True
+            override = getattr(self, "_sift_end_time_override", None)
             result = self.report_context.propagate_step_result(current_step, current_step.status)
             current_step.update(
                 {
                     "status": current_step.status,
-                    "end_time": datetime.now(timezone.utc),
+                    "end_time": override if override is not None else datetime.now(timezone.utc),
                     "error_info": current_step.error_info,
                 },
             )
+            self.report_context.note_close(current_step)
             self.report_context.exit_step(current_step)
             if hasattr(self, "force_result"):
                 result = self.force_result
             return result
 
-        result = self.update_step_from_result(exc, exc_value, tb)
+        result = self.update_step_from_result(
+            exc, exc_value, tb, end_time=getattr(self, "_sift_end_time_override", None)
+        )
 
         # Now that the step is updated. Let the report context handle removing it from the stack and updating the report context.
         self.report_context.exit_step(self.current_step)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index b04bce6d3..61b2b03d2 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -85,6 +85,7 @@ dev = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     'ruff~=0.12.10',
     'tomlkit~=0.13.3',
@@ -105,6 +106,7 @@ dev-all = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     "rosbags~=0.0 ; python_full_version >= '3.8.2'",
     'ruff~=0.12.10',
@@ -120,6 +122,7 @@ development = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     'ruff~=0.12.10',
     'tomlkit~=0.13.3',
@@ -158,6 +161,7 @@ docs-build = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     "rosbags~=0.0 ; python_full_version >= '3.8.2'",
     'ruff~=0.12.10',
@@ -206,6 +210,9 @@ development = [
     "pytest-benchmark==4.0.0",
     "pytest-mock==3.14.0",
     "pytest-dotenv==0.5.2",
+    # 3.15.0 is the last release supporting Python 3.8; pinned (rather than 4.x,
+    # which needs 3.10+) so randomization is active on the 3.8 CI test job too.
+    "pytest-randomly==3.15.0",
     "ruff~=0.12.10",
     "tomlkit~=0.13.3"
 ]
diff --git a/python/uv.lock b/python/uv.lock
index d6391b311..dc463b99b 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -3615,6 +3615,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f2/3b/b26f90f74e2986a82df6e7ac7e319b8ea7ccece1caec9f8ab6104dc70603/pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f", size = 9863, upload-time = "2024-03-21T22:14:02.694Z" },
 ]
 
+[[package]]
+name = "pytest-randomly"
+version = "3.15.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "importlib-metadata", version = "8.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
+    { name = "importlib-metadata", version = "8.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/d4/6e924a0b2855736d942703dec88dfc98b4fe0881c8fa849b6b0fbb9182fa/pytest_randomly-3.15.0.tar.gz", hash = "sha256:b908529648667ba5e54723088edd6f82252f540cc340d748d1fa985539687047", size = 21743, upload-time = "2023-08-15T18:04:59.857Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/24/d3/00e575657422055c4ea220b2f80e8cc6026ab7130372b7067444d1b0ac10/pytest_randomly-3.15.0-py3-none-any.whl", hash = "sha256:0516f4344b29f4e9cdae8bce31c4aeebf59d0b9ef05927c33354ff3859eeeca6", size = 8685, upload-time = "2023-08-15T18:04:57.913Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -4400,6 +4414,7 @@ dev = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "ruff" },
     { name = "tomlkit" },
 ]
@@ -4427,6 +4442,7 @@ dev-all = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "rosbags", version = "0.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.8.2' and python_full_version < '3.10'" },
     { name = "rosbags", version = "0.11.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "ruff" },
@@ -4444,6 +4460,7 @@ development = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "ruff" },
     { name = "tomlkit" },
 ]
@@ -4489,6 +4506,7 @@ docs-build = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "rosbags", version = "0.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.8.2' and python_full_version < '3.10'" },
     { name = "rosbags", version = "0.11.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "ruff" },
@@ -4628,6 +4646,10 @@ requires-dist = [
     { name = "pytest-mock", marker = "extra == 'dev-all'", specifier = "==3.14.0" },
     { name = "pytest-mock", marker = "extra == 'development'", specifier = "==3.14.0" },
     { name = "pytest-mock", marker = "extra == 'docs-build'", specifier = "==3.14.0" },
+    { name = "pytest-randomly", marker = "extra == 'dev'", specifier = "==3.15.0" },
+    { name = "pytest-randomly", marker = "extra == 'dev-all'", specifier = "==3.15.0" },
+    { name = "pytest-randomly", marker = "extra == 'development'", specifier = "==3.15.0" },
+    { name = "pytest-randomly", marker = "extra == 'docs-build'", specifier = "==3.15.0" },
     { name = "pyyaml", specifier = "~=6.0" },
     { name = "rapidyaml", specifier = "~=0.11" },
     { name = "requests", specifier = "~=2.25" },