diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
index cc9fc06a8..01cd82631 100644
--- a/python/CHANGELOG.md
+++ b/python/CHANGELOG.md
@@ -3,6 +3,38 @@ All notable changes to this project will be documented in this file.
 
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## [v0.17.0] - Unreleased
+
+### What's New
+#### Pytest Plugin
+The client now ships a pytest plugin that turns a pytest run into a `TestReport` in Sift. Register it with a single `pytest_plugins = ["sift_client.pytest_plugin"]` line in your top-level `conftest.py`. Each test function becomes a `TestStep`, measurements appear as rows under that step, and failures roll up through nested substeps to the report. The `step` fixture is autouse, so every test is captured automatically; take `step` as an argument and call `step.measure(...)` to record values against bounds.
+
+Highlights:
+- **Hierarchical report tree.** Packages, modules, classes, and parametrize axes above a test each become a parent step, so the report mirrors your test layout. Arbitrary substeps can be opened inside a test.
+- **Three running modes.** Online (default) pings Sift at session start and streams create/update calls during the run; offline records to a JSONL log for later replay; disabled evaluates bounds locally without contacting Sift. Select with `--sift-offline` or `--sift-disabled`.
+- **Graceful connection handling.** Online mode aborts at session start if Sift is unreachable or credentials are invalid, so a misconfigured job fails fast. If the connection drops mid-run, tests keep running and the log keeps writing locally; remaining entries upload afterward via the import command the plugin prints on exit.
+- **Pass/fail mapping.** Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard exit) maps to a `TestStatus` and propagates to parent steps and the report. `step.measure(...)` returns a pass/fail boolean without raising, so all measurements land in the report even when one fails; `step.pytest_fail_if_step_failed()` fails the test at the end if the step or any descendant failed (out-of-bounds measurements, failed substeps, `report_outcome` failures) without adding assertion noise to `error_info`.
+- **Assertion messages as error info.** Assertion failure messages are reported as the step's error info.
+- **Git metadata.** Repo, branch, and commit are captured on the report automatically.
+- **Terminal output.** The plugin prints a session header with the SDK version and active mode, and an end-of-run `Sift report` panel showing the test case, outcome, step and measurement breakdowns (color-coded), test system/operator, plus a link to the report (online), the saved log and upload command (offline), or a disabled note. Both are suppressed under `-q`. `SiftClient.app_url` exposes the web-app origin; set `sift_app_url` for on-prem or custom deployments. `--sift-open-report` opens the report in a browser at session end.
+- **Configurable report content via `[tool.sift.pytest.report]` and `SIFT_REPORT_*` env vars.** Static defaults (`name`, `test_case`, `test_system_name`, `system_operator`, `serial_number`, `part_number`, and `metadata`) live under `[tool.sift.pytest.report]` in `pyproject.toml`. `name` and `test_case` accept the `{target}`, `{command}`, `{args}`, `{rootdir}`, `{timestamp}`, `{count}`, `{git_repo}`, `{git_branch}`, `{git_commit}` placeholders. `[tool.sift.pytest.report.metadata]` is a TOML table whose typed values land on the report's metadata alongside git fields and the auto-recorded `pytest_command`. For dynamic per-run injection (CI, hardware-bench unit cycling), set `SIFT_REPORT_TEST_SYSTEM_NAME` / `_SYSTEM_OPERATOR` / `_SERIAL_NUMBER` / `_PART_NUMBER` env vars, which pytest-dotenv loads from `.env` for local dev. Env entries win over TOML.
+
+See the [Pytest Plugin guide](https://github.com/sift-stack/sift/blob/main/python/docs/guides/pytest_plugin/index.md) and the runnable quickstart example for full configuration.
+
+### Full Changelog
+- [Pytest plugin improvements](https://github.com/sift-stack/sift/pull/567)
+- [Graceful handling of missing connection](https://github.com/sift-stack/sift/pull/569)
+- [Hierarchical pytest report tree](https://github.com/sift-stack/sift/pull/570)
+- [Pass/fail behavior improvements](https://github.com/sift-stack/sift/pull/568)
+- [Report assertion message as error info](https://github.com/sift-stack/sift/pull/587)
+- [Pytest docs reorganization](https://github.com/sift-stack/sift/pull/589)
+- [Configurable report name template and preserved pytest command](https://github.com/sift-stack/sift/pull/591)
+- [Use in-process transport to improve test performance](https://github.com/sift-stack/sift/pull/590)
+- [End-of-run report summary panel and session header](https://github.com/sift-stack/sift/pull/594)
+- [Exit instead of raise on connection failure](https://github.com/sift-stack/sift/pull/606)
+- [Flexible report naming and consolidated settings registry](https://github.com/sift-stack/sift/pull/602)
+- [Fix incremental upload resume bug](https://github.com/sift-stack/sift/pull/611)
+
 ## [v0.16.2] - May 21, 2026
 
 ### Bugfixes
diff --git a/python/docs/examples/index.md b/python/docs/examples/index.md
index b6a964b35..baf2601e5 100644
--- a/python/docs/examples/index.md
+++ b/python/docs/examples/index.md
@@ -6,7 +6,11 @@ This section contains interactive Jupyter notebook examples demonstrating how to
 
 - **[Basic Usage](basic.ipynb)** - Introduction to the Sift Python client, covering basic operations and API usage
 - **[Data Ingestion](ingestion.ipynb)** - Learn how to ingest telemetry data into Sift using various methods
-- **[Pytest Plugin](pytest_plugin.md)** - Turn a pytest run into a Sift TestReport with measurements, nested steps, and pass/fail outcomes
+- **[Pytest Plugin Quickstart](pytest_plugin_quickstart.md)** - Guided tour of the runnable demo project under `python/examples/pytest_plugin/`
+
+For the conceptual reference on the pytest plugin (fixtures, configuration,
+report structure, and pass/fail behavior), see the
+[Pytest Plugin guide](../guides/pytest_plugin/index.md).
 
 ## Running Examples Locally
 
diff --git a/python/docs/examples/pytest_plugin.md b/python/docs/examples/pytest_plugin.md
index cf56dd75e..986e05e1e 100644
--- a/python/docs/examples/pytest_plugin.md
+++ b/python/docs/examples/pytest_plugin.md
@@ -1,695 +1,14 @@
-# Pytest Plugin
-
-The Sift Python client ships a pytest plugin that turns a pytest run into a
-`TestReport` in Sift. Each test function becomes a `TestStep`, measurements
-land as rows under that step, and failures propagate up through nested
-substeps to the report itself.
-
-This page walks through wiring the plugin into a project, the fixtures and
-hooks it provides, and the patterns you'll use day-to-day.
-
-!!! info "Where the plugin lives"
-    The plugin is part of `sift_client.util.test_results`. It is **not**
-    registered as a `pytest11` entry point. Projects opt in with a
-    `from sift_client.util.test_results import *` in their `conftest.py`.
-    That import is what wires up the fixtures, the CLI options, and the
-    `pytest_runtest_makereport` hook.
-
-## Install
-
-```bash
-pip install sift-stack-py pytest python-dotenv
-```
-
-Set the connection details in a `.env` next to your tests:
-
-```bash
-SIFT_API_KEY="your-api-key"
-SIFT_GRPC_URI="..."
-SIFT_REST_URI="..."
-```
-
-The `SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your Sift organization. You can find these on the Sift Manage page as well as generate an API key.
-
-## Wire the plugin into `conftest.py`
-
-Two things are required: a session-scoped `sift_client` fixture (the plugin's
-`report_context` fixture resolves it by name), and a star-import that registers
-the plugin's fixtures into the conftest's namespace.
-
-```python title="conftest.py"
-import os
-
-import pytest
-from dotenv import load_dotenv
-
-from sift_client import SiftClient, SiftConnectionConfig
-
-# Star-import wires fixtures + hooks + CLI options into pytest collection.
-from sift_client.util.test_results import *
-
-load_dotenv()
-
-
-@pytest.fixture(scope="session")
-def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI")
-    rest_url = os.getenv("SIFT_REST_URI")
-    api_key = os.getenv("SIFT_API_KEY")
-    
-    return SiftClient(
-        connection_config=SiftConnectionConfig(
-            api_key=api_key,
-            grpc_url=grpc_url,
-            rest_url=rest_url,
-        )
-    )
-```
-
-That's the whole setup. Every test in the session will now create a step on a
-single shared `TestReport`.
-
-## Plugin provided fixtures
-
-| Name | Kind | Scope | Purpose |
-|---|---|---|---|
-| `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
-| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, and `current_step`. |
-| `module_substep` | fixture (autouse) | module | One step per test file with each function nested as a substep. |
-| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted only when `--sift-test-results-check-connection` is set. |
-
-### CLI options
-
-| Flag | Default | Effect |
-|---|---|---|
-| `--sift-test-results-log-file=<path\|true\|false>` | temp file | Where the JSONL log of create/update calls goes. With a log file set, the plugin spawns an `import-test-result-log --incremental` worker that polls the file and replays entries against Sift while the run is in flight. Pass `false` to disable the file entirely; create/update calls then go straight to the API synchronously during tests. |
-| `--no-sift-test-results-git-metadata` | git metadata on | Skip capturing git repo/branch/commit on the report's metadata. |
-| `--sift-test-results-check-connection` | off | Make `report_context`, `step`, and `module_substep` no-op (yield `None`) when `client_has_connection` is `False`. Lets the same suite run locally without a Sift backend. |
-
-These can be set permanently in `pytest.ini`:
-
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-test-results-check-connection
-```
-
-!!! warning "FedRAMP / shared environments"
-    Pass `--sift-test-results-log-file=false` to skip the temp file + worker
-    pipeline. Create/update calls then run inline against the API instead of
-    being deferred through a subprocess.
-
-### Report metadata captured automatically
-
-Every report the plugin creates includes:
-
-- `name` and `test_case`: derived from the first positional argument to `pytest`. When it resolves to an existing path the plugin uses the basename for `name` and the full path string for `test_case`; otherwise both fall back to `pytest <args>`. `name` always has a UTC ISO timestamp appended. See examples below.
-- `test_system_name`: `socket.gethostname()`.
-- `system_operator`: `getpass.getuser()`.
-- `start_time` / `end_time`: set on session enter/exit.
-- `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
-- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-test-results-git-metadata` or when not in a git repo.
-
-Example invocations:
-
-| Pytest invocation | Report `name` | Report `test_case` |
-|---|---|---|
-| `pytest tests/test_battery.py` | `test_battery.py 2026-05-04T12:00:00.123456+00:00` | `tests/test_battery.py` |
-| `pytest tests/` | `tests 2026-05-04T12:00:00.123456+00:00` | `tests` |
-| `pytest -k voltage` | `pytest -k voltage 2026-05-04T12:00:00.123456+00:00` | `pytest -k voltage` |
-
-To override defaults (e.g. set a serial number, system operator, or extra
-metadata), call `report_context.report.update({...})` from any test or
-fixture. See [Linking a Run](#linking-a-run-to-the-report) for the same
-pattern applied to `run_id`.
-
-## Basic usage
-
-With the conftest in place, the simplest test needs nothing extra. The `step`
-fixture is `autouse=True` and pytest test failures and skips are mapped to
-step statuses automatically.
-
-```python title="test_basic.py"
-def test_no_fixtures_still_creates_a_step():
-    """Autouse `step` records this function as a step on the session report."""
-    assert 1 + 1 == 2
-
-
-def test_measure_a_single_value(step):
-    """Take `step` explicitly when you want to record a measurement."""
-    voltage = 4.97
-    passed = step.measure(
-        name="battery_voltage",
-        value=voltage,
-        bounds={"min": 4.8, "max": 5.2},
-        unit="V",
-    )
-    assert passed, f"voltage {voltage}V out of bounds"
-
-
-def test_measure_strings_and_booleans(step):
-    """`bounds` accepts a string or `True`/`False` for non-numeric values."""
-    step.measure(name="firmware_version", value="1.4.2", bounds="1.4.2")
-    step.measure(name="self_test_passed", value=True, bounds=True)
-
-
-def test_docstring_becomes_step_description(step):
-    """This docstring is the step's description in Sift.
-
-    The plugin pulls `request.node.obj.__doc__` when it creates the step.
-    Helper functions called from within the test do not get this treatment;
-    pass `description="..."` explicitly on `substep(...)` instead.
-    """
-    assert step.current_step.description is not None
-```
-
-!!! tip "Measurements never raise"
-    `step.measure(...)` returns `True` if the value is in bounds and `False`
-    otherwise. A `False` result marks the enclosing step as failed but does
-    not raise. Chain measurements freely and inspect the boolean if you need
-    custom flow control.
-
-### Status semantics for failures
-
-The plugin uses the step exit handler in `NewStep.__exit__` to translate test
-outcomes into `TestStatus`:
-
-| Outcome | Resulting `TestStatus` |
-|---|---|
-| In-bounds measurements only | `PASSED` |
-| Failed measurement, failed `report_outcome`, failed substep, or `AssertionError` raised by the test | `FAILED` (no traceback is attached, since pytest already prints it in the runner output) |
-| Non-`AssertionError` exception escapes the test (e.g. `ValueError`, `TimeoutError`) | `ERROR`, with the formatted traceback (last 10 frames plus the first frame) on `step.error_info.error_message` |
-| Manual `step.current_step.update({"status": ...})` | Whatever you set; the step exit handler honors a manually-resolved status |
-
-A failure or error at any depth propagates upward: the parent substep, the
-function step, the module step (if `module_substep` is active), and the
-session report all get marked failed.
-
-## Nested steps
-
-Use `step.substep(name=...)` to open a child step. Substeps nest arbitrarily
-deep, and a failure at any depth propagates up to fail the parent and the
-report.
-
-```python title="test_nested_steps.py"
-import time
-
-
-def test_phased_check(step):
-    """Phase a single test into setup/exercise/verify substeps."""
-    with step.substep(name="setup", description="Power on and wait for boot") as setup:
-        setup.measure(name="boot_time_s", value=2.1, bounds={"max": 5.0}, unit="s")
-
-    with step.substep(name="exercise", description="Drive the test sequence"):
-        time.sleep(0.01)
-
-    with step.substep(name="verify", description="Read final state") as verify:
-        verify.measure(name="final_state", value="IDLE", bounds="IDLE")
-
-
-def test_deeply_nested(step):
-    """A failure at the bottom fails everyone above it."""
-    with step.substep(name="level_1") as l1:
-        with l1.substep(name="level_2") as l2:
-            with l2.substep(name="level_3") as l3:
-                l3.measure(name="leaf_value", value=42, bounds={"min": 0, "max": 100})
-```
-
-Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …)
-assigned by `ReportContext`. Sibling substeps within the same parent
-auto-increment; opening a new top-level step starts a new branch.
-
-### One step per file
-
-`module_substep` is autouse and module-scoped. When it's active (it's pulled
-in by the star-import in `conftest.py`), each file becomes a parent step and
-every function in it nests one level down. Its name is the test file's
-basename and its description is the module's docstring (if any).
-
-### Linking a Run to the report
-
-`report_context` is the session-scoped fixture; mutating it in one test
-affects the whole report.
-
-```python
-def test_link_run_to_report(report_context, sift_client):
-    run = sift_client.runs.create(...)  # however you create your run
-    report_context.report.update({"run_id": run.id_})
-```
-
-The same `update({...})` pattern works for any field on `TestReportUpdate`,
-including `serial_number`, `part_number`, `system_operator`, and `metadata`.
-
-## How pytest layout maps to a Sift report
-
-The plugin builds the report tree by hooking pytest's collection: every test
-node it sees becomes a step. What you control is which constructs create
-nodes and where you nest substeps inside them. Common layouts and the
-resulting report trees:
-
-### Flat module of test functions
-
-The default. Each function is one step directly under the report.
-
-```python title="test_battery.py"
-def test_voltage(step): ...
-def test_current(step): ...
-def test_temperature(step): ...
-```
-
-```text title="Sift report"
-TestReport
-├── test_voltage
-├── test_current
-└── test_temperature
-```
-
-### One step per file with `module_substep`
-
-`module_substep` is autouse and module-scoped. Every file becomes a parent
-step and every function in it nests one level down.
-
-```python title="test_battery.py"
-def test_voltage(step): ...
-def test_current(step): ...
-```
-
-```python title="test_thermal.py"
-def test_idle_temp(step): ...
-def test_load_temp(step): ...
-```
-
-```text title="Sift report"
-TestReport
-├── test_battery.py
-│   ├── test_voltage
-│   └── test_current
-└── test_thermal.py
-    ├── test_idle_temp
-    └── test_load_temp
-```
-
-### Test classes
-
-Pytest classes (`class TestFoo: ...`) do not create a parent step on their
-own. The plugin keys off the test node's `name`, which is just the method
-name. To group a class's methods under a class-level step, add a class-scoped
-fixture that opens a step with `report_context.new_step(...)`:
-
-```python title="test_charging.py"
-import pytest
-
-
-class TestCharging:
-    @pytest.fixture(scope="class", autouse=True)
-    def class_step(self, report_context):
-        with report_context.new_step(
-            name="TestCharging",
-            description="Charging subsystem",
-        ) as parent:
-            yield parent
-
-    def test_starts_at_zero(self, step): ...
-    def test_reaches_full(self, step): ...
-    def test_thermal_throttle(self, step): ...
-```
-
-```text title="Sift report"
-TestReport
-└── TestCharging
-    ├── test_starts_at_zero
-    ├── test_reaches_full
-    └── test_thermal_throttle
-```
-
-!!! note "Combining with `module_substep`"
-    `module_substep` and a class-scoped step both open at module/class scope,
-    so they each grab the next sibling slot under the report and the inner
-    one nests under the outer. If you want both layers (file → class →
-    method), make the class step itself open via the active outer step
-    rather than the report root.
-
-### Parametrized tests
-
-Each parametrize case is a distinct pytest node, so each gets its own step.
-The step name includes the parameter id pytest generates.
+---
+hide:
+  - navigation
+  - toc
+---
 
-```python
-@pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
-def test_rail(step, voltage):
-    step.measure(name="rail_v", value=voltage, bounds={"min": 0.0})
-```
+<meta http-equiv="refresh" content="0; url=../../guides/pytest_plugin/">
 
-```text title="Sift report"
-TestReport
-├── test_rail[3.3]
-├── test_rail[5.0]
-└── test_rail[12.0]
-```
-
-### Helper functions
-
-Helpers called from a test do not auto-create a step. The plugin only sees
-pytest-collected nodes. To represent helper work in the report, open a
-substep at the call site and pass it into the helper:
-
-```python
-def measure_rail(step, name, value, bounds):
-    return step.measure(name=name, value=value, bounds=bounds, unit="V")
-
-
-def test_power_rails(step):
-    with step.substep(name="3.3V rail") as rail_3v3:
-        measure_rail(rail_3v3, "rail_v", 3.31, {"min": 3.2, "max": 3.4})
-
-    with step.substep(name="5V rail") as rail_5v:
-        measure_rail(rail_5v, "rail_v", 5.02, {"min": 4.9, "max": 5.1})
-```
-
-```text title="Sift report"
-TestReport
-└── test_power_rails
-    ├── 3.3V rail
-    │   └── rail_v        (measurement)
-    └── 5V rail
-        └── rail_v        (measurement)
-```
-
-!!! tip "Docstring-as-description is top-level only"
-    The plugin reads the test function's docstring and uses it as the step
-    description. Docstrings on helper functions are not picked up. Pass
-    `description="..."` explicitly on `substep(...)` if you want one.
-
-### Fixtures that contribute steps
-
-A fixture can open its own substep around setup/teardown by using `step` (for
-function-scope) or `report_context.new_step(...)` (for any scope). The substep
-ends when the fixture's `yield` returns, which makes the report tree mirror
-the lifecycle.
-
-```python
-@pytest.fixture
-def warmed_up_dut(step):
-    with step.substep(name="warmup", description="Bring DUT to operating temp"):
-        # ... do warmup work ...
-        yield "dut-handle"
-
-
-def test_steady_state(step, warmed_up_dut):
-    step.measure(name="temp_c", value=37.2, bounds={"min": 35.0, "max": 40.0})
-```
-
-```text title="Sift report"
-TestReport
-└── test_steady_state
-    ├── warmup        (from fixture)
-    └── temp_c        (measurement)
-```
-
-## Measurement variants
-
-`step.measure(...)` records exactly one measurement. For datasets coming off a
-sensor or calculated channel, use one of the bulk variants.
-
-### `measure_avg`: one row, the mean
-
-`measure_avg` accepts a Python list, a NumPy array, or a pandas `Series`,
-takes the mean, and evaluates it against bounds.
-
-```python
-import numpy as np
-import pandas as pd
-
-
-def test_avg_with_list(step):
-    samples = [4.97, 5.01, 5.03, 4.99, 5.02]
-    step.measure_avg(
-        name="bus_voltage_avg",
-        values=samples,
-        bounds={"min": 4.9, "max": 5.1},
-        unit="V",
-    )
-
-
-def test_avg_with_numpy(step):
-    samples = np.linspace(99.5, 100.5, num=50)
-    step.measure_avg(
-        name="cpu_temp_avg",
-        values=samples,
-        bounds={"min": 95.0, "max": 105.0},
-        unit="C",
-    )
-
-
-def test_avg_with_pandas(step):
-    series = pd.Series([0.998, 1.001, 0.999, 1.002, 1.000])
-    step.measure_avg(
-        name="reference_clock_ratio",
-        values=series,
-        bounds={"min": 0.99, "max": 1.01},
-    )
-```
-
-### `measure_all`: only out-of-bounds rows
-
-Records measurements only for samples that fail bounds, so an all-pass
-dataset of N samples doesn't add N rows to the report. Returns `True` when
-every sample is in bounds.
-
-```python
-def test_only_outliers_recorded(step):
-    samples = [10.1, 10.2, 10.3, 99.9, 10.0, 10.1]  # 99.9 is the outlier
-    all_in_bounds = step.measure_all(
-        name="pressure_psi",
-        values=samples,
-        bounds={"min": 9.0, "max": 11.0},
-        unit="psi",
-    )
-    # Returns False because 99.9 is out of bounds. The step is already
-    # marked failed; raise here only if you also want pytest to fail.
-    assert all_in_bounds
-```
-
-!!! note "`measure_all` requires at least one bound"
-    Passing `bounds={}` raises `ValueError("No bounds provided")`. At
-    least one of `min` or `max` must be set.
-
-### `report_outcome`: externally computed pass/fail
-
-When the decision is computed elsewhere, drop it onto the report as a
-named substep with an optional reason. Returns the result you passed in,
-so you can use it inline.
-
-```python
-def test_external_checks(step):
-    step.report_outcome(
-        name="config_loaded",
-        result=True,
-        reason="loaded /etc/dut/config.yaml",
-    )
-
-    # Failures show up as a failed substep without raising.
-    rare_warning_seen = False
-    step.report_outcome(
-        name="no_rare_warning",
-        result=not rare_warning_seen,
-        reason="grep'd dmesg for the known-flaky warning",
-    )
-```
-
-### Bounds reference
-
-| Pass to `bounds=` | Value type | Effect |
-|---|---|---|
-| `{"min": x, "max": y}` (either key optional) | `int` / `float` | Numeric window. One-sided is fine. |
-| `NumericBounds(min=x, max=y)` | `int` / `float` | Same as the dict form, explicit. |
-| `"expected-string"` | `str` (or `bool`) | Exact equality. For `bool` values, compares lowercased string (`"true"`/`"false"`). |
-| `True` or `False` | `bool` (or `str`) | Exact equality. For `str` values, compares lowercased strings. |
-| `None` | any | Records the value but does not evaluate it; measurement is recorded as `passed=True`. |
-
-The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
-
-## Skip handling
-
-- `@pytest.mark.skip` and `@pytest.mark.skipif`: the plugin's
-  `pytest_runtest_makereport` hook sees the skipped outcome and creates a
-  step with `TestStatus.SKIPPED`.
-- Inside a test function, you can mark just one substep as skipped without
-  aborting the whole test:
-
-  ```python
-  from sift_client.sift_types.test_report import TestStatus
-
-
-  def test_runtime_skip(step):
-      with step.substep(name="optional_calibration") as cal:
-          if not precondition_met():
-              cal.current_step.update({"status": TestStatus.SKIPPED})
-  ```
-
-  A manually-resolved status is honored by the step's exit handler. No
-  further bookkeeping required. `SKIPPED` does not propagate as a failure.
-
-## Running the suite
-
-```bash
-# Full run against your Sift tenant
-pytest
-
-# Pin the log file so you can replay it later if the import worker dies
-pytest --sift-test-results-log-file=./sift-results.jsonl
-```
-
-See [Running offline](#running-offline) for the same suite running with or
-without a reachable Sift server.
-
-## Running offline
-
-The plugin supports two offline workflows, depending on whether you want a
-Sift report at all when the test environment can't reach Sift. The first
-turns the plugin into a no-op when the server is unreachable. The second
-keeps the plugin running normally and writes every create/update to a local
-JSONL file that you upload from a connected machine afterward.
-
-| Pattern | Flag | Runtime behavior | Follow-up |
-|---|---|---|---|
-| Skip when offline | `--sift-test-results-check-connection` | Fixtures yield `None`, no log file, no report. Pytest still reports pass/fail. | None. |
-| Capture locally, upload later | `--sift-test-results-log-file=<path>` | Plugin writes every create/update to the JSONL file. | `import-test-result-log <path>` from a connected machine. |
-
-Pattern 1 suits laptop dev and CI without Sift secrets. Pattern 2 suits
-field tests, vehicles on remote sites, and air-gapped labs.
-
-### Pattern 1: skip when offline
-
-`--sift-test-results-check-connection` makes the plugin ping Sift once at
-session start through the `client_has_connection` fixture (which by default
-calls `sift_client.ping.ping()`). On a failed ping, `report_context`,
-`step`, and `module_substep` yield `None` for the rest of the session.
-Pytest still runs the tests and still reports pass/fail.
-
-```bash
-pytest --sift-test-results-check-connection
-```
-
-```ini title="pytest.ini"
-[pytest]
-addopts = --sift-test-results-check-connection
-```
-
-#### Handling `None` in tests
-
-Calls on `step` raise `AttributeError` when it's `None`, so tests that take
-`step` as a parameter need a guard. The cleanest fix is to shadow the
-plugin's `step` fixture in your conftest and turn the `None` case into an
-automatic skip.
-
-```python title="conftest.py"
-import pytest
-
-from sift_client.util.test_results import *
-
-
-@pytest.fixture(autouse=True)
-def step(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    yield step
-```
-
-The `step` parameter on the override resolves to the plugin's fixture, not
-to the override itself. `autouse=True` is required so the skip applies to
-tests that don't request `step` directly. The same shadowing trick works
-for `module_substep` and `report_context`.
-
-For one-off tests that don't share a conftest, an inline guard works just
-as well:
-
-```python
-def test_battery_voltage(step):
-    if step is None:
-        pytest.skip("Sift unavailable")
-    step.measure(name="battery_voltage", value=4.97, bounds={"min": 4.8, "max": 5.2})
-```
-
-If you'd rather have tests pass through silently than skip them, wrap the
-calls in a helper that no-ops on `None`:
-
-```python
-def safe_measure(step, **kwargs):
-    if step is None:
-        return True
-    return step.measure(**kwargs)
-```
-
-#### Overriding the connection check
-
-The default `client_has_connection` fixture calls `sift_client.ping.ping()`.
-Override it in your conftest if pinging is the wrong signal for your
-environment, for example a token cache that's only warm when authenticated:
-
-```python title="conftest.py"
-from pathlib import Path
-
-import pytest
-
-
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client) -> bool:
-    return Path("~/.sift-token-cache").expanduser().is_file()
-```
-
-The plugin only consults this fixture when `--sift-test-results-check-connection`
-is set, so an unused override has no effect on a normal run.
-
-### Pattern 2: capture locally, upload later
-
-This pattern keeps the plugin running normally even when Sift is
-unreachable. The plugin writes to the log file, the worker dies on connect,
-and the file is left on disk for you to upload later. Pin the log file path
-so you can find it afterward, and don't pass
-`--sift-test-results-check-connection`, which would suppress the logging
-this pattern relies on.
-
-```bash
-pytest --sift-test-results-log-file=./run.jsonl
-```
-
-What happens during the run:
-
-- Every report, step, and measurement create/update is written to
-  `run.jsonl`. The plugin doesn't contact the Sift API for any of these
-  calls; they return simulated responses keyed by UUIDs that the replay
-  later maps to real IDs.
-- The `import-test-result-log --incremental` worker subprocess starts and
-  exits early when it can't reach Sift. The session does not fail when the
-  worker exits before the run ends.
-- Tests run against a real `step` fixture, so `step.measure(...)`,
-  substeps, parametrize, fixtures, and `module_substep` behave exactly as
-  they do online. No conftest changes are needed.
-
-Once you have connectivity, replay the file:
-
-```bash
-import-test-result-log ./run.jsonl
-```
-
-The replay creates the report, steps, and measurements against Sift in one
-batch. See [Replaying a saved log file](#replaying-a-saved-log-file) for
-details on cleanup and the incremental flag.
-
-!!! warning "Pin the log path for Pattern 2"
-    Without `--sift-test-results-log-file=<path>`, the plugin writes to a
-    `tempfile.NamedTemporaryFile` and only surfaces the path via a
-    `logger.info` line. Always pin a known path when you intend to replay
-    the file later.
-
-## Replaying a saved log file
-
-When the worker doesn't finish cleanly the plugin will print a hint mentioning
-`import-test-result-log`. To import:
+# Pytest Plugin
 
-```bash
-import-test-result-log <path-to-log.jsonl>
-```
+This page has moved to the [Pytest Plugin guide](../guides/pytest_plugin/index.md).
 
-That replays the saved JSONL log as a single batch (no `--incremental`) and
-deletes the file when it lives under the system temp dir.
\ No newline at end of file
+You should be redirected automatically. If your browser does not redirect,
+follow the link above.
diff --git a/python/docs/examples/pytest_plugin_quickstart.md b/python/docs/examples/pytest_plugin_quickstart.md
new file mode 100644
index 000000000..cf19c11fb
--- /dev/null
+++ b/python/docs/examples/pytest_plugin_quickstart.md
@@ -0,0 +1,182 @@
+# Pytest Plugin Quickstart
+
+A walkthrough of the runnable demo at
+[`python/examples/pytest_plugin/`](https://github.com/sift-stack/sift/tree/main/python/examples/pytest_plugin).
+The demo is a self-contained pytest project that exercises every layer of the
+plugin's step tree: packages, modules, classes (including nested), parametrize
+axes, manual substeps, and gate markers. It also includes a tests directory
+that uses no Sift APIs at all, to show how the autouse fixtures capture plain
+pytest tests for free.
+
+For a conceptual reference (fixtures, ini flags, status semantics), see the
+[Pytest Plugin guide](../guides/pytest_plugin/index.md).
+
+## Project layout
+
+```
+examples/pytest_plugin/
+├── conftest.py                            # registers the plugin
+├── pyproject.toml                         # pytest knobs + report name/test_case/metadata
+├── .env.example                           # credential template
+└── tests/
+    ├── pytest_only/                       # subpackage step
+    │   ├── __init__.py
+    │   └── test_pytest_only_demo.py       # plain pytest, no Sift APIs
+    └── with_sift/                         # subpackage step
+        ├── __init__.py
+        └── test_with_sift_demo.py         # measurements, substeps, classes, parametrize, gates
+```
+
+Every Python package (directory with `__init__.py`), test file, and test class
+above each test becomes its own parent step in the report tree.
+
+## `conftest.py`
+
+A single `pytest_plugins` declaration loads the plugin. The default
+`sift_client` fixture reads `SIFT_API_KEY` / `SIFT_GRPC_URI` / `SIFT_REST_URI`
+from the environment — set them in your shell, your CI secret store, or a
+local `.env` (with `pytest-dotenv` installed, pytest loads it automatically).
+
+```python title="conftest.py"
+--8<-- "examples/pytest_plugin/conftest.py"
+```
+
+## `pyproject.toml`
+
+Pytest behavior knobs sit under `[tool.pytest.ini_options]`, each commented at
+its default — uncomment any line to opt out of a layer of the step tree. The
+report's display `name`, `test_case`, and free-form `metadata` are set under
+`[tool.sift.pytest.report]`; `name` and `test_case` accept template
+placeholders.
+
+```toml title="pyproject.toml"
+--8<-- "examples/pytest_plugin/pyproject.toml"
+```
+
+## `.env.example`
+
+```bash title=".env.example"
+--8<-- "examples/pytest_plugin/.env.example"
+```
+
+## The pytest_only module
+
+Plain pytest tests with no `sift_client` imports, no `step` fixture, no
+markers. Each one still becomes a leaf step in the report tree. The plugin's
+autouse fixtures capture pass/fail automatically.
+
+```python title="tests/pytest_only/test_pytest_only_demo.py"
+--8<-- "examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py"
+```
+
+## The with_sift module
+
+Exercises the plugin's full surface: numeric / string / bool bounds, nested
+`step.substep`, `@pytest.mark.sift_exclude`, class steps with docstring
+descriptions, nested classes, stacked `@pytest.mark.parametrize`, and
+`step.report_outcome`.
+
+```python title="tests/with_sift/test_with_sift_demo.py"
+--8<-- "examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py"
+```
+
+## Run it
+
+### Without Sift credentials
+
+```bash
+cd python/examples/pytest_plugin
+pytest --sift-disabled -v
+```
+
+`--sift-disabled` makes the plugin a no-op transport: `step.measure(...)`
+still evaluates bounds and returns a real pass/fail boolean, but nothing
+contacts Sift and no log file is written. Useful for previewing the report
+tree or unit-testing measurement logic.
+
+### Against a real Sift org
+
+```bash
+cp .env.example .env
+# Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
+pytest -v
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+### Offline (record now, replay later)
+
+```bash
+pytest --sift-offline --sift-log-file=/tmp/sift-demo.jsonl -v
+# Later, from anywhere with credentials:
+import-test-result-log /tmp/sift-demo.jsonl
+```
+
+## Expected report tree
+
+With the plugin's defaults (every layer enabled), the demo produces:
+
+```
+TestReport (FAILED, since failures propagate up from leaves)
+├── pytest_only                         ← package step (FAILED)
+│   └── test_pytest_only_demo.py        ← module step (FAILED)
+│       ├── test_passes                                              PASSED
+│       ├── test_uses_a_pytest_fixture                               PASSED
+│       ├── test_assertion_failure_marks_step_failed                 FAILED
+│       ├── test_skipped                                             SKIPPED
+│       ├── test_unexpected_exception_marks_step_errored             ERROR
+│       ├── test_parametrize_without_step
+│       │   ├── value='v1'                                           PASSED
+│       │   └── value='v2'                                           PASSED
+│       └── TestPytestClass
+│           └── test_method                                          PASSED
+└── with_sift                           ← package step (FAILED)
+    └── test_with_sift_demo.py          ← module step (FAILED)
+        ├── test_measurements                                        PASSED
+        ├── test_substeps                                            PASSED
+        │   ├── phase_1
+        │   └── phase_2
+        │       └── phase_2a
+        │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
+        ├── test_measure_series                                      PASSED
+        ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
+        ├── test_pytest_fail_if_step_failed_at_end                   FAILED  (pytest FAILED)
+        ├── test_report_level_metadata                               PASSED
+        └── TestClassStep
+            ├── test_parametrize
+            │   ├── axis_a='a1'
+            │   │   ├── axis_b='b1'                                  PASSED
+            │   │   └── axis_b='b2'                                  PASSED
+            │   └── axis_a='a2'
+            │       ├── axis_b='b1'                                  PASSED
+            │       └── axis_b='b2'                                  PASSED
+            └── TestNested
+                └── test_report_outcome
+                    └── check                                        PASSED
+```
+
+The `pytest_only` module deliberately includes one failing, one skipped, and
+one erroring test so the demo shows every `TestStatus` mapping (`FAILED` for
+assertions, `SKIPPED` for `pytest.skip`, `ERROR` for any other exception).
+The `with_sift` module shows two patterns for handling measurement results:
+`test_failed_measurement_marks_sift_step_failed` lets the test keep passing
+in pytest while the Sift step is `FAILED` (useful when measurements are
+diagnostic data you want to collect regardless of outcome); and
+`test_pytest_fail_if_step_failed_at_end` takes every measurement first and
+then calls `step.pytest_fail_if_step_failed()` once at the end, so every
+measurement still lands in the report even when one fails. The end-of-test
+call is the recommended pattern: it fails via `pytest.fail` (no assertion
+noise in `error_info`), and unlike asserting on an individual
+`step.measure(...)` call it does not short-circuit on the first failure and
+skip every measurement that follows. Expected
+pytest output is `16 passed, 3 failed, 1 skipped`.
+
+Flip any of the `sift_*_step` / `sift_parametrize_nesting` flags in
+`pyproject.toml` to `false` to collapse a layer.
+
+## Next steps
+
+- [Pytest Plugin guide](../guides/pytest_plugin/index.md): conceptual reference
+  covering fixtures, configuration, report structure, and pass/fail behavior.
+- The demo's [README](https://github.com/sift-stack/sift/blob/main/python/examples/pytest_plugin/README.md)
+  on GitHub mirrors this page and is the canonical source.
diff --git a/python/docs/guides/index.md b/python/docs/guides/index.md
new file mode 100644
index 000000000..105f0bb25
--- /dev/null
+++ b/python/docs/guides/index.md
@@ -0,0 +1,11 @@
+# Guides
+
+Conceptual references for the Sift Python client. Guides explain how a feature
+works and how to configure it. For runnable, end-to-end walkthroughs see the
+[Examples](../examples/index.md) section.
+
+## Available guides
+
+- [Pytest Plugin](pytest_plugin/index.md): turn a pytest run into a `TestReport`
+  in Sift. Each test becomes a `TestStep`, measurements are recorded as rows, and
+  failures propagate up through nested substeps to the report.
diff --git a/python/docs/guides/pytest_plugin/configuration.md b/python/docs/guides/pytest_plugin/configuration.md
new file mode 100644
index 000000000..a8e291006
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/configuration.md
@@ -0,0 +1,364 @@
+# Configuration & Defaults
+
+This page is the full reference for everything the plugin exposes: fixtures, CLI
+flags, ini options, credential handling, and the markers that control which
+tests report.
+
+!!! info "Where the plugin lives"
+    The plugin lives at `sift_client.pytest_plugin`. It is **not** registered as
+    a `pytest11` entry point. Projects opt in with a `pytest_plugins` declaration
+    in their top-level `conftest.py`. Pytest then loads the module as a real
+    plugin: the fixtures, CLI options, and `pytest_runtest_makereport` hook all
+    register through standard pytest machinery, so `pytest --trace-config` lists
+    it and `pytest -p no:sift_client.pytest_plugin` disables it.
+
+## Credentials
+
+Set the connection details in a `.env` next to your tests:
+
+```bash
+SIFT_API_KEY="your-api-key"
+SIFT_GRPC_URI="..."
+SIFT_REST_URI="..."
+```
+
+`SIFT_GRPC_URI` and `SIFT_REST_URI` are the gRPC and REST endpoints for your
+Sift organization. You can find both on the Sift Manage page, where you can
+also generate an API key.
+
+The default `sift_client` fixture reads its two URIs from the environment
+first, then from the `sift_grpc_uri` / `sift_rest_uri` ini keys.
+`SIFT_API_KEY` is intentionally env-only, so keep it out of source control (see
+[API key handling](#api-key-handling) below). There are no CLI flags for
+credentials.
+
+| Setting | Where | Notes |
+|---|---|---|
+| `SIFT_API_KEY` | env var only | Inject from your secret store in CI; for local dev use a `.env` (see below). Never read from a committed file. |
+| `SIFT_GRPC_URI` | env > `sift_grpc_uri` ini | Stable per-org gRPC endpoint; safe to commit. |
+| `SIFT_REST_URI` | env > `sift_rest_uri` ini | Stable per-org REST endpoint; safe to commit. |
+
+### API key handling
+
+`SIFT_API_KEY` is read from the process environment only — the plugin never
+reads it from a committed file. How you get it into the environment is up to
+you:
+
+- **CI:** set `SIFT_API_KEY` directly via your provider's secret manager.
+- **Local dev:** keep the values in a `.env` (gitignored) and let
+  [`pytest-dotenv`](https://pypi.org/project/pytest-dotenv/) load them — it is
+  not bundled with `sift-stack-py`, so install it explicitly:
+
+    ```bash
+    pip install pytest-dotenv
+    ```
+
+    ```bash title=".env"
+    SIFT_API_KEY=sk-...your-key...
+    SIFT_GRPC_URI=your-org.grpc.example.com
+    SIFT_REST_URI=https://your-org.rest.example.com
+    ```
+
+    Once installed, pytest-dotenv auto-loads `.env` from the rootdir before
+    tests run — no `conftest.py` glue and no `load_dotenv()` call. (Point it at
+    a different file with the `env_files` ini key if you prefer.)
+
+Prefer real environment variables (shell exports, CI secrets) for anything you
+can't keep in a local file.
+
+!!! warning "FedRAMP / shared environments"
+    Pass `--sift-log-file=false` (or set the ini key to `"false"`) to skip the
+    temp file + worker pipeline. Create/update calls then run inline against the
+    API instead of being deferred through a subprocess.
+
+## Wire the plugin into `conftest.py`
+
+A single `pytest_plugins` declaration in your top-level `conftest.py` is all
+that's required. The plugin ships a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+
+```python title="conftest.py"
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
+
+That's the whole setup. Every test in the session will now create a step on a
+single shared `TestReport`.
+
+### Customizing the `SiftClient`
+
+To construct the client differently (custom TLS, timeouts, alternate
+credentials, etc.), override the `sift_client` fixture in your conftest. The
+plugin's default falls away in favor of your definition.
+
+```python title="conftest.py"
+import os
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+@pytest.fixture(scope="session")
+def sift_client() -> SiftClient:
+    return SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key=os.getenv("SIFT_API_KEY"),
+            grpc_url=os.getenv("SIFT_GRPC_URI"),
+            rest_url=os.getenv("SIFT_REST_URI"),
+            use_ssl=False,
+        )
+    )
+```
+
+## Plugin-provided fixtures
+
+| Name | Kind | Scope | Purpose |
+|---|---|---|---|
+| `report_context` | fixture (autouse) | session | The `ReportContext` backing the run's `TestReport`. Use it to attach metadata or open ad-hoc steps. |
+| `step` | fixture (autouse) | function | A `NewStep` created for the current test function. Exposes `measure*`, `substep`, `report_outcome`, `pytest_fail_if_step_failed`, and `current_step`. |
+| `_sift_parents` | internal fixture (autouse) | function | Resolves the report-tree parents for the current test: a parent step for each `pytest.Package`, `pytest.Module`, and `pytest.Class` ancestor, then one per `@pytest.mark.parametrize` axis (and fixture parametrization) nested inside them. Parents are created once and reused across tests in any order, so test execution order is never changed. Each layer is gated independently; see [settings reference](#settings-reference). |
+| `client_has_connection` | fixture | session | Calls `sift_client.ping.ping()`; consulted by `report_context` at session start in online mode (the default). Override to skip the ping or use a different reachability signal. |
+
+## Settings reference
+
+Every setting the plugin reads, grouped by the three config kinds. Within a
+group, a `—` means the setting can't be set from that surface.
+
+Each kind has a home chosen for a specific workflow:
+
+- **Pytest behavior** lives in `[tool.pytest.ini_options]` (log/offline/disabled/git/`*_step`/autouse/parametrize). A CLI flag exists for the ones with a real ad-hoc override workflow.
+- **Connection** comes from the environment first, falling back to the ini keys; the API key is env-only so secrets stay out of committed files.
+- **Report content** takes static defaults from `[tool.sift.pytest.report]` and per-run dynamic values from `SIFT_REPORT_*` env vars (CI builds, hardware cycling, anything `.env`-driven; pytest-dotenv loads `.env` for local dev).
+
+**Precedence within a setting:** env > CLI flag > ini key > TOML > built-in
+default. No setting exposes both env and CLI, so the chain isn't ambiguous in
+practice.
+
+The plugin scans `SIFT_*` env vars and `[tool.sift.pytest.*]` keys at session
+start; anything outside these tables fires a warning with a closest-match
+suggestion, so typos like `SIFT_REPORT_SERIALNUM` surface immediately.
+
+<!-- BEGIN settings-reference (auto-generated from PLUGIN_OPTIONS in sift_client/_internal/pytest_plugin/options.py; regenerate via test_settings_reference_docs_in_sync) -->
+### Pytest behavior
+
+| Setting | CLI flag | Ini (`[tool.pytest.ini_options]`) |
+|---|---|---|
+| Path to the JSONL log of create/update calls (path \| true \| false \| none). | `--sift-log-file` | `sift_log_file` |
+| Capture git repo/branch/commit on the report. | `--no-sift-git-metadata` | `sift_git_metadata` |
+| Skip the session-start ping; route create/update through the JSONL log. | `--sift-offline` | `sift_offline` |
+| Disable Sift entirely (no API calls, no log file). Supersedes --sift-offline. | `--sift-disabled` | `sift_disabled` |
+| Open the resulting report in a browser at session end (online only; no-op when the report URL can't be resolved). | `--sift-open-report` | `sift_open_report` |
+| Default for the Sift autouse fixtures (report_context, step, hierarchy/parametrize parents). | — | `sift_autouse` |
+| Open a parent step for each Python package in the test path. | — | `sift_package_step` |
+| Open a parent step for each test module. | — | `sift_module_step` |
+| Open per-class parent steps, including nested classes. | — | `sift_class_step` |
+| Cluster parametrized tests under shared parent steps (e.g. test_a -> v=1, v=2). | — | `sift_parametrize_nesting` |
+
+### Connection
+
+| Setting | Ini (`[tool.pytest.ini_options]`) | Env var |
+|---|---|---|
+| Sift API key (secret, env-only). | — | `SIFT_API_KEY` |
+| Sift gRPC endpoint URI. | `sift_grpc_uri` | `SIFT_GRPC_URI` |
+| Sift REST endpoint URI. | `sift_rest_uri` | `SIFT_REST_URI` |
+| Sift web-app origin for the report link in the terminal footer (e.g. https://app.siftstack.com). When unset, the link is derived from the REST URI for known Sift hosts. | `sift_app_url` | `SIFT_APP_URL` |
+
+### Report content
+
+| Setting | TOML (`[tool.sift...]`) | Env var |
+|---|---|---|
+| Template for the report display name. Placeholders: {target}, {command}, {args}, {rootdir}, {timestamp}, {count}, {git_repo}, {git_branch}, {git_commit}. | `[tool.sift.pytest.report] name` | — |
+| Template for the report's test_case field (same placeholders as report_name). | `[tool.sift.pytest.report] test_case` | — |
+| Name of the test system / rig. Defaults to the host's name. | `[tool.sift.pytest.report] test_system_name` | `SIFT_REPORT_TEST_SYSTEM_NAME` |
+| Operator running the test. Defaults to the OS user. | `[tool.sift.pytest.report] system_operator` | `SIFT_REPORT_SYSTEM_OPERATOR` |
+| Serial number of the unit under test. | `[tool.sift.pytest.report] serial_number` | `SIFT_REPORT_SERIAL_NUMBER` |
+| Part number of the unit under test. | `[tool.sift.pytest.report] part_number` | `SIFT_REPORT_PART_NUMBER` |
+| Free-form report metadata, as a TOML table of scalar values. For dynamic per-run keys, attach them in conftest via the report_context fixture. | `[tool.sift.pytest.report.metadata]` (table) | — |
+<!-- END settings-reference -->
+
+### Quick-start examples
+
+```toml title="pyproject.toml"
+[tool.pytest.ini_options]
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
+
+[tool.sift.pytest.report]
+name = "{rootdir} ({count} tests) {timestamp}"
+test_system_name = "rig-7"
+
+[tool.sift.pytest.report.metadata]
+build_id = "v1.2.3"
+```
+
+```bash title="CI env (set by your runner)"
+SIFT_API_KEY=...                    # from a secret manager
+SIFT_REPORT_SYSTEM_OPERATOR=ci-bot
+SIFT_REPORT_SERIAL_NUMBER=$UNIT_SN  # cycles per matrix job
+```
+
+```ini title="pytest.ini (alternative — pytest-execution flags only)"
+[pytest]
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = your-org.sift.example:443
+sift_rest_uri = https://your-org.sift.example
+```
+
+CLI flags can be made permanent via `addopts`:
+
+```ini title="pytest.ini"
+[pytest]
+addopts = --sift-offline
+```
+
+## Report content in depth
+
+The [settings reference](#settings-reference) above maps each report-content
+field to its `[tool.sift.pytest.report]` key and `SIFT_REPORT_*` env var. This
+section covers the two template fields and the metadata table in more detail.
+
+```toml title="pyproject.toml — static project defaults"
+[tool.sift.pytest.report]
+name             = "{rootdir} {git_branch} ({count} tests) {timestamp}"
+test_case        = "{rootdir}-{git_branch}"
+test_system_name = "rig-7"
+system_operator  = "ci-bot"
+serial_number    = "SN-001"
+part_number      = "PN-9000"
+```
+
+```bash title="Per-run overrides — CI or hardware-bench shell"
+SIFT_REPORT_SERIAL_NUMBER=$UNIT_SN \
+SIFT_REPORT_SYSTEM_OPERATOR=$CI_ACTOR \
+pytest tests/
+```
+
+### `name` vs `test_case`
+
+The two fields look similar but serve opposite purposes:
+
+- **`name`** is the report's **per-run display label** — what you see in the
+  Test Results list. It should be unique per run, which is why its default ends
+  in `{timestamp}`.
+- **`test_case`** is the **cross-run grouping key** — reports that share a
+  `test_case` are treated as runs of the *same* case, so Sift can track its
+  pass/fail history over time. It should be stable across runs, which is why
+  its default has **no** timestamp.
+
+By default both derive from the same `{target}` (what ran), and the timestamp
+is the only difference: `name` = `{target} {timestamp}` (distinct each run),
+`test_case` = `{target}` (identical across runs of the same target, so they
+group together). Set either explicitly to override — a static `test_case` like
+`"{rootdir}"` is common when you want every run of a project to group under one
+case regardless of which subset ran.
+
+### Templates for `name` and `test_case`
+
+`name` and `test_case` accept the same f-string-style placeholders:
+
+| Placeholder | Value |
+|---|---|
+| `{target}` | What ran, derived from the collected tests (not the command line) and anchored to the project name: `project/tests/test_x.py::test_y` for a single test (the `[param]` suffix is stripped), `project/tests/test_x.py` for a single file, `project/tests/motor` for several files' common directory, or just `project` for a whole-suite run. |
+| `{command}` | The full pytest invocation, e.g. `pytest tests/ -k smoke`. |
+| `{args}` | The invocation arguments without the leading `pytest`. |
+| `{rootdir}` | The pytest rootdir name (typically the project directory). |
+| `{timestamp}` | The report start time in ISO 8601 (UTC). |
+| `{count}` | The number of collected tests in the run. |
+| `{git_repo}` | The `origin` remote URL, or empty when not in a git repo. |
+| `{git_branch}` | The current branch, or empty when not in a git repo. |
+| `{git_commit}` | The current commit (`git describe --always --dirty`), or empty when not in a git repo. |
+
+**Defaults when unset.** Because `{target}` is derived from the collected
+tests, the defaults reflect what actually ran and don't change with flag order
+or `-k` / `-m` filters:
+
+(`<project>` below is the rootdir directory name.)
+
+| Invocation | default `name` | default `test_case` |
+|---|---|---|
+| `pytest tests/test_motor.py::test_spin[12V]` | `<project>/tests/test_motor.py::test_spin 2026-...` | `<project>/tests/test_motor.py::test_spin` |
+| `pytest -v tests/test_motor.py` | `<project>/tests/test_motor.py 2026-...` | `<project>/tests/test_motor.py` |
+| `pytest -k motor` (hits `tests/motor/`) | `<project>/tests/motor 2026-...` | `<project>/tests/motor` |
+| `pytest` (whole suite) | `<project> 2026-...` | `<project>` |
+
+The git placeholders are resolved independently of `--no-sift-git-metadata`
+(which only controls whether git values are stored on the report metadata) and
+render empty outside a git checkout. An unknown placeholder is reported as a
+warning and the value falls back to the default rather than failing the run.
+
+Regardless of the name, the full pytest command is always preserved on the
+report's metadata under the `pytest_command` key, so the exact invocation stays
+queryable and viewable in the report detail.
+
+### Report metadata
+
+`[tool.sift.pytest.report.metadata]` is a TOML table whose typed values land
+on the report's metadata alongside the git fields and the auto-recorded
+`pytest_command`. Use it for build IDs, fixture identifiers, shift labels,
+and any key/value data not otherwise modeled.
+
+```toml title="pyproject.toml — static metadata defaults"
+[tool.sift.pytest.report.metadata]
+build_id = "v1.2.3"
+fixture  = "PSU-A"
+shift    = "night"
+lane     = 2          # ints, floats, and bools come through with their TOML type
+verbose  = true
+```
+
+For per-run dynamic entries (CI build IDs, cycling serial numbers), attach them
+in your `conftest.py` through the `report_context` fixture rather than the TOML
+table.
+
+Nested tables, lists, and `null` values in
+`[tool.sift.pytest.report.metadata]` are skipped with a warning since the
+report's metadata is a flat `dict[str, str | float | bool]`.
+
+## Controlling which tests produce reports
+
+By default every test in the session produces a Sift step. Two markers and one
+ini key let you narrow that to a specific set of tests, which is useful when a
+repo holds tests that you don't want included in the Sift test report.
+
+| Setting                                                 | Effect                                                                                       |
+|---------------------------------------------------------|----------------------------------------------------------------------------------------------|
+| `sift_autouse = false` in `pyproject.toml` | Flip the project-wide default off. Tests no longer produce steps unless explicitly opted in. |
+| `@pytest.mark.sift_include` on a test, class, or module | Force reporting on for that scope, regardless of the project default.                        |
+| `@pytest.mark.sift_exclude` on a test, class, or module | Force reporting off for that scope, regardless of the project default.                       |
+
+The closest marker determines the setting. `sift_exclude` beats `sift_include` when both apply.
+A `pytestmark` at the class or module level is inherited by every test in scope.
+
+### Bulk-applying a marker to a directory
+
+To opt an entire directory in (or out) without editing each file, hook
+`pytest_collection_modifyitems` in the directory's `conftest.py`:
+
+```python title="tests/example/conftest.py"
+from pathlib import Path
+
+import pytest
+
+_HERE = Path(__file__).parent
+
+
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
+```
+
+This applies `sift_include` to every test collected under `tests/example/`.
+Combine it with `sift_autouse = false` in `pyproject.toml` to report only the
+directories you opt in.
+
+`pytest_collection_modifyitems` receives every item in the session, not just
+this directory's, so the `relative_to` filter is what scopes the marker.
diff --git a/python/docs/guides/pytest_plugin/index.md b/python/docs/guides/pytest_plugin/index.md
new file mode 100644
index 000000000..93879692c
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/index.md
@@ -0,0 +1,122 @@
+# Pytest Plugin
+
+The Sift Python client ships a pytest plugin that turns a pytest run into a
+`TestReport` in Sift. Each test function becomes a `TestStep`, measurements are presented
+as rows under that step, and failures propagate up through nested substeps to
+the report itself.
+
+## Quick start
+
+Install the client and pytest:
+
+```bash
+pip install sift-stack-py pytest
+```
+
+The default `sift_client` fixture reads its connection details from the
+environment:
+
+```bash
+SIFT_API_KEY="..."
+SIFT_GRPC_URI="..."
+SIFT_REST_URI="..."
+```
+
+Find these on the Sift Manage page, where you can also generate an API key. Set
+them in your shell or CI secret store. For local dev, `pip install
+pytest-dotenv` and drop the same values in a `.env` next to your tests — it
+loads them automatically, no code required.
+
+Register the plugin with a single `pytest_plugins` declaration in your top-level
+`conftest.py`:
+
+```python title="conftest.py"
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
+
+Write a test. The `step` fixture is `autouse`, so any test becomes a step on the
+report. Take it as an argument when you want to record a measurement:
+
+```python title="test_battery.py"
+def test_battery_voltage(step):
+    step.measure(
+        name="battery_voltage",
+        value=4.97,
+        bounds={"min": 4.8, "max": 5.2},
+        unit="V",
+    )
+    step.pytest_fail_if_step_failed()
+```
+
+Run it:
+
+```bash
+pytest
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+!!! tip "Fail at the end, not per measurement"
+    `step.measure(...)` returns a pass/fail boolean and never raises, so a
+    failing measurement marks the step failed without aborting the test. Take
+    every measurement first, then call `step.pytest_fail_if_step_failed()` once
+    at the end, so every measurement still lands in the report even when one
+    fails. It fails the test via `pytest.fail` (no assertion noise in
+    `error_info`), and unlike asserting on an individual `step.measure(...)` call
+    it does not short-circuit on the first failure and skip every measurement
+    after it.
+
+## Sensible defaults
+
+With nothing but the `conftest.py` above, you get:
+
+- **Full step tree.** Every Python package, test module, test class, and
+  parametrize axis above a test becomes a parent step, so the report mirrors
+  your test layout.
+- **Online mode.** The plugin pings Sift at session start and streams
+  create/update calls to your tenant during the run.
+- **Git metadata.** Repo, branch, and commit are captured on the report
+  automatically.
+
+Everything is on by default and individually overridable. See
+[Configuration & Defaults](configuration.md) for the full audit of every knob,
+marker, flag, and fixture.
+
+## Running modes
+
+The plugin runs in one of three modes, picked at invocation.
+
+| Mode | How to select | Contacts Sift | When to use                                                   |
+|---|---|---|---------------------------------------------------------------|
+| **Online** | default (no flag) | Yes, during the run | Default choice                                                |
+| **Offline** | `--sift-offline` | No; records to a log file for later replay | Environments without Sift access.                             |
+| **Disabled** | `--sift-disabled` | No | Local dev. Bounds still evaluate and return a real pass/fail. |
+
+Online mode pings Sift once at session start and aborts if Sift is unreachable or the credentials are invalid,
+so a misconfigured job fails immediately instead of silently producing no report.
+During the run, every create and update is appended to a JSONL log file.
+A background worker uploads new entries to Sift incrementally.
+If the connection drops mid-test, the test keeps running and the log keeps writing locally.
+The remaining entries can be uploaded afterward with the `import-test-result-log` command that the plugin prints on exit.
+
+See [Running Modes](running_modes.md) for the log-file and replay pipeline,
+overriding the connection check, and replaying a saved log.
+
+## Report structure
+
+The report tree mirrors your test layout: packages, modules, classes, and
+parametrize axes nest automatically, and you can open arbitrary substeps inside
+a test. See [Report Structure](report_structure.md) for the layout-to-tree
+mapping, measurement variants, and report metadata.
+
+## Pass/fail outcomes
+
+Every pytest outcome (pass, assertion failure, exception, skip, xfail, hard
+exit) maps to a `TestStatus`, and failures roll up to the parent steps and the
+report. See [Pass/Fail Behavior](pass_fail_behavior.md).
+
+## Try the runnable demo
+
+The [Pytest Plugin Quickstart](../../examples/pytest_plugin_quickstart.md) walks
+through a self-contained demo project that exercises every layer of the step
+tree, with instructions to run it with or without a Sift tenant.
diff --git a/python/docs/guides/pytest_plugin/pass_fail_behavior.md b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
new file mode 100644
index 000000000..d0862778c
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/pass_fail_behavior.md
@@ -0,0 +1,195 @@
+# Pass/Fail Behavior
+
+The pytest plugin maps every pytest outcome to a `TestStatus` on the
+corresponding Sift step. Use this page to look up what a given test will
+produce, and how that result rolls up to the parent steps and the report.
+
+## `TestStatus` values
+
+The statuses below come from `sift_client.sift_types.test_report.TestStatus`.
+
+| Status        | Meaning                                                                                                                |
+| ------------- |------------------------------------------------------------------------------------------------------------------------|
+| `PASSED`      | The step completed and every check it owns succeeded.                                                                  |
+| `FAILED`      | An assertion, a `pytest.fail(...)`, a failed `report_outcome`, or a failing measurement marked it.                     |
+| `ERROR`       | An unexpected exception escaped the test body or a fixture (setup or teardown).                                        |
+| `ABORTED`     | A hard exit (`SystemExit` or `KeyboardInterrupt`) cut the test off; resolved while pytest tears the session down.      |
+| `SKIPPED`     | The test was skipped at collection time, at runtime, or from a fixture.                                                |
+| `IN_PROGRESS` | A transient creation state. It survives into the report only if the process is killed so abruptly that teardown never runs. |
+
+## Normal test outcomes
+
+| Scenario                                  | Trigger                              | Outcome  |
+| ----------------------------------------- | ------------------------------------ | -------- |
+| Test passes                               | function body returns cleanly        | `PASSED` |
+| Assertion failure                         | `assert 1 == 2`                      | `FAILED` |
+| `pytest.fail("...")` from the body        | `pytest.fail("intentional failure")` | `FAILED` |
+| Uncaught non-assertion exception          | `raise ValueError("boom")`           | `ERROR`  |
+
+An assertion failure records the concise assertion message (the exception
+line(s), no traceback frames) on `step.error_info.error_message` while still
+mapping to `FAILED`. A non-assertion exception gets its formatted traceback
+(the last 10 frames plus the first frame) recorded on
+`step.error_info.error_message`.
+
+## Hard exits
+
+Hard exits map to `ABORTED`. The step is resolved during fixture teardown, not
+at the instant of the exit:
+
+- When the exit produces a call-phase report (`sys.exit(1)`, `SystemExit`), the
+  plugin reads the status off that report.
+- When a `KeyboardInterrupt` aborts the session before any call-phase report
+  (Ctrl-C, or `raise KeyboardInterrupt` in the body), pytest still runs fixture
+  finalizers as it unwinds. The plugin sees setup completed with no call outcome
+  and resolves the cut-off step to `ABORTED` there.
+
+The status only reaches the report if those finalizers run. If the process is
+killed before they do (`SIGKILL`, the OOM killer, power loss), nothing is written
+and the step keeps the `IN_PROGRESS` it was created with. That is the only path
+that leaves a step `IN_PROGRESS` in a finalized report.
+
+| Scenario                                       | Trigger                            | Outcome                                          |
+| ---------------------------------------------- | ---------------------------------- | ------------------------------------------------ |
+| `SystemExit` from the test body                | `sys.exit(1)`                      | `ABORTED` (read from the call-phase report)      |
+| `KeyboardInterrupt` from the test body         | `raise KeyboardInterrupt`          | `ABORTED` (resolved during teardown)             |
+| Session-aborting `KeyboardInterrupt`           | Ctrl-C terminates pytest           | `ABORTED` (resolved during teardown)             |
+| Process killed before finalizers run           | `SIGKILL` / OOM / power loss       | `IN_PROGRESS` (nothing written after creation)   |
+
+### Abort propagation through nested substeps
+
+Every step that was open when the abort fired records
+`ABORTED`.
+
+```python title="test_abort.py"
+import sys
+
+
+def test_x(step):
+    with step.substep(name="completed_sub"):
+        pass  # closes as PASSED before the abort
+    with step.substep(name="outer_sub") as outer_sub:
+        with outer_sub.substep(name="inner_sub"):
+            sys.exit(1)  # ABORTED applied to inner_sub, outer_sub, and the test step
+```
+
+The Sift report shows `completed_sub` as `PASSED` and the three steps
+still open at the abort (`inner_sub`, `outer_sub`, and the test step
+itself) as `ABORTED`.
+
+## Skips
+
+| Scenario                              | Trigger                                       | Outcome   |
+| ------------------------------------- | --------------------------------------------- | --------- |
+| Collection-time skip                  | `@pytest.mark.skip(reason=...)`               | `SKIPPED` |
+| Conditional collection-time skip      | `@pytest.mark.skipif(True, reason=...)`       | `SKIPPED` |
+| Runtime skip from the test body       | `pytest.skip("...")`                          | `SKIPPED` |
+| Skip raised inside a fixture          | `@pytest.fixture` calls `pytest.skip("...")`  | `SKIPPED` |
+
+`SKIPPED` does not propagate as a failure. A skipped substep or test does
+not block its parent from resolving to `PASSED`.
+
+Inside a test function, you can mark just one substep as skipped without
+skipping the whole test:
+
+```python
+from sift_client.sift_types.test_report import TestStatus
+
+
+def test_runtime_skip(step):
+    with step.substep(name="optional_calibration") as cal:
+        if not precondition_met():
+            cal.current_step.update({"status": TestStatus.SKIPPED})
+```
+
+A manually resolved status is honored by the step's exit handler. No further
+bookkeeping is required.
+
+## Expected failures (xfail / xpass)
+
+xfail marks declare that a test is expected to fail. The plugin follows
+the same semantics pytest does.
+
+| Scenario                                  | Trigger                                                    | Outcome                                                       |
+| ----------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------- |
+| xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                     | `PASSED` (the test fulfilled the xfail expectation)           |
+| Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`          | `FAILED` (the mark no longer matches reality)                 |
+| Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                     | `PASSED` (`strict=False` does not insist on the failure)      |
+| `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` (the `raises=` mismatch is a real test failure)      |
+| `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)`                            | `SKIPPED` (the body never ran)                                |
+
+## Influencing outcomes from test code
+
+A test can also set the step's outcome directly via the helpers below.
+Substeps your test opens follow the same propagation rules as the ones
+the plugin opens for you.
+
+### Manual status override
+
+`step.current_step.update({...})` sets the status directly. The step's
+exit handler does not overwrite it.
+
+```python
+from sift_client.sift_types.test_report import TestStatus
+
+
+def test_manual(step):
+    step.current_step.update({"status": TestStatus.FAILED})
+```
+
+### `report_outcome` for externally computed checks
+
+`report_outcome(name, result, reason)` records a named check whose
+pass/fail was computed elsewhere (a subprocess, a remote system, your own
+comparison logic). A failing outcome marks the step `FAILED`.
+
+```python
+def test_external_check(step):
+    result, reason = run_external_validator()
+    step.report_outcome("ext-validator", result, reason)
+```
+
+### Measurements with bounds
+
+`step.measure(name=, value=, bounds=)` records a measurement and resolves
+the step to `FAILED` if the value is out of bounds. The call returns the
+pass/fail boolean and does not raise, so multiple measurements can run
+without short-circuiting.
+
+```python
+def test_battery(step):
+    step.measure(name="voltage", value=12.1, bounds={"min": 11.5, "max": 13.0}, unit="V")
+    step.measure(name="current", value=0.42, bounds={"max": 1.0}, unit="A")
+```
+
+### Substep failures
+
+A failed substep propagates failure to its parent step. A manually-set
+`SKIPPED` on a substep does not.
+
+```python
+def test_with_substep(step):
+    with step.substep(name="check") as inner:
+        inner.measure(name="value", value=99.0, bounds={"min": 0.0, "max": 5.0})
+    # The outer step resolves to FAILED because the substep failed.
+```
+
+## Propagation rules
+
+Every step that resolves to anything other than `PASSED` or `SKIPPED` marks its
+parent as failed. What the parent records depends on whether its own scope had
+an abort and whether a child already failed:
+
+- A hard exit (`SystemExit` or an observed `KeyboardInterrupt`) in the
+  step's own scope records `ABORTED`. `ABORTED` propagates through every
+  step the abort passes through on its way up.
+- A child that already recorded a non-`PASSED`/`SKIPPED` outcome marks
+  the parent as `FAILED`. This holds whether or not an exception is still
+  propagating through the parent's scope: only the originating substep
+  records `ERROR`; ancestors inherit `FAILED`. The traceback stays on
+  the originating step's `error_info`.
+- A step records `ERROR` only when its own scope raised a non-assertion
+  exception AND no child has failed.
+
+`SKIPPED` does not propagate. A status set explicitly via
+`current_step.update` is kept.
diff --git a/python/docs/guides/pytest_plugin/report_structure.md b/python/docs/guides/pytest_plugin/report_structure.md
new file mode 100644
index 000000000..188bee4ca
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/report_structure.md
@@ -0,0 +1,427 @@
+# Report Structure
+
+The report tree mirrors your test layout. Every Python package, test module,
+test class, and parametrize axis above a test becomes a parent step, and you can
+open arbitrary substeps inside a test. This page covers the layout-to-tree
+mapping, the measurement variants you record into it, and the metadata the
+plugin captures for you.
+
+## Recording measurements
+
+With the conftest in place, the simplest test needs nothing extra. The `step`
+fixture is `autouse=True` and pytest test failures and skips are mapped to step
+statuses automatically.
+
+```python title="test_basic.py"
+def test_no_fixtures_still_creates_a_step():
+    """Autouse `step` records this function as a step on the session report."""
+    assert 1 + 1 == 2
+
+
+def test_measure_a_single_value(step):
+    """Take `step` explicitly when you want to record a measurement."""
+    voltage = 4.97
+    step.measure(
+        name="battery_voltage",
+        value=voltage,
+        bounds={"min": 4.8, "max": 5.2},
+        unit="V",
+    )
+    # An out-of-bounds measurement already marks the step FAILED. Call this at
+    # the end to also fail pytest, without an assertion message in error_info.
+    step.pytest_fail_if_step_failed()
+
+
+def test_measure_strings_and_booleans(step):
+    """`bounds` accepts a string or `True`/`False` for non-numeric values."""
+    step.measure(name="firmware_version", value="1.4.2", bounds="1.4.2")
+    step.measure(name="self_test_passed", value=True, bounds=True)
+
+
+def test_docstring_becomes_step_description(step):
+    """This docstring is the step's description in Sift.
+
+    The plugin pulls `request.node.obj.__doc__` when it creates the step.
+    Helper functions called from within the test do not get this treatment;
+    pass `description="..."` explicitly on `substep(...)` instead.
+    """
+    assert step.current_step.description is not None
+```
+
+!!! tip "Measurements never raise"
+    `step.measure(...)` returns `True` if the value is in bounds and `False`
+    otherwise. A `False` result marks the enclosing step as failed but does not
+    raise. Chain measurements freely and inspect the boolean if you need custom
+    flow control. For how outcomes map to `TestStatus` and propagate upward, see
+    [Pass/Fail Behavior](pass_fail_behavior.md).
+
+## Nested steps
+
+Use `step.substep(name=...)` to open a child step. Substeps nest arbitrarily
+deep, and a failure at any depth propagates up to fail the parent and the
+report.
+
+```python title="test_nested_steps.py"
+import time
+
+
+def test_phased_check(step):
+    """Phase a single test into setup/exercise/verify substeps."""
+    with step.substep(name="setup", description="Power on and wait for boot") as setup:
+        setup.measure(name="boot_time_s", value=2.1, bounds={"max": 5.0}, unit="s")
+
+    with step.substep(name="exercise", description="Drive the test sequence"):
+        time.sleep(0.01)
+
+    with step.substep(name="verify", description="Read final state") as verify:
+        verify.measure(name="final_state", value="IDLE", bounds="IDLE")
+
+
+def test_deeply_nested(step):
+    """A failure at the bottom fails everyone above it."""
+    with step.substep(name="level_1") as l1:
+        with l1.substep(name="level_2") as l2:
+            with l2.substep(name="level_3") as l3:
+                l3.measure(name="leaf_value", value=42, bounds={"min": 0, "max": 100})
+```
+
+Each step gets a hierarchical `step_path` (`1`, `1.1`, `1.1.2`, `2`, …) assigned
+by `ReportContext`. Sibling substeps within the same parent auto-increment;
+opening a new top-level step starts a new branch.
+
+### Mirroring the test layout
+
+The plugin opens a parent step for each Python package (`__init__.py`
+directory), test file, and test class above every test, plus a parent step for
+each `@pytest.mark.parametrize` axis. Every layer is on by default and
+individually opt-out via ini flags (`sift_package_step`, `sift_module_step`,
+`sift_class_step`, `sift_parametrize_nesting`). Class/module/package docstrings
+become the matching step's description.
+
+A parent step is created `IN_PROGRESS` and resolves to its final status as soon
+as the last test in its subtree finishes — independent of test execution order,
+so with incremental upload the report tree fills in progressively rather than
+all at once at the end. Its time window spans from its first test starting to its
+last test finishing.
+
+### Linking a Run to the report
+
+`report_context` is a session-scoped fixture, so mutating it in one test
+affects the whole report.
+
+```python
+def test_link_run_to_report(report_context, sift_client):
+    run = sift_client.runs.create(...)  # however you create your run
+    report_context.report.update({"run_id": run.id_})
+```
+
+The same `update({...})` pattern works for any field on `TestReportUpdate`,
+including `serial_number`, `part_number`, `system_operator`, and `metadata`.
+
+## How pytest layout maps to a Sift report
+
+The plugin builds the report tree by hooking pytest's collection: every test
+node it sees becomes a step. What you control is which constructs create nodes
+and where you nest substeps inside them. Common layouts and the resulting report
+trees:
+
+### Flat module of test functions
+
+The default. Each function is one step directly under the report.
+
+```python title="test_battery.py"
+def test_voltage(step): ...
+def test_current(step): ...
+def test_temperature(step): ...
+```
+
+```text title="Sift report"
+TestReport
+├── test_voltage
+├── test_current
+└── test_temperature
+```
+
+### Modules nested under a package
+
+Two test files under the same Python package (directory with `__init__.py`)
+share that package step as their parent.
+
+```python title="suites/__init__.py"
+```
+
+```python title="suites/test_battery.py"
+def test_voltage(step): ...
+def test_current(step): ...
+```
+
+```python title="suites/test_thermal.py"
+def test_idle_temp(step): ...
+def test_load_temp(step): ...
+```
+
+```text title="Sift report"
+TestReport
+└── suites
+    ├── test_battery.py
+    │   ├── test_voltage
+    │   └── test_current
+    └── test_thermal.py
+        ├── test_idle_temp
+        └── test_load_temp
+```
+
+### Test classes (and nested classes)
+
+`class TestFoo:` and `class TestOuter: class TestInner:` produce class and
+nested class steps automatically, with no manual fixture needed.
+
+```python title="test_charging.py"
+class TestCharging:
+    """Charging subsystem."""
+
+    def test_starts_at_zero(self, step): ...
+    def test_reaches_full(self, step): ...
+    def test_thermal_throttle(self, step): ...
+```
+
+```text title="Sift report"
+TestReport
+└── test_charging.py
+    └── TestCharging
+        ├── test_starts_at_zero
+        ├── test_reaches_full
+        └── test_thermal_throttle
+```
+
+The class's docstring becomes the step description.
+
+### Parametrized tests
+
+Parametrized tests cluster under a parent step named after the test function,
+with one nested level per parametrize axis. Stacked decorators nest in the order
+they appear on the page: the topmost decorator becomes the outermost step level.
+
+```python
+@pytest.mark.parametrize("voltage", [3.3, 5.0, 12.0])
+def test_rail(step, voltage):
+    step.measure(name="rail_v", value=voltage, bounds={"min": 0.0})
+```
+
+```text title="Sift report"
+TestReport
+└── test_module.py
+    └── test_rail
+        ├── voltage=3.3
+        ├── voltage=5.0
+        └── voltage=12.0
+```
+
+Stacked parametrize:
+
+```python
+@pytest.mark.parametrize("voltage", ["high", "low"])
+@pytest.mark.parametrize("component", ["motor", "valve"])
+def test_iso(step, voltage, component): ...
+```
+
+```text title="Sift report"
+TestReport
+└── test_module.py
+    └── test_iso
+        ├── voltage='high'
+        │   ├── component='motor'
+        │   └── component='valve'
+        └── voltage='low'
+            ├── component='motor'
+            └── component='valve'
+```
+
+Set `sift_parametrize_nesting = false` in `pytest.ini` to fall back to flat leaf
+names (`test_rail[3.3]`).
+
+### Helper functions
+
+Helpers called from a test do not auto-create a step. The plugin only sees
+pytest-collected nodes. To represent helper work in the report, open a substep
+at the call site and pass it into the helper:
+
+```python
+def measure_rail(step, name, value, bounds):
+    return step.measure(name=name, value=value, bounds=bounds, unit="V")
+
+
+def test_power_rails(step):
+    with step.substep(name="3.3V rail") as rail_3v3:
+        measure_rail(rail_3v3, "rail_v", 3.31, {"min": 3.2, "max": 3.4})
+
+    with step.substep(name="5V rail") as rail_5v:
+        measure_rail(rail_5v, "rail_v", 5.02, {"min": 4.9, "max": 5.1})
+```
+
+```text title="Sift report"
+TestReport
+└── test_power_rails
+    ├── 3.3V rail
+    │   └── rail_v        (measurement)
+    └── 5V rail
+        └── rail_v        (measurement)
+```
+
+!!! tip "Docstring-as-description is top-level only"
+    The plugin reads the test function's docstring and uses it as the step
+    description. Docstrings on helper functions are not picked up. Pass
+    `description="..."` explicitly on `substep(...)` if you want one.
+
+### Fixtures that contribute steps
+
+A fixture can open its own substep around setup/teardown by using `step` (for
+function-scope) or `report_context.new_step(...)` (for any scope). The substep
+stays open until the fixture resumes after its `yield` at teardown, so the
+report tree mirrors the fixture's lifecycle.
+
+```python
+@pytest.fixture
+def warmed_up_dut(step):
+    with step.substep(name="warmup", description="Bring DUT to operating temp"):
+        # ... do warmup work ...
+        yield "dut-handle"
+
+
+def test_steady_state(step, warmed_up_dut):
+    step.measure(name="temp_c", value=37.2, bounds={"min": 35.0, "max": 40.0})
+```
+
+```text title="Sift report"
+TestReport
+└── test_steady_state
+    ├── warmup        (from fixture)
+    └── temp_c        (measurement)
+```
+
+## Measurement variants
+
+`step.measure(...)` records exactly one measurement. For datasets coming off a
+sensor or calculated channel, use one of the bulk variants.
+
+### `measure_avg`: one row, the mean
+
+`measure_avg` accepts a Python list, a NumPy array, or a pandas `Series`, takes
+the mean, and evaluates it against bounds.
+
+```python
+import numpy as np
+import pandas as pd
+
+
+def test_avg_with_list(step):
+    samples = [4.97, 5.01, 5.03, 4.99, 5.02]
+    step.measure_avg(
+        name="bus_voltage_avg",
+        values=samples,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+
+
+def test_avg_with_numpy(step):
+    samples = np.linspace(99.5, 100.5, num=50)
+    step.measure_avg(
+        name="cpu_temp_avg",
+        values=samples,
+        bounds={"min": 95.0, "max": 105.0},
+        unit="C",
+    )
+
+
+def test_avg_with_pandas(step):
+    series = pd.Series([0.998, 1.001, 0.999, 1.002, 1.000])
+    step.measure_avg(
+        name="reference_clock_ratio",
+        values=series,
+        bounds={"min": 0.99, "max": 1.01},
+    )
+```
+
+### `measure_all`: only out-of-bounds rows
+
+Records measurements only for samples that fail bounds, so an all-pass dataset
+of N samples doesn't add N rows to the report. Returns `True` when every sample
+is in bounds.
+
+```python
+def test_only_outliers_recorded(step):
+    samples = [10.1, 10.2, 10.3, 99.9, 10.0, 10.1]  # 99.9 is the outlier
+    all_in_bounds = step.measure_all(
+        name="pressure_psi",
+        values=samples,
+        bounds={"min": 9.0, "max": 11.0},
+        unit="psi",
+    )
+    # Returns False because 99.9 is out of bounds. The step is already
+    # marked failed; call this only if you also want pytest to fail.
+    step.pytest_fail_if_step_failed()
+```
+
+!!! note "`measure_all` requires at least one bound"
+    Passing `bounds={}` raises `ValueError("No bounds provided")`. At least one
+    of `min` or `max` must be set.
+
+### `report_outcome`: externally computed pass/fail
+
+When the decision is computed elsewhere, drop it onto the report as a named
+substep with an optional reason. Returns the result you passed in, so you can
+use it inline.
+
+```python
+def test_external_checks(step):
+    step.report_outcome(
+        name="config_loaded",
+        result=True,
+        reason="loaded /etc/dut/config.yaml",
+    )
+
+    # Failures show up as a failed substep without raising.
+    rare_warning_seen = False
+    step.report_outcome(
+        name="no_rare_warning",
+        result=not rare_warning_seen,
+        reason="grep'd dmesg for the known-flaky warning",
+    )
+```
+
+### Bounds reference
+
+| Pass to `bounds=` | Value type | Effect |
+|---|---|---|
+| `{"min": x, "max": y}` (either key optional) | `int` / `float` | Numeric window. One-sided is fine. |
+| `NumericBounds(min=x, max=y)` | `int` / `float` | Same as the dict form, explicit. |
+| `"expected-string"` | `str` (or `bool`) | Exact equality. For `bool` values, compares lowercased string (`"true"`/`"false"`). |
+| `True` or `False` | `bool` (or `str`) | Exact equality. For `str` values, compares lowercased strings. |
+| `None` | any | Records the value but does not evaluate it; measurement is recorded as `passed=True`. |
+
+The `unit` argument is a free-form string label (e.g. `"V"`, `"C"`, `"psi"`).
+
+## Report metadata captured automatically
+
+Every report the plugin creates includes:
+
+- `name` and `test_case`: derived from the first positional argument to `pytest`. When it resolves to an existing path the plugin uses the basename for `name` and the full path string for `test_case`; otherwise both fall back to `pytest <args>`. `name` always has a UTC ISO timestamp appended. See examples below.
+- `test_system_name`: `socket.gethostname()`.
+- `system_operator`: `getpass.getuser()`.
+- `start_time` / `end_time`: set on session enter/exit.
+- `status`: starts at `IN_PROGRESS`, finalized to `PASSED` or `FAILED` on session exit (failure if any step failed or an exception escaped the session).
+- `metadata.git_repo`, `metadata.git_branch`, `metadata.git_commit`: captured via `git remote get-url origin` / `git rev-parse --abbrev-ref HEAD` / `git describe --always --dirty --exclude '*'`. Suppressed by `--no-sift-git-metadata` or when not in a git repo.
+
+Example invocations:
+
+| Pytest invocation | Report `name` | Report `test_case` |
+|---|---|---|
+| `pytest tests/test_battery.py` | `test_battery.py 2026-05-04T12:00:00.123456+00:00` | `tests/test_battery.py` |
+| `pytest tests/` | `tests 2026-05-04T12:00:00.123456+00:00` | `tests` |
+| `pytest -k voltage` | `pytest -k voltage 2026-05-04T12:00:00.123456+00:00` | `pytest -k voltage` |
+
+To override defaults (e.g. set a serial number, system operator, or extra
+metadata), call `report_context.report.update({...})` from any test or fixture.
+See [Linking a Run](#linking-a-run-to-the-report) for the same pattern applied
+to `run_id`.
diff --git a/python/docs/guides/pytest_plugin/running_modes.md b/python/docs/guides/pytest_plugin/running_modes.md
new file mode 100644
index 000000000..6c5ab05be
--- /dev/null
+++ b/python/docs/guides/pytest_plugin/running_modes.md
@@ -0,0 +1,219 @@
+# Running Modes
+
+The plugin runs in one of three modes, picked at invocation. This page covers
+how each mode behaves, the log-file/replay pipeline, and how to replay a saved
+log against Sift.
+
+## Running the suite
+
+```bash
+# Full run against your Sift tenant
+pytest
+
+# Pin the log file so you can replay it later if the import worker dies
+pytest --sift-log-file=./sift-results.jsonl
+```
+
+## The three modes
+
+| Mode | Flag | Network | Log file | `step.measure(...)` | When to use |
+|---|---|---|---|---|---|
+| Online (default) | _(none)_ | yes (pings at session start, aborts if it fails) | optional write-through backup | real measurement against Sift | CI with Sift credentials, local dev hitting your tenant |
+| Offline | `--sift-offline` | none | required (the sole sink) | real measurement queued to log | field tests, air-gapped labs, CI without network |
+| Disabled | `--sift-disabled` | none | none | bounds eval; returns a real bool | local dev or CI that doesn't have (or want) Sift |
+
+Pass both flags and disabled wins: it skips Sift entirely and supersedes every
+other setting.
+
+## Terminal output
+
+Each run prints a header with the SDK version and active mode, and an end-of-run
+`Sift report` panel summarizing the outcome. Both are suppressed under `-q`. The
+panel is color-coded when the terminal supports it (green pass, red
+failure/error, yellow skip, cyan link) and plain text otherwise (`--color=no`,
+captured output, CI logs).
+
+The section title carries the report name (truncated if long). The `Steps` row
+tallies every step in the report by final status, so it counts substeps and the
+package/module/class/parametrize grouping steps too — its totals are expected to
+exceed pytest's own test count. The `Measurements` row tallies recorded
+measurements (`step.measure(...)`) and is omitted when there are none. The
+`Test case` and `System` rows echo the report's test case, test system, and
+operator.
+
+**Online** shows the report metadata, step and measurement breakdowns, and a
+clickable link. The web host is derived from the REST URI for known Sift hosts;
+for on-prem or custom deployments set `sift_app_url`
+(ini) or the `SIFT_APP_URL` env var. Add `--sift-open-report` to
+open the report in a browser at session end.
+
+```text
+============================= test session starts ==============================
+platform linux -- Python 3.11.8, pytest-8.3.2, pluggy-1.5.0
+Sift: sift-stack-py 0.17.0 — online mode
+collected 12 items
+
+tests/test_battery.py ........                                           [ 66%]
+tests/test_thermal.py ....                                               [100%]
+
+================ Sift report · pytest tests/ 2026-05-27T22:44:23Z ==============
+  Test case    pytest tests/
+  Status       PASSED       online · sift-stack-py 0.17.0
+  Steps        14 passed
+  Measurements 42 passed
+  System       ci-runner-7 · cibot
+  Log file     /tmp/sift-a1b2c3.jsonl
+  Report       https://app.siftstack.com/test-results/0193f1a2-7c44-7e5b-9b1a-2f6c0d9e84aa
+============================== 12 passed in 3.45s ==============================
+```
+
+If the background uploader doesn't finish, the panel still links the report and
+flags that it may be incomplete:
+
+```text
+================ Sift report · pytest tests/ 2026-05-27T22:44:23Z ==============
+  Test case    pytest tests/
+  Status       FAILED       online · sift-stack-py 0.17.0
+  Steps        11 passed · 2 failed · 1 error
+  Measurements 40 passed · 3 failed
+  System       ci-runner-7 · cibot
+  Log file     /tmp/sift-a1b2c3.jsonl
+  Report       https://app.siftstack.com/test-results/0193f1a2-7c44-7e5b-9b1a-2f6c0d9e84aa
+               may be incomplete — finish with: import-test-result-log /tmp/sift-a1b2c3.jsonl
+```
+
+When the web host can't be resolved and no override is set, the `Report` row
+shows the report id instead of a link.
+
+**Offline** shows the metadata and breakdowns, then the upload command under a
+small rule (the log path is part of the command):
+
+```text
+================ Sift report · pytest tests/ 2026-05-27T22:44:23Z ==============
+  Test case    pytest tests/
+  Status       PASSED       offline · not uploaded
+  Steps        14 passed
+  Measurements 42 passed
+  System       ci-runner-7 · cibot
+  Log file     ./run.jsonl
+------------------------------ to upload to Sift -------------------------------
+  >> import-test-result-log ./run.jsonl
+```
+
+**Disabled** notes that no report was created:
+
+```text
+===================================== Sift =====================================
+Sift disabled — no test report created.
+```
+
+## Online mode (default)
+
+`report_context` resolves `client_has_connection` at session start. The default
+implementation calls `sift_client.ping.ping()`. A failed ping aborts the whole
+session with `pytest.UsageError` and points at `--sift-offline` and
+`--sift-disabled` as escape hatches.
+
+This is loud on purpose. A CI run that silently no-ops on a flaky network won't
+get noticed until somebody goes looking for the report, which is usually weeks
+later, which is usually too late.
+
+With the default `--sift-log-file` setting on, create/update calls are written
+to a JSONL log file during the run and an `import-test-result-log --incremental`
+worker replays them against Sift in the background. If the worker crashes
+mid-session (connection failure, API error) or is still draining its backlog at
+session end, the failure is logged at session end with an `import-test-result-log`
+command for manual recovery. Test outcomes are unaffected and the local log
+file is preserved. Pass `--sift-log-file=false` to make every create/update
+synchronous against the API instead.
+
+### Overriding the connection check
+
+Override `client_has_connection` when ping isn't the right signal, for example a
+token cache that's only warm when authenticated:
+
+```python title="conftest.py"
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def client_has_connection(sift_client) -> bool:
+    return Path("~/.sift-token-cache").expanduser().is_file()
+```
+
+The override is ignored under `--sift-offline` and `--sift-disabled`.
+
+## Offline mode (`--sift-offline`)
+
+Same fixtures, same `step.measure(...)` semantics as online. The difference is
+where the writes go: every create/update lands in a JSONL log file instead of
+hitting the Sift API. The session-start ping is skipped, missing `SIFT_*` env
+vars are tolerated (placeholders are filled), and the replay worker
+(`import-test-result-log --incremental`) is never spawned.
+
+```bash
+pytest --sift-offline --sift-log-file=./run.jsonl
+```
+
+Once you have connectivity, replay it:
+
+```bash
+import-test-result-log ./run.jsonl
+```
+
+That replay creates the report, steps, and measurements against Sift. See
+[Replaying a saved log file](#replaying-a-saved-log-file) for cleanup and the
+incremental flag.
+
+`--sift-log-file=none` is rejected when offline is set. The log file is the only
+sink in offline mode, so without it the results are gone.
+
+!!! warning "Pin the log path"
+    Without `--sift-log-file=<path>`, offline mode writes to a
+    `tempfile.NamedTemporaryFile` and only surfaces the path via a `logger.info`
+    line. Pin a known path when you intend to replay later.
+
+## Disabled mode (`--sift-disabled`)
+
+The plugin stays loaded with the same fixtures and markers as the other modes.
+Nothing contacts Sift, no log file is written, and no `SIFT_*` env vars are
+required. `step.measure(...)`, `step.measure_avg(...)`, `step.measure_all(...)`,
+`step.substep(...)`, and `report_context.report.update({...})` all behave
+normally: bounds evaluate and you get a real pass/fail boolean back.
+
+Entities returned in disabled mode report `is_simulated == True` (on
+`TestReport`, `TestStep`, `TestMeasurement`, and `ReportContext`) so consumers
+and tests can branch on provenance. Offline-mode entities also report
+`is_simulated == True`.
+
+How to turn it on, in the order most projects pick:
+
+```bash
+# Per-invocation kill-switch
+pytest --sift-disabled
+```
+
+```toml
+# Per-project default (uncommon; online is usually the right default)
+# pyproject.toml:
+[tool.pytest.ini_options]
+sift_disabled = true
+```
+
+Disabled mode is a good fit for local dev without Sift credentials and for library
+consumers who don't have a Sift tenant. It is also useful in CI for runs that shouldn't
+add noise to the report stream, like a PR job re-running the same suite five times in a row.
+
+## Replaying a saved log file
+
+When the worker doesn't finish cleanly the plugin will print a hint mentioning
+`import-test-result-log`. To import:
+
+```bash
+import-test-result-log <path-to-log.jsonl>
+```
+
+That replays the saved JSONL log as a single batch (no `--incremental`) and
+deletes the file when it lives under the system temp dir.
diff --git a/python/examples/pytest_plugin/.env.example b/python/examples/pytest_plugin/.env.example
new file mode 100644
index 000000000..a8c028598
--- /dev/null
+++ b/python/examples/pytest_plugin/.env.example
@@ -0,0 +1,3 @@
+SIFT_API_KEY=your-api-key
+SIFT_GRPC_URI=your-org.grpc.example.com
+SIFT_REST_URI=https://your-org.rest.example.com
diff --git a/python/examples/pytest_plugin/README.md b/python/examples/pytest_plugin/README.md
new file mode 100644
index 000000000..fcc60fd5f
--- /dev/null
+++ b/python/examples/pytest_plugin/README.md
@@ -0,0 +1,121 @@
+# Pytest plugin demo
+
+A self-contained pytest project that exercises every feature of
+`sift_client.pytest_plugin`: package / module / class / parametrize step
+nesting, nested classes, manual substeps, `step.measure(...)` against
+numeric / string / bool bounds, gate markers, and the ini opt-outs.
+
+```
+examples/pytest_plugin/
+├── conftest.py                            # registers the plugin
+├── pyproject.toml                         # pytest knobs + report name/test_case/metadata
+├── .env.example                           # credential template (copy to .env for local runs)
+└── tests/
+    ├── pytest_only/                       # subpackage step: `pytest_only` opens a parent step
+    │   ├── __init__.py
+    │   └── test_pytest_only_demo.py       # plain pytest tests with no Sift APIs
+    └── with_sift/                         # subpackage step: `with_sift` opens a parent step
+        ├── __init__.py
+        └── test_with_sift_demo.py         # measurements, substeps, classes, nested classes,
+                                            # stacked parametrize, sift_exclude marker
+```
+
+Every layer of organization shows up in the report tree: Python packages
+(directories with `__init__.py`), modules (test files), classes (including
+nested classes), and parametrize axes each open a parent step. Flip
+`sift_package_step`, `sift_module_step`, `sift_class_step`, or
+`sift_parametrize_nesting` to `false` in `pyproject.toml` to disable this behavior.
+
+## Run it
+
+**Against a real Sift org**:
+
+```bash
+pip install pytest-dotenv        # auto-loads .env; or export the vars yourself
+cp .env.example .env
+# Fill in SIFT_API_KEY / SIFT_GRPC_URI / SIFT_REST_URI
+pytest -v
+```
+
+A `TestReport` shows up in Sift once the session finishes.
+
+**Offline (record now, replay later - intended for offline environments)**:
+
+```bash
+pytest --sift-offline --sift-log-file=/tmp/sift-demo.jsonl -v
+# Later, from anywhere with credentials:
+import-test-result-log /tmp/sift-demo.jsonl
+```
+
+## What the report tree looks like
+
+With the plugin's defaults (the `[tool.pytest.ini_options]` knobs left
+commented), running this demo produces a tree like:
+
+```
+TestReport (FAILED, since failures propagate up from leaves)
+├── pytest_only                         ← package step (FAILED)
+│   └── test_pytest_only_demo.py        ← module step (FAILED)
+│       ├── test_passes                                              PASSED
+│       ├── test_uses_a_pytest_fixture                               PASSED
+│       ├── test_assertion_failure_marks_step_failed                 FAILED
+│       ├── test_skipped                                             SKIPPED
+│       ├── test_unexpected_exception_marks_step_errored             ERROR
+│       ├── test_parametrize_without_step
+│       │   ├── value='v1'                                           PASSED
+│       │   └── value='v2'                                           PASSED
+│       └── TestPytestClass
+│           └── test_method                                          PASSED
+└── with_sift                           ← package step (FAILED)
+    └── test_with_sift_demo.py          ← module step (FAILED)
+        ├── test_measurements                                        PASSED
+        ├── test_substeps                                            PASSED
+        │   ├── phase_1
+        │   └── phase_2
+        │       └── phase_2a
+        │   (test_excluded: @sift_exclude, runs in pytest, NOT in tree)
+        ├── test_measure_series                                      PASSED
+        ├── test_failed_measurement_marks_sift_step_failed           FAILED  (pytest PASSED)
+        ├── test_pytest_fail_if_step_failed_at_end                   FAILED  (pytest FAILED)
+        ├── test_report_level_metadata                               PASSED
+        └── TestClassStep
+            ├── test_parametrize
+            │   ├── axis_a='a1'
+            │   │   ├── axis_b='b1'                                  PASSED
+            │   │   └── axis_b='b2'                                  PASSED
+            │   └── axis_a='a2'
+            │       ├── axis_b='b1'                                  PASSED
+            │       └── axis_b='b2'                                  PASSED
+            └── TestNested
+                └── test_report_outcome
+                    └── check                                        PASSED
+```
+
+The `pytest_only` module deliberately includes one failing, one skipped, and
+one erroring test so the demo shows every `TestStatus` mapping (`FAILED` for
+assertions, `SKIPPED` for `pytest.skip`, `ERROR` for any other exception).
+The `with_sift` module shows two patterns for handling measurement results:
+`test_failed_measurement_marks_sift_step_failed` lets the test keep passing
+in pytest while the Sift step is `FAILED` (useful when measurements are
+diagnostic data you want to collect regardless of outcome); and
+`test_pytest_fail_if_step_failed_at_end` takes every measurement first and
+then calls `step.pytest_fail_if_step_failed()` once at the end, so every
+measurement still lands in the report even when one fails. The end-of-test
+call is the recommended pattern: it fails via `pytest.fail` (no assertion
+noise in `error_info`), and unlike asserting on an individual
+`step.measure(...)` call it does not short-circuit on the first failure and
+skip every measurement that follows. Expected
+pytest output is `16 passed, 3 failed, 1 skipped`.
+
+Toggle any of the `sift_*_step` / `sift_parametrize_nesting` flags in
+`pyproject.toml` to `false` to collapse a layer.
+
+## What each file demonstrates
+
+| File | Feature |
+|---|---|
+| `conftest.py` | Plugin registration via `pytest_plugins` (a single line) |
+| `pyproject.toml` | Pytest nesting/git-metadata knobs at their defaults; report `name`, `test_case`, and `metadata` under `[tool.sift.pytest.report]` |
+| `tests/pytest_only/test_pytest_only_demo.py` | Plain pytest tests with no Sift APIs. The plugin captures pass/fail automatically; covers functions, fixtures, parametrize, classes, plus one each of `AssertionError` (FAILED), `pytest.skip` (SKIPPED), and a raised `ValueError` (ERROR) |
+| `tests/with_sift/test_with_sift_demo.py` | `step.measure` (numeric/string/bool bounds, units, description, metadata, `channel_names`), `step.measure_avg` and `step.measure_all` for series, an out-of-bounds measurement (pytest PASSED, Sift step FAILED), the recommended `step.pytest_fail_if_step_failed()` end-of-test call that fails pytest while still recording every measurement, nested `step.substep` (with step-level `metadata=...`), `@pytest.mark.sift_exclude`, class step + class docstring → description, nested classes, stacked `@pytest.mark.parametrize`, `step.report_outcome`, and session-level metadata via `report_context.report.update({...})` |
+| `tests/{pytest_only,with_sift}/__init__.py` | Each Python package (directory with `__init__.py`) becomes a parent step in the report tree |
diff --git a/python/examples/pytest_plugin/conftest.py b/python/examples/pytest_plugin/conftest.py
new file mode 100644
index 000000000..b019aef1d
--- /dev/null
+++ b/python/examples/pytest_plugin/conftest.py
@@ -0,0 +1,13 @@
+"""Project-level conftest for the pytest plugin demo.
+
+A single ``pytest_plugins`` declaration is all that's needed — the plugin's
+fixtures, hooks, and CLI options register through standard pytest machinery
+from there.
+
+The default ``sift_client`` fixture reads ``SIFT_API_KEY`` / ``SIFT_GRPC_URI``
+/ ``SIFT_REST_URI`` from the environment. Set them however you prefer: your CI
+secret store, your shell, or a local ``.env`` loaded by ``pytest-dotenv``
+(``pip install pytest-dotenv`` and it auto-loads ``.env`` — no code here).
+"""
+
+pytest_plugins = ["sift_client.pytest_plugin"]
diff --git a/python/examples/pytest_plugin/pyproject.toml b/python/examples/pytest_plugin/pyproject.toml
new file mode 100644
index 000000000..71280d16a
--- /dev/null
+++ b/python/examples/pytest_plugin/pyproject.toml
@@ -0,0 +1,33 @@
+# Single config file for the demo. Pytest behavior lives under
+# [tool.pytest.ini_options]; Sift report content lives under
+# [tool.sift.pytest.report].
+
+[tool.pytest.ini_options]
+# Defaults give you the full step tree: every package, module, class, and
+# parametrize axis becomes a parent step. These are the available knobs and
+# their defaults — uncomment to opt out of a layer.
+#
+# sift_autouse = true              # autouse fixtures (default: true)
+# sift_package_step = true         # Python package (dir with __init__.py) parent step (default: true)
+# sift_module_step = true          # module (test file) parent step (default: true)
+# sift_class_step = true           # class parent step incl. nested (default: true)
+# sift_parametrize_nesting = true  # parametrize parent steps (default: true)
+# sift_git_metadata = true         # git repo/branch/commit included on the report (default: true)
+
+[tool.sift.pytest.report]
+# Display name for the report. Placeholders: {target} {command} {args}
+# {rootdir} {timestamp} {count} {git_repo} {git_branch} {git_commit}.
+# Omit to use the default "{target} {timestamp}". {target} reflects what ran,
+# from the collected tests, anchored to the project name: e.g.
+# project/tests/test_x.py::test_y (single test, [param] stripped),
+# project/tests/motor (several files' common dir), or project (whole suite).
+name = "pytest-plugin demo ({count} tests) {timestamp}"
+# Grouping key across runs (same placeholders available). Omit to default to
+# {target} (what ran).
+test_case = "pytest-plugin-demo {git_branch}"
+
+[tool.sift.pytest.report.metadata]
+# Free-form key/value metadata stamped on every report. Values keep their TOML
+# type (string, int, float, bool).
+ci_revision = 2
+test_source = 'pytest-plugin-demo'
\ No newline at end of file
diff --git a/python/examples/pytest_plugin/tests/pytest_only/__init__.py b/python/examples/pytest_plugin/tests/pytest_only/__init__.py
new file mode 100644
index 000000000..939562d5f
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/pytest_only/__init__.py
@@ -0,0 +1,7 @@
+"""Subpackage of plain pytest tests with no Sift awareness.
+
+Demonstrates that the plugin captures any test's pass/fail with no opt-in
+needed — no ``step`` fixture, no markers, no imports from ``sift_client``.
+The package directory itself becomes a parent step in the report tree (via
+``sift_package_step``, on by default).
+"""
diff --git a/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py b/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
new file mode 100644
index 000000000..77790d301
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/pytest_only/test_pytest_only_demo.py
@@ -0,0 +1,49 @@
+"""Plain pytest tests are automatically captured by the plugin as steps.
+
+No imports from ``sift_client`` or fixture usage required. Each test
+becomes a step in the report tree: passing tests resolve to ``PASSED``,
+failing tests to ``FAILED``. This allows integrating existing tests
+with Sift Test Results without modification.
+"""
+
+import pytest
+
+
+def test_passes():
+    """Functions become steps in the report tree. The function docstring is used as the step description."""
+    assert 1 + 1 == 2
+
+
+@pytest.mark.parametrize("value", ["v1", "v2"])
+def test_parametrize_without_step(value):
+    """Parametrized tests are nested under a common step with sub steps for each permutation."""
+    assert value.startswith("v")
+
+
+class TestPytestClass:
+    """Test classes are turned into parent steps for their methods. Class docstrings are used as the step description."""
+
+    def test_method(self):
+        assert True
+
+
+def test_uses_a_pytest_fixture(tmp_path):
+    """Normal pytest fixtures keep working; the plugin doesn't intercept them."""
+    (tmp_path / "marker").write_text("ok")
+    assert (tmp_path / "marker").read_text() == "ok"
+
+
+def test_assertion_failure_marks_step_failed():
+    """An ``AssertionError`` resolves the Sift step as ``FAILED`` (no traceback attached)."""
+    assert 1 + 1 == 3
+
+
+@pytest.mark.skip(reason="Demonstrating the skip outcome")
+def test_skipped():
+    """Skipped tests resolve as ``SKIPPED`` in the Sift report."""
+    pass
+
+
+def test_unexpected_exception_marks_step_errored():
+    """Non-``AssertionError`` exceptions resolve the Sift step as ``ERROR`` with the traceback attached."""
+    raise ValueError("simulated environmental failure")
diff --git a/python/examples/pytest_plugin/tests/with_sift/__init__.py b/python/examples/pytest_plugin/tests/with_sift/__init__.py
new file mode 100644
index 000000000..6fd60c38d
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/with_sift/__init__.py
@@ -0,0 +1,8 @@
+"""Subpackage of tests that use the Sift plugin APIs explicitly.
+
+Demonstrates ``step.measure`` (numeric / string / bool bounds), nested
+``step.substep``, gate markers, class and nested-class step nesting, stacked
+parametrize, and ``step.report_outcome``. The package directory itself
+becomes a parent step in the report tree (via ``sift_package_step``, on by
+default).
+"""
diff --git a/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
new file mode 100644
index 000000000..c25c605c5
--- /dev/null
+++ b/python/examples/pytest_plugin/tests/with_sift/test_with_sift_demo.py
@@ -0,0 +1,177 @@
+"""End-to-end demo of the test-results features: measurements, substeps,
+exclusion, classes, nested classes, and stacked parametrize."""
+
+import pytest
+
+
+def test_measurements(step) -> None:
+    """Measurements are the first-class method for recording numeric, string, or bool bounds criteria and their outcomes. These show up in report steps.
+    ``step.measure`` accepts numeric (min/max), string, or bool bounds.
+    Choose names that provide sufficient context but are general enough that similar/identical measurements
+    across steps or reports can be compared.
+    """
+    step.measure(name="numeric_value", value=1.5, bounds={"min": 0.0, "max": 2.0})
+    step.measure(name="string_label", value="ok", bounds="ok")
+    step.measure(name="bool_flag", value=True, bounds=True)
+
+    # Descriptions and metadata can also be provided to measurements.
+    step.measure(
+        name="numeric_value_2",
+        value=1.5,
+        bounds={"min": 0.0, "max": 2.0},
+        description="Numeric that represents X, Y, Z",
+        metadata={"subsystem": "A"},
+    )
+
+    # If you plan to link the pytest report to a Sift Run, you can also assign related channels for easy plotting in the app
+    step.measure(
+        name="numeric_value",
+        value=1.5,
+        bounds={"min": 0.0, "max": 2.0},
+        channel_names=["channel_1", "channel_2"],
+    )
+
+
+def test_substeps(step) -> None:
+    """``step.substep(...)`` opens child steps inside one test; substeps nest arbitrarily.
+    This can be useful for grouping related measurements or for creating a more natural report structure
+    without the need to create a new test, class, etc.
+
+    Metadata can be attached at the step level by passing ``metadata=...`` to
+    ``substep``; the same keyword is accepted by ``report_context.new_step``
+    and propagates to the resulting ``TestStep``.
+
+    A failed substep marks this step FAILED in the report without raising, so
+    the end-of-test ``step.pytest_fail_if_step_failed()`` call is needed here
+    too: it folds substep failures (not just direct measurements) into the
+    pytest outcome.
+    """
+    with step.substep(name="phase_1", metadata={"phase_index": 1}) as s1:
+        s1.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    with step.substep(name="phase_2", metadata={"phase_index": 2}) as s2:
+        with s2.substep(name="phase_2a") as s2a:
+            s2a.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    # Fails pytest if any substep above failed; no-op when they all passed.
+    step.pytest_fail_if_step_failed()
+
+
+def test_measure_series(step) -> None:
+    """``measure_avg`` and ``measure_all`` are the series variants of ``measure``.
+
+    Both accept a list, numpy array, or pandas series of numeric values.
+    ``measure_avg`` records one row holding the mean of the series and
+    bounds-checks it. ``measure_all`` evaluates every value individually and
+    records one row per out-of-bounds element (in-bounds values are NOT
+    recorded, keeping the report compact).
+    """
+    voltages = [4.95, 5.02, 5.01, 4.98, 5.00]
+    step.measure_avg(
+        name="voltage_mean",
+        values=voltages,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+    # All values are in-bounds here, so measure_all records nothing extra;
+    # change one to e.g. 6.0 to see an out-of-bounds row appear.
+    step.measure_all(
+        name="voltage_samples",
+        values=voltages,
+        bounds={"min": 4.9, "max": 5.1},
+        unit="V",
+    )
+
+
+def test_failed_measurement_marks_sift_step_failed(step) -> None:
+    """An out-of-bounds measurement marks the Sift step as ``FAILED``
+    without raising. The pytest test still passes (no assertion, no
+    exception); the Sift report records bounds compliance while pytest
+    records control flow.
+
+    Use this pattern when measurements are diagnostic data you want to
+    collect alongside the test result, even when some readings fall outside
+    spec. See ``test_pytest_fail_if_step_failed_at_end`` below for the
+    recommended way to also fail pytest when any measurement is out of bounds.
+    """
+    step.measure(
+        name="voltage",
+        value=99.0,  # outside the bounds below; marks the step FAILED in Sift
+        bounds={"min": 0.0, "max": 10.0},
+        unit="V",
+    )
+
+
+def test_pytest_fail_if_step_failed_at_end(step) -> None:
+    """Recommended pattern: do every measurement and substep first, then call
+    ``step.pytest_fail_if_step_failed()`` once at the end.
+
+    Asserting on individual ``step.measure(...)`` calls raises
+    ``AssertionError`` on the first failure, so any measurements after the
+    failing one never run and never land in the Sift report. The end-of-test
+    call is strictly better for diagnostic completeness: every measurement and
+    substep is recorded, including the failures, and the aggregate result is
+    then folded into the pytest outcome. It fails via ``pytest.fail`` rather
+    than an assertion, so the failed step carries no assertion noise in
+    ``error_info``.
+
+    It fails on any failure the report would record: out-of-bounds
+    measurements, failed substeps, and ``report_outcome`` failures. The ``b``
+    measurement below is deliberately out of bounds. ``c`` still runs and is
+    recorded; only the final call fails the test.
+    """
+    step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
+    step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})  # out of bounds
+    step.measure(name="c", value=1.5, bounds={"min": 0.0, "max": 2.0})  # still recorded
+    step.pytest_fail_if_step_failed()
+
+
+def test_report_level_metadata(step, report_context) -> None:
+    """Attach metadata to the run-wide ``TestReport`` via ``report_context.report.update(...)``.
+
+    The same ``update({...})`` pattern works for any field on
+    ``TestReportUpdate`` (``run_id``, ``serial_number``, ``part_number``,
+    ``system_operator``, ``metadata``, ...). Useful for linking a session
+    to a Sift Run or tagging the report with build / operator info at runtime.
+
+    Updating ``metadata`` *replaces* the whole map server-side, so spread the
+    report's current metadata first to add keys without dropping the entries
+    configured under ``[tool.sift.pytest.report.metadata]`` (or the git
+    metadata and auto-recorded ``pytest_command``).
+    """
+    report_context.report.update(
+        {
+            "metadata": {
+                **report_context.report.metadata,
+                "build_id": "v1.2.3",
+                "operator": "ci",
+            }
+        }
+    )
+    step.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+
+@pytest.mark.sift_exclude
+def test_excluded() -> None:
+    """``sift_exclude`` runs the test in pytest but produces no Sift step."""
+    assert True
+
+
+class TestClassStep:
+    """A test class becomes its own step in the report tree.
+
+    This docstring becomes the description of the ``TestClassStep`` step.
+    """
+
+    @pytest.mark.parametrize("axis_a", ["a1", "a2"])
+    @pytest.mark.parametrize("axis_b", ["b1", "b2"])
+    def test_parametrize(self, step, axis_a: str, axis_b: str) -> None:
+        """Stacked parametrize nests outer-to-inner in decorator-on-page order."""
+        step.measure(name="value", value=1.0, bounds={"min": 0.0, "max": 2.0})
+
+    class TestNested:
+        """Nested classes produce nested class steps."""
+
+        def test_report_outcome(self, step) -> None:
+            """``step.report_outcome`` records a non-numeric pass/fail substep."""
+            step.report_outcome(name="check", result=True, reason="value matched")
diff --git a/python/lib/sift_client/_internal/grpc_transport/transport.py b/python/lib/sift_client/_internal/grpc_transport/transport.py
index 7e0bc5425..e088befa0 100644
--- a/python/lib/sift_client/_internal/grpc_transport/transport.py
+++ b/python/lib/sift_client/_internal/grpc_transport/transport.py
@@ -8,7 +8,6 @@
 
 from importlib.metadata import PackageNotFoundError, version
 from typing import TYPE_CHECKING, Any, TypedDict, cast
-from urllib.parse import ParseResult, urlparse
 
 import grpc
 import grpc.aio as grpc_aio
@@ -21,6 +20,7 @@
     Metadata,
     MetadataInterceptor,
 )
+from sift_client._internal.urls import parse_host
 
 if TYPE_CHECKING:
     from sift_client._internal.grpc_transport._async_interceptors.base import ClientAsyncInterceptor
@@ -78,7 +78,7 @@ def use_sift_channel(
 
     credentials = get_ssl_credentials(cert_via_openssl)
     options = _compute_channel_options(config)
-    api_uri = _clean_uri(config["uri"], use_ssl)
+    api_uri = parse_host(config["uri"])
     channel = grpc.secure_channel(api_uri, credentials, options)
     interceptors = _compute_sift_interceptors(config, metadata)
     return grpc.intercept_channel(channel, *interceptors)
@@ -98,7 +98,7 @@ def use_sift_async_channel(
         return _use_insecure_sift_async_channel(config, metadata)
 
     return grpc_aio.secure_channel(
-        target=_clean_uri(config["uri"], use_ssl),
+        target=parse_host(config["uri"]),
         credentials=get_ssl_credentials(cert_via_openssl),
         options=_compute_channel_options(config),
         interceptors=_compute_sift_async_interceptors(config, metadata),
@@ -112,7 +112,7 @@ def _use_insecure_sift_channel(
     FOR DEVELOPMENT PURPOSES ONLY
     """
     options = _compute_channel_options(config)
-    api_uri = _clean_uri(config["uri"], False)
+    api_uri = parse_host(config["uri"])
     channel = grpc.insecure_channel(api_uri, options)
     interceptors = _compute_sift_interceptors(config, metadata)
     return grpc.intercept_channel(channel, *interceptors)
@@ -125,7 +125,7 @@ def _use_insecure_sift_async_channel(
     FOR DEVELOPMENT PURPOSES ONLY
     """
     return grpc_aio.insecure_channel(
-        target=_clean_uri(config["uri"], False),
+        target=parse_host(config["uri"]),
         options=_compute_channel_options(config),
         interceptors=_compute_sift_async_interceptors(config, metadata),
     )
@@ -205,21 +205,6 @@ def _metadata_async_interceptor(
     return MetadataAsyncInterceptor(md)
 
 
-def _clean_uri(uri: str, use_ssl: bool) -> str:
-    """
-    This will automatically transform the URI to an acceptable form regardless of whether or not
-    users included the scheme in the URL or included trailing slashes.
-    """
-
-    if "http://" in uri or "https://" in uri:
-        parsed: ParseResult = urlparse(uri)
-        return parsed.netloc
-
-    full_uri = f"https://{uri}" if use_ssl else f"http://{uri}"
-    parsed_res: ParseResult = urlparse(full_uri)
-    return parsed_res.netloc
-
-
 def _compute_user_agent() -> str:
     try:
         return f"sift_stack_py/{version('sift_stack_py')}"
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py b/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
index 24e0534d7..383f2d5a3 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/_test_results_log.py
@@ -143,9 +143,13 @@ class _ReplayState:
 
 @dataclass
 class ReplayResult:
-    """Result of replaying a log file."""
+    """Result of replaying a log file.
 
-    report: TestReport
+    ``report`` is None on an incremental resume tick that uploaded only steps or
+    measurements; the report itself was created on an earlier tick.
+    """
+
+    report: TestReport | None = None
     steps: list[TestStep] = field(default_factory=list)
     measurements: list[TestMeasurement] = field(default_factory=list)
 
diff --git a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
index d15f86c48..184833e50 100644
--- a/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
+++ b/python/lib/sift_client/_internal/low_level_wrappers/test_results.py
@@ -3,7 +3,7 @@
 import logging
 import uuid
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 from google.protobuf import json_format
 from sift.test_reports.v1.test_reports_pb2 import (
@@ -68,6 +68,9 @@
 logger = logging.getLogger(__name__)
 
 
+_EntityT = TypeVar("_EntityT", TestReport, TestStep, TestMeasurement)
+
+
 class TestResultsLowLevelClient(LowLevelClientBase, WithGrpcClient):
     """Low-level client for the TestResultsAPI.
 
@@ -82,6 +85,16 @@ def __init__(self, grpc_client: GrpcClient):
         """
         super().__init__(grpc_client)
 
+    @staticmethod
+    def _mark_simulated(instance: _EntityT) -> _EntityT:
+        """Stamp an entity as having been produced by the simulate path.
+
+        Mirrors the ``__dict__`` write used by ``BaseType._apply_client_to_instance``
+        to bypass pydantic's frozen-model guard.
+        """
+        instance.__dict__["_simulated"] = True
+        return instance
+
     @staticmethod
     def simulate_create_test_report_response(
         request: CreateTestReportRequest,
@@ -387,7 +400,7 @@ async def create_test_report(
                     request,
                     response_id=simulated_proto.test_report_id,
                 )
-            return TestReport._from_proto(simulated_proto)
+            return self._mark_simulated(TestReport._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestReport(request)
         grpc_test_report = cast("CreateTestReportResponse", response).test_report
@@ -505,7 +518,9 @@ async def update_test_report(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestReport", request)
-            return self.simulate_update_test_report_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_report_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestReport(request)
         grpc_test_report = cast("UpdateTestReportResponse", response).test_report
@@ -560,7 +575,7 @@ async def create_test_step(
                     request,
                     response_id=simulated_proto.test_step_id,
                 )
-            return TestStep._from_proto(simulated_proto)
+            return self._mark_simulated(TestStep._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestStep(request)
         grpc_test_step = cast("CreateTestStepResponse", response).test_step
@@ -661,7 +676,9 @@ async def update_test_step(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestStep", request)
-            return self.simulate_update_test_step_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_step_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestStep(request)
         grpc_test_step = cast("UpdateTestStepResponse", response).test_step
@@ -716,7 +733,7 @@ async def create_test_measurement(
                     request,
                     response_id=simulated_proto.measurement_id,
                 )
-            return TestMeasurement._from_proto(simulated_proto)
+            return self._mark_simulated(TestMeasurement._from_proto(simulated_proto))
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).CreateTestMeasurement(
             request
@@ -861,7 +878,9 @@ async def update_test_measurement(
         if log_file is not None or simulate:
             if log_file is not None:
                 log_request_to_file(log_file, "UpdateTestMeasurement", request)
-            return self.simulate_update_test_measurement_response(request, existing=existing)
+            return self._mark_simulated(
+                self.simulate_update_test_measurement_response(request, existing=existing)
+            )
 
         response = await self._grpc_client.get_stub(TestReportServiceStub).UpdateTestMeasurement(
             request
@@ -1053,13 +1072,17 @@ async def _replay_update_report(
         id_map: dict[str, str],
         state: _ReplayState,
     ) -> None:
-        if state.report is None:
-            raise ValueError("UpdateTestReport found before CreateTestReport")
         request = UpdateTestReportRequest()
         json_format.Parse(json_str, request)
         request.test_report.test_report_id = self._map_id(
             id_map, request.test_report.test_report_id
         )
+        # Batch/simulate replays the whole log in order, so a missing report means
+        # the log is malformed. Incremental replay may have created the report on an
+        # earlier tick (its real ID lives in id_map), so state.report is legitimately
+        # None here -- the mapped ID is enough to issue the update.
+        if simulate and state.report is None:
+            raise ValueError("UpdateTestReport found before CreateTestReport")
         state.report = await self.update_test_report(
             request=request, simulate=simulate, existing=state.report
         )
@@ -1184,6 +1207,7 @@ async def _incremental_import_log_file(self, log_path: Path) -> ReplayResult:
         next tick.
         """
         tracking = LogTracking.load(log_path)
+        resuming = tracking.last_uploaded_line > 0
         id_map = tracking.id_map
         state = _ReplayState()
 
@@ -1202,7 +1226,10 @@ async def _incremental_import_log_file(self, log_path: Path) -> ReplayResult:
             tracking.last_uploaded_line += 1
             tracking.save(log_path)
 
-        if state.report is None:
+        # On a resume tick the CreateTestReport line was consumed on an earlier
+        # tick, so state.report is expected to be None; the report already exists
+        # on the server. Only a genuine first pass over an empty log is an error.
+        if state.report is None and not resuming:
             raise ValueError("No CreateTestReport found in log file")
 
         return ReplayResult(
diff --git a/python/lib/sift_client/_internal/pyproject_config.py b/python/lib/sift_client/_internal/pyproject_config.py
new file mode 100644
index 000000000..6a8bd177b
--- /dev/null
+++ b/python/lib/sift_client/_internal/pyproject_config.py
@@ -0,0 +1,84 @@
+"""Loader for the ``[tool.sift]`` table in a project's ``pyproject.toml``.
+
+The pytest plugin consumes this loader to resolve report-content config (under
+``[tool.sift.pytest.report]``) and SDK-level fallbacks (URIs under
+``[tool.sift]``). A malformed or missing ``pyproject.toml`` returns ``{}`` so a
+bad config file never aborts the session — the plugin falls back to its
+built-in defaults and surfaces a single warning.
+"""
+
+from __future__ import annotations
+
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+# ``tomllib`` landed in 3.11; ``tomli`` is the same parser packaged for older
+# interpreters and is declared as a conditional install dep on 3.8-3.10.
+try:
+    import tomllib  # type: ignore[import-not-found,unused-ignore]
+except ImportError:  # pragma: no cover - exercised on 3.8-3.10 only
+    import tomli as tomllib  # type: ignore[no-redef,import-not-found,unused-ignore]
+
+if TYPE_CHECKING:
+    import pytest
+
+
+# Bound the upward walk so a misconfigured environment can't trigger an
+# unbounded filesystem traversal looking for a project root that isn't there.
+_MAX_PARENT_WALK = 3
+
+
+def _find_pyproject(config: pytest.Config) -> Path | None:
+    """Return the path of the ``pyproject.toml`` governing this pytest run.
+
+    Candidates are tried in order and the first existing file wins:
+
+    1. ``config.inipath``, if pytest's ini file is itself a ``pyproject.toml``.
+    2. ``pyproject.toml`` directly inside the resolved ``config.rootpath``.
+    3. ``pyproject.toml`` in each of the next ``_MAX_PARENT_WALK`` ancestor
+       directories, nearest first (monorepo layouts where the rootdir is a
+       subdirectory of the project).
+
+    Returns ``None`` when none of the candidates exists.
+    """
+    ini_file = config.inipath
+    if ini_file is not None:
+        if ini_file.name == "pyproject.toml" and ini_file.is_file():
+            return ini_file
+    directory = Path(config.rootpath).resolve()
+    # ``range(N + 1)`` covers the rootdir itself plus N ancestors above it.
+    for _ in range(_MAX_PARENT_WALK + 1):
+        found = directory / "pyproject.toml"
+        if found.is_file():
+            return found
+        directory = directory.parent
+    return None
+
+
+def load_tool_sift(config: pytest.Config) -> dict[str, Any]:
+    """Return the parsed ``[tool.sift]`` table from the project's pyproject.toml.
+
+    Returns ``{}`` when no pyproject is discoverable, when ``tool`` / ``tool.sift``
+    is absent or not a table, or when reading fails. A read failure emits one
+    :class:`SiftPytestPluginWarning` so the session continues with defaults.
+    """
+    pyproject = _find_pyproject(config)
+    if pyproject is None:
+        return {}
+    try:
+        with pyproject.open("rb") as fh:
+            data = tomllib.load(fh)
+    except (OSError, ValueError) as exc:
+        # ``ValueError`` covers TOMLDecodeError and non-UTF-8 UnicodeDecodeError.
+        # Deferred import: ``pytest_plugin`` imports this loader (import cycle).
+        from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+        warnings.warn(
+            f"Failed to read {pyproject} for [tool.sift]: {type(exc).__name__}: {exc}",
+            SiftPytestPluginWarning,
+            stacklevel=2,
+        )
+        return {}
+    sift = data["tool"].get("sift") if isinstance(data.get("tool"), dict) else None
+    return sift if isinstance(sift, dict) else {}
diff --git a/python/lib/sift_client/_internal/pytest_plugin/__init__.py b/python/lib/sift_client/_internal/pytest_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/lib/sift_client/_internal/pytest_plugin/modes.py b/python/lib/sift_client/_internal/pytest_plugin/modes.py
new file mode 100644
index 000000000..317bcfa96
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/modes.py
@@ -0,0 +1,68 @@
+"""Run-mode detection and the per-test Sift gate.
+
+Resolves the active mode (disabled > offline > online) from the ``DISABLED_OPTION`` /
+``OFFLINE_OPTION`` options, and decides whether the Sift autouse fixtures activate for
+a given node via the ``sift_include`` / ``sift_exclude`` markers.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from sift_client._internal.pytest_plugin.options import (
+    AUTOUSE_OPTION,
+    DISABLED_OPTION,
+    OFFLINE_OPTION,
+)
+
+if TYPE_CHECKING:
+    import pytest
+
+
+def is_offline(pytestconfig: pytest.Config | None) -> bool:
+    return True if OFFLINE_OPTION.resolve(pytestconfig) else False  # truthy => offline mode
+
+
+def is_disabled(pytestconfig: pytest.Config | None) -> bool:
+    return True if DISABLED_OPTION.resolve(pytestconfig) else False  # truthy => Sift disabled
+
+
+def sdk_version() -> str:
+    """Look up the ``sift_stack_py`` distribution version (``"unknown"`` if absent)."""
+    import importlib.metadata as dist_metadata
+
+    try:
+        return dist_metadata.version("sift_stack_py")
+    except dist_metadata.PackageNotFoundError:
+        return "unknown"
+
+
+def mode_label(config: pytest.Config) -> str:
+    """Name the run mode for the terminal header (disabled beats offline beats online)."""
+    # Precedence is encoded by check order: the first matching mode wins.
+    if is_disabled(config):
+        return "disabled"
+    mode = "offline" if is_offline(config) else "online"
+    return mode
+
+
+def sift_enabled_for(node: pytest.Item | pytest.Collector, default: bool) -> bool:
+    """Decide whether Sift is active for ``node``.
+
+    Precedence: a ``sift_exclude`` marker disables, else a ``sift_include``
+    marker enables, else ``default`` is returned unchanged. Markers are looked
+    up with ``node.get_closest_marker``.
+    """
+    for marker_name, verdict in (("sift_exclude", False), ("sift_include", True)):
+        if node.get_closest_marker(marker_name):
+            return verdict
+    return default
+
+
+def gate_enabled(node: pytest.Item | pytest.Collector, config: pytest.Config) -> bool:
+    """Tell whether the Sift autouse fixtures apply to ``node``.
+
+    The resolved ``AUTOUSE_OPTION`` value is the default; markers can override it.
+    """
+    autouse_default = True if AUTOUSE_OPTION.resolve(config) else False
+    return sift_enabled_for(node, autouse_default)
diff --git a/python/lib/sift_client/_internal/pytest_plugin/options.py b/python/lib/sift_client/_internal/pytest_plugin/options.py
new file mode 100644
index 000000000..c3b6801a1
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/options.py
@@ -0,0 +1,579 @@
+"""Declarative settings registry for the Sift pytest plugin.
+
+Every plugin setting is declared once as an :class:`Option` in the ``PLUGIN_OPTIONS``
+registry. That single registry drives ``pytest_addoption``, value resolution,
+the docs settings-reference table, and the unknown-key typo detector, so a
+setting is added or changed in one place instead of wired up across several.
+"""
+
+from __future__ import annotations
+
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Any
+
+import pytest
+
+from sift_client._internal.pyproject_config import load_tool_sift
+
+# Settings-reference categories. Each maps to a docs subsection and, in the
+# renderer, to the column subset that category actually uses.
+CAT_BEHAVIOR = "Pytest behavior"
+CAT_CONNECTION = "Connection"
+CAT_REPORT = "Report content"
+CATEGORIES = (CAT_BEHAVIOR, CAT_CONNECTION, CAT_REPORT)
+
+tool_sift_key = pytest.StashKey[dict]()
+
+
+def tool_sift(config: pytest.Config | None) -> dict[str, Any]:
+    """Return the ``[tool.sift]`` table, parsing pyproject.toml at most once.
+
+    The parsed table is memoized in ``config.stash`` under ``tool_sift_key``,
+    so repeated lookups reuse it instead of calling ``load_tool_sift`` (and
+    re-emitting its malformed-file warning) again. A ``None`` config yields
+    ``{}`` without touching any cache.
+    """
+    if config is None:
+        return {}
+    table = config.stash.get(tool_sift_key, None)
+    if table is not None:
+        return table
+    table = config.stash[tool_sift_key] = load_tool_sift(config)
+    return table
+
+
+@dataclass(frozen=True)
+class Option:
+    """A single setting plus the logic to resolve it from wherever it can be set.
+
+    A setting may come from an env var, a CLI flag, a pytest ini key, or a
+    ``[tool.sift...]`` TOML path. :meth:`resolve` walks the declared surfaces in
+    env > cli > ini > toml order; ``merge=True`` marks a free-form TOML table,
+    read via :meth:`resolve_merged`. The ``PLUGIN_OPTIONS`` registry of these
+    drives ``pytest_addoption``, the docs reference table, and the typo detector.
+
+    Declare only the surface fields a setting uses:
+
+    - ``cli`` / ``cli_action``: CLI flag and argparse action (``cli_dest`` derived).
+    - ``ini`` / ``ini_type`` / ``ini_default``: pytest ini key + type/default.
+    - ``toml``: tuple path under ``[tool.sift...]``, e.g.
+      ``("pytest", "report", "name")`` -> ``tool.sift.pytest.report.name``.
+    - ``env``: full env var name, e.g. ``"SIFT_API_KEY"``.
+
+    ``category`` groups the option in the docs reference (one of ``CATEGORIES``).
+    """
+
+    name: str
+    help: str
+    category: str
+    cli: str | None = None
+    cli_action: str | None = None
+    ini: str | None = None
+    ini_type: str | None = None
+    ini_default: Any = None
+    toml: tuple[str, ...] | None = None
+    env: str | None = None
+    merge: bool = False
+
+    @property
+    def cli_dest(self) -> str:
+        """Argparse ``dest`` for the option.
+
+        When the option has both a CLI flag and an ini key, the dest matches
+        the ini name so ``config.getoption(ini_name)`` returns the CLI value
+        (and falls through to ``config.getini(ini_name)`` when the flag wasn't
+        passed). Without an ini key, the dest derives from the flag name.
+        """
+        if self.ini:
+            return self.ini
+        if self.cli is None:
+            return self.name
+        return self.cli.lstrip("-").replace("-", "_")
+
+    def __post_init__(self) -> None:
+        """Reject inconsistent declarations at construction (raises ``ValueError``)."""
+        if self.cli_action and not self.cli:
+            raise ValueError(f"Option({self.name!r}): cli_action requires cli")
+        if self.ini_type and not self.ini:
+            raise ValueError(f"Option({self.name!r}): ini_type requires ini")
+        if self.merge and not self.toml:
+            raise ValueError(f"Option({self.name!r}): merge=True needs toml")
+        if not any([self.cli, self.ini, self.toml, self.env]):
+            raise ValueError(f"Option({self.name!r}): declares no surfaces")
+        if self.category not in CATEGORIES:
+            raise ValueError(f"Option({self.name!r}): category must be one of {CATEGORIES}")
+
+    def resolve(self, config: pytest.Config | None) -> Any:
+        """First set value from declared surfaces; ``None`` when unset everywhere.
+
+        Walk order is env > cli > ini > toml. No current option declares both
+        env and cli, so the chain isn't ambiguous in practice.
+        ``getini`` returns the typed default for unset bool/list keys, so this
+        only returns ini values for booleans (always meaningful), non-empty
+        strings, and non-empty lists.
+        """
+        if self.env:
+            env_value = os.getenv(self.env)
+            if env_value not in (None, ""):
+                return env_value
+        if config is None:
+            return None
+        if self.cli:
+            cli_value = config.getoption(self.cli_dest, default=None)
+            if cli_value is not None:
+                return cli_value
+        if self.ini:
+            try:
+                ini_value = config.getini(self.ini)
+            except (KeyError, ValueError):
+                ini_value = None
+            if isinstance(ini_value, bool):
+                return ini_value
+            if isinstance(ini_value, str) and ini_value:
+                return ini_value
+            if isinstance(ini_value, list) and ini_value:
+                return ini_value
+        if self.toml:
+            toml_value = _walk_toml(tool_sift(config), self.toml)
+            if toml_value not in (None, ""):
+                return toml_value
+        return None
+
+    def resolve_merged(self, config: pytest.Config | None) -> dict[str, str | float | bool]:
+        """For ``merge=True`` dict-shape settings: the free-form TOML table.
+
+        TOML values that don't fit ``dict[str, str | float | bool]`` (nested
+        tables, lists, ``None``) are dropped with a warning so a malformed
+        entry can't crash report creation.
+        """
+        from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+        result: dict[str, str | float | bool] = {}
+        if config is not None and self.toml:
+            base = _walk_toml(tool_sift(config), self.toml)
+            if isinstance(base, dict):
+                for key, value in base.items():
+                    if not isinstance(key, str):
+                        continue
+                    if isinstance(value, (bool, str, int, float)):
+                        # Tuple order is irrelevant here; ``bool`` is also an ``int`` subclass.
+                        result[key] = value  # type: ignore[assignment]
+                        continue
+                    warnings.warn(
+                        f"[tool.sift.{'.'.join(self.toml)}] entry {key!r} ignored: "
+                        f"unsupported type {type(value).__name__}.",
+                        SiftPytestPluginWarning,
+                        stacklevel=2,
+                    )
+        return result
+
+
+def _walk_toml(data: dict[str, Any], path: tuple[str, ...]) -> Any:
+    """Follow ``path`` key by key through nested tables; ``None`` if it breaks off."""
+    node: Any = data
+    remaining = list(path)
+    while remaining:
+        # A non-table or an absent key (``.get`` gives ``None``) ends the descent.
+        node = node.get(remaining.pop(0)) if isinstance(node, dict) else None
+        if node is None:
+            return None
+    return node
+
+
+# ---------------------------------------------------------------------------
+# Settings registry.
+#
+# Add new options here. The registry drives `pytest_addoption`, resolution,
+# the docs settings-reference table, and the unknown-key typo detector, so a
+# setting is declared once instead of wired up in several places.
+#
+# Where each setting lives follows a few principles:
+#   - Secrets (the API key) come from environment variables only, never a
+#     committed file.
+#   - Pytest behavior lives in [tool.pytest.ini_options] so it integrates with
+#     `pytest --help` / `--co` / `--trace-config`.
+#   - Sift report content lives in [tool.sift.pytest.report.*].
+#   - Non-secret endpoints take an env var plus one static home (ini or toml,
+#     not both).
+#   - A CLI flag is added only when there is a real per-run override workflow;
+#     stable project config stays in ini/toml.
+#   - Dynamic per-run values are injected via environment variables (pytest-dotenv
+#     loads .env for local dev; CI sets the same names from its secret store).
+# ---------------------------------------------------------------------------
+
+# Pytest behavior. The CLI flag survives because the per-run override is real.
+LOG_FILE_OPTION = Option(
+    name="log_file",
+    category=CAT_BEHAVIOR,
+    help="Path to the JSONL log of create/update calls (path | true | false | none).",
+    cli="--sift-log-file",
+    ini="sift_log_file",
+)
+GIT_METADATA_OPTION = Option(
+    name="git_metadata",
+    category=CAT_BEHAVIOR,
+    help="Capture git repo/branch/commit on the report.",
+    cli="--no-sift-git-metadata",
+    cli_action="store_false",
+    ini="sift_git_metadata",
+    ini_type="bool",
+    ini_default=True,
+)
+OFFLINE_OPTION = Option(
+    name="offline",
+    category=CAT_BEHAVIOR,
+    help="Skip the session-start ping; route create/update through the JSONL log.",
+    cli="--sift-offline",
+    cli_action="store_true",
+    ini="sift_offline",
+    ini_type="bool",
+    ini_default=False,
+)
+DISABLED_OPTION = Option(
+    name="disabled",
+    category=CAT_BEHAVIOR,
+    help="Disable Sift entirely (no API calls, no log file). Supersedes --sift-offline.",
+    cli="--sift-disabled",
+    cli_action="store_true",
+    ini="sift_disabled",
+    ini_type="bool",
+    ini_default=False,
+)
+
+OPEN_OPTION = Option(
+    name="open_report",
+    category=CAT_BEHAVIOR,
+    help="Open the resulting report in a browser at session end (online only; "
+    "no-op when the report URL can't be resolved).",
+    cli="--sift-open-report",
+    cli_action="store_true",
+    ini="sift_open_report",
+    ini_type="bool",
+    ini_default=False,
+)
+
+# Pytest behavior: set-once project defaults (no CLI flag, no per-run override).
+AUTOUSE_OPTION = Option(
+    name="autouse",
+    category=CAT_BEHAVIOR,
+    help="Default for the Sift autouse fixtures (report_context, step, hierarchy/parametrize parents).",
+    ini="sift_autouse",
+    ini_type="bool",
+    ini_default=True,
+)
+PACKAGE_STEP_OPTION = Option(
+    name="package_step",
+    category=CAT_BEHAVIOR,
+    help="Open a parent step for each Python package in the test path.",
+    ini="sift_package_step",
+    ini_type="bool",
+    ini_default=True,
+)
+MODULE_STEP_OPTION = Option(
+    name="module_step",
+    category=CAT_BEHAVIOR,
+    help="Open a parent step for each test module.",
+    ini="sift_module_step",
+    ini_type="bool",
+    ini_default=True,
+)
+CLASS_STEP_OPTION = Option(
+    name="class_step",
+    category=CAT_BEHAVIOR,
+    help="Open per-class parent steps, including nested classes.",
+    ini="sift_class_step",
+    ini_type="bool",
+    ini_default=True,
+)
+PARAMETRIZE_NESTING_OPTION = Option(
+    name="parametrize_nesting",
+    category=CAT_BEHAVIOR,
+    help="Cluster parametrized tests under shared parent steps (e.g. test_a -> v=1, v=2).",
+    ini="sift_parametrize_nesting",
+    ini_type="bool",
+    ini_default=True,
+)
+
+# Credentials. The API key is env-only; the URIs accept env + ini.
+API_KEY_OPTION = Option(
+    name="api_key",
+    category=CAT_CONNECTION,
+    help="Sift API key (secret, env-only).",
+    env="SIFT_API_KEY",
+)
+GRPC_URI_OPTION = Option(
+    name="grpc_uri",
+    category=CAT_CONNECTION,
+    help="Sift gRPC endpoint URI.",
+    env="SIFT_GRPC_URI",
+    ini="sift_grpc_uri",
+)
+REST_URI_OPTION = Option(
+    name="rest_uri",
+    category=CAT_CONNECTION,
+    help="Sift REST endpoint URI.",
+    env="SIFT_REST_URI",
+    ini="sift_rest_uri",
+)
+APP_URL_OPTION = Option(
+    name="app_url",
+    category=CAT_CONNECTION,
+    help="Sift web-app origin for the report link in the terminal footer (e.g. "
+    "https://app.siftstack.com). When unset, the link is derived from the REST URI "
+    "for known Sift hosts.",
+    env="SIFT_APP_URL",
+    ini="sift_app_url",
+)
+
+# Report content. Project defaults in [tool.sift.pytest.report]; CI injects
+# per-run values via SIFT_REPORT_* env vars (pytest-dotenv handles .env files
+# for local dev).
+REPORT_NAME_OPTION = Option(
+    name="report_name",
+    category=CAT_REPORT,
+    help="Template for the report display name. Placeholders: {target}, {command}, {args}, "
+    "{rootdir}, {timestamp}, {count}, {git_repo}, {git_branch}, {git_commit}.",
+    toml=("pytest", "report", "name"),
+)
+TEST_CASE_OPTION = Option(
+    name="test_case",
+    category=CAT_REPORT,
+    help="Template for the report's test_case field (same placeholders as report_name).",
+    toml=("pytest", "report", "test_case"),
+)
+TEST_SYSTEM_NAME_OPTION = Option(
+    name="test_system_name",
+    category=CAT_REPORT,
+    help="Name of the test system / rig. Defaults to the host's name.",
+    env="SIFT_REPORT_TEST_SYSTEM_NAME",
+    toml=("pytest", "report", "test_system_name"),
+)
+SYSTEM_OPERATOR_OPTION = Option(
+    name="system_operator",
+    category=CAT_REPORT,
+    help="Operator running the test. Defaults to the OS user.",
+    env="SIFT_REPORT_SYSTEM_OPERATOR",
+    toml=("pytest", "report", "system_operator"),
+)
+SERIAL_NUMBER_OPTION = Option(
+    name="serial_number",
+    category=CAT_REPORT,
+    help="Serial number of the unit under test.",
+    env="SIFT_REPORT_SERIAL_NUMBER",
+    toml=("pytest", "report", "serial_number"),
+)
+PART_NUMBER_OPTION = Option(
+    name="part_number",
+    category=CAT_REPORT,
+    help="Part number of the unit under test.",
+    env="SIFT_REPORT_PART_NUMBER",
+    toml=("pytest", "report", "part_number"),
+)
+METADATA_OPTION = Option(
+    name="metadata",
+    category=CAT_REPORT,
+    help="Free-form report metadata, as a TOML table of scalar values. For "
+    "dynamic per-run keys, attach them in conftest via the report_context fixture.",
+    toml=("pytest", "report", "metadata"),
+    merge=True,
+)
+
+PLUGIN_OPTIONS: tuple[Option, ...] = (
+    LOG_FILE_OPTION,
+    GIT_METADATA_OPTION,
+    OFFLINE_OPTION,
+    DISABLED_OPTION,
+    OPEN_OPTION,
+    AUTOUSE_OPTION,
+    PACKAGE_STEP_OPTION,
+    MODULE_STEP_OPTION,
+    CLASS_STEP_OPTION,
+    PARAMETRIZE_NESTING_OPTION,
+    API_KEY_OPTION,
+    GRPC_URI_OPTION,
+    REST_URI_OPTION,
+    APP_URL_OPTION,
+    REPORT_NAME_OPTION,
+    TEST_CASE_OPTION,
+    TEST_SYSTEM_NAME_OPTION,
+    SYSTEM_OPERATOR_OPTION,
+    SERIAL_NUMBER_OPTION,
+    PART_NUMBER_OPTION,
+    METADATA_OPTION,
+)
+
+
+def register_options(parser: pytest.Parser) -> None:
+    """Declare the CLI flags and ini keys of all ``PLUGIN_OPTIONS`` on ``parser``.
+
+    CLI flags go into the ``sift`` option group with a ``None`` default, so
+    ``config.getoption`` yields ``None`` for a flag that was not passed. Ini
+    keys are added with the option's ``ini_default``. ``action`` / ``type`` are
+    forwarded only when the option declares them.
+    """
+    sift_group = parser.getgroup("sift", description="Sift test results")
+    for option in PLUGIN_OPTIONS:
+        # CLI surface: only options that declare a flag.
+        if option.cli is not None:
+            flag_kwargs: dict[str, Any] = dict(dest=option.cli_dest, default=None, help=option.help)
+            if option.cli_action is not None:
+                flag_kwargs.update(action=option.cli_action)
+            sift_group.addoption(option.cli, **flag_kwargs)
+        # Ini surface: only options that declare an ini key.
+        if option.ini is not None:
+            key_kwargs: dict[str, Any] = dict(help=option.help, default=option.ini_default)
+            if option.ini_type is not None:
+                key_kwargs.update(type=option.ini_type)
+            parser.addini(option.ini, **key_kwargs)
+
+
+def render_settings_reference() -> str:
+    """Build the Markdown settings reference from ``PLUGIN_OPTIONS``.
+
+    Emits one ``### <category>`` table per non-empty category, in ``CATEGORIES``
+    order; the first column is the option's help text, the rest are only the
+    surfaces that category uses. ``docs/guides/pytest_plugin/configuration.md``
+    embeds this output verbatim, guarded by
+    ``test_settings_reference_docs_in_sync``. Regenerate with::
+
+        uv run python -c "from sift_client._internal.pytest_plugin.options import render_settings_reference; print(render_settings_reference())"
+    """
+
+    dash = "—"
+
+    def _code(value: str | None) -> str:
+        # Inline-code a declared surface; an em dash marks an undeclared one.
+        return f"`{value}`" if value else dash
+
+    def _cli_cell(opt: Option) -> str:
+        return _code(opt.cli)
+
+    def _ini_cell(opt: Option) -> str:
+        return _code(opt.ini)
+
+    def _env_cell(opt: Option) -> str:
+        return _code(opt.env)
+
+    def _toml_cell(opt: Option) -> str:
+        if not opt.toml:
+            return dash
+        # Free-form table: reference the table itself, not a key inside it.
+        if opt.merge:
+            return f"`[tool.sift.{'.'.join(opt.toml)}]` (table)"
+        *table_path, leaf = opt.toml
+        return f"`[tool.sift.{'.'.join(table_path)}] {leaf}`"
+
+    def _row(cells: list[str]) -> str:
+        return "| " + " | ".join(cells) + " |"
+
+    cli_column = ("CLI flag", _cli_cell)
+    ini_column = ("Ini (`[tool.pytest.ini_options]`)", _ini_cell)
+    toml_column = ("TOML (`[tool.sift...]`)", _toml_cell)
+    env_column = ("Env var", _env_cell)
+    # (header, cell-renderer) pairs per category: only the surfaces that
+    # category actually uses, so no column is all dashes.
+    layout = {
+        CAT_BEHAVIOR: (cli_column, ini_column),
+        CAT_CONNECTION: (ini_column, env_column),
+        CAT_REPORT: (toml_column, env_column),
+    }
+
+    sections: list[str] = []
+    for category in CATEGORIES:
+        members = [opt for opt in PLUGIN_OPTIONS if opt.category == category]
+        if not members:
+            continue
+        columns = layout[category]
+        headers = ["Setting"] + [title for title, _ in columns]
+        table = [
+            f"### {category}",
+            "",
+            _row(headers),
+            "|" + "|".join("---" for _ in headers) + "|",
+        ]
+        for opt in members:
+            raw_cells = [opt.help] + [render(opt) for _, render in columns]
+            # A literal pipe inside a cell would be parsed as a column
+            # separator by Markdown, so backslash-escape it.
+            table.append(_row([cell.replace("|", "\\|") for cell in raw_cells]))
+        sections.append("\n".join(table))
+    return "\n\n".join(sections)
+
+
+def warn_on_unknown_env_vars() -> None:
+    """Warn for each ``SIFT_``-prefixed env var that no registered option declares.
+
+    Names are compared against every ``opt.env`` in ``PLUGIN_OPTIONS``; when
+    ``difflib`` finds a close declared name it is appended as a suggestion.
+    """
+    import difflib
+
+    from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+    declared = {opt.env for opt in PLUGIN_OPTIONS if opt.env}
+    candidates = sorted(declared)
+    # Sorted so warnings come out in a stable order regardless of env ordering.
+    prefixed = [var for var in sorted(os.environ) if var.startswith("SIFT_")]
+    for name in prefixed:
+        if name not in declared:
+            # Suggest the nearest declared name, if any is similar enough.
+            close = difflib.get_close_matches(name, candidates, n=1, cutoff=0.6)
+            hint = "" if not close else f" (did you mean `{close[0]}`?)"
+            warnings.warn(
+                f"Unknown SIFT_* env var `{name}`{hint}; ignored.",
+                SiftPytestPluginWarning,
+                stacklevel=2,
+            )
+
+
+def warn_on_unknown_toml_keys(config: pytest.Config) -> None:
+    """Walk ``[tool.sift.pytest.*]`` in pyproject.toml and warn on keys outside the registry.
+
+    Only the ``tool.sift.pytest`` subtree is checked; other ``tool.sift.*``
+    subtrees are reserved for non-pytest Sift tooling (e.g. ``tool.sift.extras``
+    feeds this repo's extras-generation script). Free-form subtrees
+    (``merge=True`` options like ``metadata``) stop the walk: their keys are
+    user-defined. A non-table ``tool.sift`` / ``tool.sift.pytest`` is ignored.
+    """
+    import difflib
+
+    from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+    data = tool_sift(config)
+    pytest_table = data.get("pytest") if isinstance(data, dict) else None
+    if not isinstance(pytest_table, dict):
+        return
+    # Build leaf/free-form/prefix sets relative to the ``("pytest", ...)`` root
+    # the registry already uses, so the walk runs on the table we just sliced.
+    leaves = {opt.toml for opt in PLUGIN_OPTIONS if opt.toml and not opt.merge}
+    free_form = {opt.toml for opt in PLUGIN_OPTIONS if opt.toml and opt.merge}
+    prefixes: set[tuple[str, ...]] = set()
+    for full in leaves | free_form:
+        for i in range(len(full)):
+            prefixes.add(full[:i])
+
+    def _walk(node: Any, base: tuple[str, ...]) -> None:
+        if base in free_form or not isinstance(node, dict):
+            return
+        for key, value in node.items():
+            path = (*base, str(key))
+            if path in leaves or path in free_form:
+                continue
+            if path in prefixes:
+                _walk(value, path)
+                continue
+            full_name = "tool.sift." + ".".join(path)
+            same_depth = [
+                ".".join(p) for p in (leaves | free_form | prefixes) if len(p) == len(path)
+            ]
+            close = difflib.get_close_matches(".".join(path), same_depth, n=1, cutoff=0.6)
+            hint = f" (did you mean `tool.sift.{close[0]}`?)" if close else ""
+            warnings.warn(
+                f"Unknown sift config key `{full_name}`{hint}; ignored.",
+                SiftPytestPluginWarning,
+                stacklevel=2,
+            )
+
+    _walk(pytest_table, ("pytest",))
diff --git a/python/lib/sift_client/_internal/pytest_plugin/report.py b/python/lib/sift_client/_internal/pytest_plugin/report.py
new file mode 100644
index 000000000..e125c3e03
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/report.py
@@ -0,0 +1,499 @@
+"""Report construction, status resolution, and step creation.
+
+Builds the session ``ReportContext`` from resolved settings (name/test_case
+templates, log-file mode, credentials for disabled mode), resolves a function
+step's status from pytest's per-phase reports, and finalizes after teardown.
+``report_context_impl`` is a pure generator that yields the context; the
+plugin's ``report_context`` fixture owns the module-level ``REPORT_CONTEXT``.
+"""
+
+from __future__ import annotations
+
+import os
+import warnings
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.pytest_plugin.modes import is_offline
+from sift_client._internal.pytest_plugin.options import (
+    GIT_METADATA_OPTION,
+    LOG_FILE_OPTION,
+    METADATA_OPTION,
+    PARAMETRIZE_NESTING_OPTION,
+    PART_NUMBER_OPTION,
+    REPORT_NAME_OPTION,
+    SERIAL_NUMBER_OPTION,
+    SYSTEM_OPERATOR_OPTION,
+    TEST_CASE_OPTION,
+    TEST_SYSTEM_NAME_OPTION,
+)
+from sift_client._internal.pytest_plugin.steps import (
+    finalize_parents,
+    parametrize_path_key,
+    strip_param,
+)
+from sift_client.sift_types.test_report import ErrorInfo, TestStatus
+from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import (
+    _git_metadata,
+    format_assertion_message,
+    format_truncated_traceback,
+)
+
+if TYPE_CHECKING:
+    from sift_client.util.test_results.context_manager import NewStep
+
+
+def resolve_real_report_id(context: Any) -> str | None:
+    """Find the server-side id of the session report for the online footer link.
+
+    A report created directly against the API (synchronous online mode,
+    ``--sift-log-file=false``) already carries its real id in ``report.id_``.
+    In the default incremental mode the report goes through the simulate path
+    and only has a client-side UUID; the background worker maps it to the real
+    id on replay and records the pair in the ``<log>.tracking`` sidecar's
+    ``id_map``. This runs after the session-scoped report context has torn down
+    and the worker has drained, so the sidecar is final.
+
+    Returns ``None`` when no id was assigned, when no log file is available, or
+    when the worker never mapped the report (e.g. it died before the create).
+    """
+    report = context.report
+    if not report.id_:
+        # Unset or empty id: there is nothing to link.
+        return None
+    client_side_id = str(report.id_)
+    if getattr(report, "is_simulated", False):
+        tracked_log = getattr(context, "log_file", None)
+        if tracked_log is None:
+            return None
+        from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+
+        return LogTracking.load(tracked_log).id_map.get(client_side_id)
+    return client_side_id
+
+
+def resolve_report_link(context: Any, offline: bool) -> tuple[str | None, str | None]:
+    """Compute the ``(report_id, report_url)`` pair shown in the terminal footer.
+
+    Offline runs never upload, so both values are ``None``. Online, the id comes
+    from ``resolve_real_report_id`` and the URL is built only when both the id
+    and the client's ``app_url`` are truthy. Truthiness (not ``is not None``) is
+    deliberate: a resolved-but-empty id (degenerate sidecar mapping, unset proto
+    field) must read as "not uploaded" instead of yielding a bare ``/test-results/`` link.
+    """
+    if offline:
+        return None, None
+    report_id = resolve_real_report_id(context)
+    # Short-circuit: ``app_url`` is only consulted once the id is known to be truthy.
+    if not (report_id and context.client.app_url):
+        return report_id, None
+    return report_id, f"{context.client.app_url}/test-results/{report_id}"
+
+
+def error_info_from_longrepr(longrepr: Any) -> ErrorInfo:
+    """Wrap a report's longrepr in an ``ErrorInfo`` when no exception object exists."""
+    return ErrorInfo(error_code=1, error_message="" if longrepr is None else str(longrepr))
+
+
+def resolve_initial_status(new_step: NewStep, item: pytest.Item) -> None:
+    """Resolve the function step's status from pytest's per-phase reports.
+
+    Reads ``item._sift_phase_setup`` / ``item._sift_phase_call`` and the closest
+    ``xfail`` marker. When a status is decided, writes ``status`` (and
+    ``error_info``, if any) into ``new_step.current_step`` in place and sets
+    ``new_step._sift_managed_externally = True`` so ``NewStep.__exit__`` emits the
+    resolved status without re-classifying.
+
+    Returns without touching the step when it never opened, or when the call
+    phase plainly ``passed``; the default ``__exit__`` resolution stays in charge.
+    """
+    current_step = new_step.current_step
+    if current_step is None:
+        # The step never opened (the autouse fixture short-circuited or was
+        # disabled). Nothing to resolve.
+        return
+    setup_phase = getattr(item, "_sift_phase_setup", None)
+    call_phase = getattr(item, "_sift_phase_call", None)
+    xfail_marker = item.get_closest_marker("xfail")
+    xfail_runs = xfail_marker.kwargs.get("run", True) if xfail_marker is not None else True
+
+    status: TestStatus | None = None
+    error_info: ErrorInfo | None = None
+
+    if setup_phase is not None and setup_phase.report.outcome == "failed":
+        status = TestStatus.ERROR
+        excinfo = setup_phase.call.excinfo
+        if excinfo is not None:
+            error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+        else:
+            error_info = error_info_from_longrepr(setup_phase.report.longrepr)
+    elif setup_phase is not None and setup_phase.report.outcome == "skipped":
+        status = TestStatus.SKIPPED
+    elif call_phase is None:
+        # Setup completed but the call-phase report never fired; the session was
+        # aborted (e.g. by KeyboardInterrupt) before the plugin could observe the
+        # outcome. Resolve to ABORTED rather than leaving it IN_PROGRESS, since the
+        # test was cut off and a finalized report should not carry a step that
+        # still reads as in-progress. No call ``excinfo`` exists here, so there is
+        # no traceback to attach.
+        status = TestStatus.ABORTED
+    else:
+        wasxfail = getattr(call_phase.report, "wasxfail", None)
+        if wasxfail is not None:
+            if call_phase.report.outcome == "failed":
+                # Strict xpass: an xfail(strict=True) test unexpectedly passed, so pytest
+                # reports a failure. NOTE(review): confirm ``wasxfail`` is set in that case.
+                status = TestStatus.FAILED
+            elif call_phase.report.outcome == "skipped":
+                if xfail_marker is not None and xfail_runs is False:
+                    # xfail(run=False): the test body never executed.
+                    status = TestStatus.SKIPPED
+                else:
+                    # xfail + expected failure: the test fulfilled its xfail expectation.
+                    status = TestStatus.PASSED
+            else:
+                # Non-strict xpass: passes that weren't required to fail.
+                status = TestStatus.PASSED
+        elif call_phase.report.outcome == "passed":
+            # Default __exit__ resolves PASSED/FAILED from open_step_results and any
+            # status the test code may have set. Don't override it here.
+            return
+        elif call_phase.report.outcome == "skipped":
+            status = TestStatus.SKIPPED
+        elif call_phase.report.outcome == "failed":
+            excinfo = call_phase.call.excinfo
+            children_passed = new_step.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
+            if excinfo is None:
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, AssertionError):
+                status = TestStatus.FAILED
+                error_info = format_assertion_message(excinfo.type, excinfo.value)
+            elif isinstance(excinfo.value, pytest.fail.Exception):
+                status = TestStatus.FAILED
+            elif isinstance(excinfo.value, (KeyboardInterrupt, SystemExit)):
+                # Hard exits the plugin can observe: pytest converted the
+                # raise into a call-phase report. The session-aborting variant
+                # (call_phase is None) lands in the branch above, also ABORTED.
+                status = TestStatus.ABORTED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif xfail_marker is not None:
+                # xfail(raises=X) with a non-matching exception: the contract failed.
+                status = TestStatus.FAILED
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+            elif not children_passed:
+                # A substep already recorded the error and carries the traceback;
+                # the test step only inherits the child-failed signal.
+                status = TestStatus.FAILED
+            else:
+                status = TestStatus.ERROR
+                error_info = format_truncated_traceback(excinfo.type, excinfo.value, excinfo.tb)
+
+    if status is None:
+        return
+
+    # BaseType is frozen; mutate via __dict__ the same way _apply_client_to_instance does.
+    current_step.__dict__["status"] = status
+    if error_info is not None:
+        current_step.__dict__["error_info"] = error_info
+    new_step._sift_managed_externally = True
+
+
+def finalize_after_teardown(item: pytest.Item, teardown_report: pytest.TestReport) -> None:
+    """Flip an already-closed PASSED step to FAILED when its teardown phase failed.
+
+    The autouse step fixture has exited by the time the teardown makereport hook
+    fires, so ``step.update`` is called again to override the status, and the
+    report context is told to propagate the failure to the still-open parent.
+    """
+    step: NewStep | None = getattr(item, "_sift_step", None)
+    current_step = None if step is None else step.current_step
+    if step is None or current_step is None:
+        return
+    teardown_failed = teardown_report.outcome == "failed"
+    if not (teardown_failed and current_step.status == TestStatus.PASSED):
+        return
+    current_step.update({"status": TestStatus.FAILED})
+    step.report_context.mark_step_failed_after_close(current_step)
+
+
+def _relativize(path: Path, rootpath: Path) -> str:
+    """Render ``path`` relative to ``rootpath``; use the bare name if it lies outside."""
+    try:
+        relative = path.relative_to(rootpath)
+    except ValueError:
+        return path.name
+    return "" if str(relative) == "." else str(relative)
+
+
+def derive_target(request: pytest.FixtureRequest, args: tuple[str, ...]) -> str:
+    """Describe what was run, from the collected items rather than the command line.
+
+    Collection is the ground truth of selection, independent of flag order,
+    ``-k`` / ``-m`` filters, or which path form was typed. Every value is
+    anchored to the rootdir (project) name so the shape is uniform; granularity
+    narrows with the selection:
+
+    * a single test -> ``project/tests/test_motor.py::test_spin`` (param stripped)
+    * a single file -> ``project/tests/test_motor.py``
+    * many files    -> their common directory, ``project/tests/motor``
+    * whole tree / nothing collected / no common path -> ``project``
+
+    A file or common directory outside rootdir is anchored by its basename only
+    (see ``_relativize``). NOTE(review): that gives ``project/<basename>`` rather
+    than bare ``project``; confirm this is intended. ``args`` is not read here.
+    The verbatim invocation stays available via ``{command}`` / ``pytest_command``.
+    """
+    rootpath = request.config.rootpath
+    root = rootpath.name
+
+    def _anchor(rel: str) -> str:
+        return f"{root}/{rel}" if rel else root
+
+    items = list(getattr(request.session, "items", ()) or ())
+    if not items:
+        return root
+    if len(items) == 1:
+        return _anchor(strip_param(items[0].nodeid))
+    paths = {p for p in (getattr(i, "path", None) for i in items) if p is not None}
+    if not paths:
+        return root
+    if len(paths) == 1:
+        return _anchor(_relativize(next(iter(paths)), rootpath))
+    try:
+        common = Path(os.path.commonpath([str(p) for p in paths]))
+    except ValueError:
+        # e.g. paths on different drives (Windows); fall back to the project.
+        return root
+    return _anchor(_relativize(common, rootpath))
+
+
+def build_template_fields(
+    target: str,
+    command: str,
+    args: tuple[str, ...],
+    request: pytest.FixtureRequest,
+) -> dict[str, Any]:
+    """Assemble the placeholder values available to the name and test_case templates."""
+    collected = getattr(request.session, "items", ()) or ()
+    git_info = _git_metadata() or {}
+    fields: dict[str, Any] = {
+        "target": target,
+        "command": command,
+        "args": " ".join(args),
+        "rootdir": request.config.rootpath.name,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "count": len(collected),
+    }
+    for git_key in ("git_repo", "git_branch", "git_commit"):
+        fields[git_key] = git_info.get(git_key, "")
+    return fields
+
+
+def format_template(
+    template: str,
+    fields: dict[str, Any],
+    *,
+    fallback: str,
+    option_label: str,
+) -> str:
+    """Format ``template`` with ``fields``; on bad input, warn and return ``fallback``.
+
+    A bad template should never block test results from being recorded, so every
+    ``str.format`` rendering error (unknown or positional field, malformed spec,
+    bad attribute/index lookup on a field) becomes a warning plus ``fallback``.
+    """
+    from sift_client.pytest_plugin import SiftPytestPluginWarning
+
+    try:
+        return template.format(**fields)
+    except (KeyError, IndexError, ValueError, AttributeError, TypeError) as exc:
+        warnings.warn(
+            f"Invalid {option_label} template {template!r} ({exc}); using fallback.",
+            SiftPytestPluginWarning,
+            stacklevel=2,
+        )
+        return fallback
+
+
+def resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
+    """Turn the resolved ``--sift-log-file`` / ini value into a ``ReportContext`` log_file.
+
+    The raw option value falls into one of three shapes:
+
+    * Falsy but not ``False`` (``None``, empty string): nothing was configured.
+      Returns ``True``, i.e. the default "use a temp file".
+    * Python ``False``: an explicit disable, typically set in a conftest via
+      ``config.option.sift_log_file = False``. Returns ``None`` so the rest of
+      the pipeline knows to skip logging entirely.
+    * Anything else, compared case-insensitively as a string: ``"true"`` /
+      ``"1"`` select the temp file default (``True``), ``"false"`` / ``"none"``
+      disable (``None``), and any other value is returned as a ``Path``.
+
+    Raises ``pytest.UsageError`` when logging is disabled together with
+    ``--sift-offline``, since offline mode needs the log file as its sole sink.
+    """
+    raw = LOG_FILE_OPTION.resolve(pytestconfig)
+    explicit_off = raw is False
+    textual_off = isinstance(raw, str) and raw.lower() in ("false", "none")
+    # ``is_offline`` is only consulted when logging was actually switched off.
+    if (explicit_off or textual_off) and is_offline(pytestconfig):
+        raise pytest.UsageError(
+            "--sift-log-file=none is incompatible with --sift-offline; offline "
+            "mode requires a log file. Pin one with --sift-log-file=<path>, or "
+            "drop --sift-log-file=none to use a temp file."
+        )
+    if explicit_off:
+        return None
+    if not raw:
+        return True
+    lowered = str(raw).lower()
+    if lowered in ("false", "none"):
+        return None
+    return True if lowered in ("true", "1") else Path(raw)
+
+
+def report_context_impl(
+    sift_client: SiftClient,
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config | None = None,
+) -> Generator[ReportContext, None, None]:
+    """Open the session ``ReportContext`` from the resolved options and yield it."""
+    args = request.config.invocation_params.args
+    # ``target`` is "what ran", derived from the collected items (``derive_target``)
+    # rather than the command line. The display name and test_case default to it;
+    # the verbatim command stays available via {command} and ``pytest_command``.
+    target = derive_target(request, args)
+    command = "pytest " + " ".join(args) if args else "pytest"
+    fields = build_template_fields(target, command, args, request)
+    name_template = REPORT_NAME_OPTION.resolve(pytestconfig) or "{target} {timestamp}"
+    name = format_template(
+        name_template,
+        fields,
+        fallback=f"{target} {fields['timestamp']}",
+        option_label="sift_report_name",
+    )
+    test_case_template = TEST_CASE_OPTION.resolve(pytestconfig)
+    test_case = (
+        format_template(
+            test_case_template,
+            fields,
+            fallback=target,
+            option_label="sift_test_case",
+        )
+        if test_case_template
+        else target
+    )
+    # Metadata starts from the [tool.sift.pytest.report.metadata] TOML table, and
+    # the auto-recorded pytest_command layers in last so the user can't
+    # accidentally overwrite it.
+    report_metadata: dict[str, str | float | bool] = {
+        **METADATA_OPTION.resolve_merged(pytestconfig),
+        "pytest_command": command,
+    }
+    # Mode → ReportContext flags:
+    #   online (default): log_file=<temp or user path>, replay_log_file=True
+    #   --sift-offline:   log_file=<temp or user path>, replay_log_file=False
+    #   --sift-disabled:  log_file=False,               replay_log_file=False
+    disabled = sift_client._simulate
+    offline = False if disabled else is_offline(pytestconfig)
+    log_file: str | Path | bool | None = False if disabled else resolve_log_file(pytestconfig)
+    include_git_metadata = bool(GIT_METADATA_OPTION.resolve(pytestconfig))
+    with ReportContext(
+        sift_client,
+        name=name,
+        test_case=test_case,
+        test_system_name=TEST_SYSTEM_NAME_OPTION.resolve(pytestconfig) or None,
+        system_operator=SYSTEM_OPERATOR_OPTION.resolve(pytestconfig) or None,
+        serial_number=SERIAL_NUMBER_OPTION.resolve(pytestconfig) or None,
+        part_number=PART_NUMBER_OPTION.resolve(pytestconfig) or None,
+        log_file=log_file,
+        include_git_metadata=include_git_metadata,
+        replay_log_file=not (disabled or offline),
+        metadata=report_metadata,
+    ) as context:
+        try:
+            yield context
+        finally:
+            # Close any report-tree parents still open INSIDE the ReportContext's
+            # ``with`` block, so their final ``__exit__`` update calls are written
+            # to the log file BEFORE the import worker drains. Without this, the
+            # worker exits with a partial backlog and the parent steps are stuck
+            # IN_PROGRESS in the Sift report. Most parents already closed early as
+            # their subtrees finished; this is the backstop for the rest.
+            finalize_parents()
+
+
+# Placeholder credentials for --sift-offline mode when env/ini values are
+# missing. Offline mode never makes network calls, so these only need to be
+# syntactically valid for SiftConnectionConfig; ``.invalid`` hosts cannot resolve.
+OFFLINE_DEFAULTS = {
+    "SIFT_API_KEY": "offline",
+    "SIFT_GRPC_URI": "offline.invalid:0",
+    "SIFT_REST_URI": "http://offline.invalid",
+}
+
+
+def build_disabled_client() -> SiftClient:
+    """Create the stand-in ``SiftClient`` used by ``--sift-disabled`` mode.
+
+    The client is flagged ``_simulate=True`` so test-results writes take the
+    existing low-level simulate path instead of contacting Sift. Its URLs are
+    syntactically valid but unreachable; nothing dials them.
+    """
+    placeholder_config = SiftConnectionConfig(
+        api_key="disabled",
+        grpc_url="disabled.invalid:0",
+        rest_url="http://disabled.invalid",
+    )
+    disabled_client = SiftClient(connection_config=placeholder_config)
+    # Mark after construction; ``report_context_impl`` reads this flag.
+    disabled_client._simulate = True
+    return disabled_client
+
+
+def step_impl(
+    report_context: ReportContext, request: pytest.FixtureRequest
+) -> Generator[NewStep, None, None]:
+    """Open the leaf step for ``request.node``, yield it, then resolve its status."""
+    node = request.node
+    # Items get a parametrize path stashed in ``pytest_itemcollected``; nodes
+    # without one fall back to their node name. The leaf frame (``path[-1]``) is
+    # the test-specific display name; parents are opened by ``_sift_parents``.
+    # With parametrize-nesting disabled, use the bracket-mangled pytest name
+    # (e.g. ``test_a[1]``) so the leaf remains uniquely identifiable.
+    if PARAMETRIZE_NESTING_OPTION.resolve(request.config):
+        path = node.stash.get(parametrize_path_key, ())
+        name = path[-1] if path else str(node.name)
+    else:
+        name = str(node.name)
+    # ``node.obj`` may not exist (e.g., ``pytest.DoctestItem``) or may raise
+    # when accessed; fall back to no description in those cases rather than
+    # erroring out a perfectly valid test. ``getattr``'s default only
+    # suppresses ``AttributeError``; the try/except catches everything else
+    # (RuntimeError from a misbehaving ``__doc__`` descriptor, etc.).
+    try:
+        existing_docstring = getattr(getattr(node, "obj", None), "__doc__", None) or None
+    except Exception:
+        existing_docstring = None
+    # Attach the leaf under the parent ``_sift_parents`` resolved for this item
+    # (None -> a report-root step). ``push=True`` keeps the leaf on the step stack
+    # so any in-test ``substep`` nests under it.
+    parent_ns: NewStep | None = getattr(node, "_sift_parent", None)
+    parent_step = parent_ns.current_step if parent_ns is not None else None
+    with report_context.new_step(
+        name=name,
+        description=existing_docstring,
+        assertion_as_fail_not_error=False,
+        parent=parent_step,
+        push=True,
+    ) as new_step:
+        node._sift_step = new_step
+        yield new_step
+        resolve_initial_status(new_step, node)
diff --git a/python/lib/sift_client/_internal/pytest_plugin/steps.py b/python/lib/sift_client/_internal/pytest_plugin/steps.py
new file mode 100644
index 000000000..26779cb73
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/steps.py
@@ -0,0 +1,453 @@
+"""Report-tree parent steps: an identity-keyed registry built without reordering.
+
+Each test's package/module/class ancestors ("hierarchy" parents) and each
+``@pytest.mark.parametrize`` axis ("parametrize" parents) become parent steps the
+leaf nests under. Parents are kept in identity-keyed registries — created once and
+reused by every descendant regardless of execution order — so the plugin never
+reorders test items. A parent is closed as soon as the last leaf in its subtree
+finishes (``release_finished_leaf``), with ``finalize_parents`` as the session-end
+backstop for anything still open.
+"""
+
+from __future__ import annotations
+
+import warnings
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple
+
+import pytest
+
+from sift_client._internal.pytest_plugin.modes import gate_enabled
+from sift_client._internal.pytest_plugin.options import (
+    CLASS_STEP_OPTION,
+    MODULE_STEP_OPTION,
+    PACKAGE_STEP_OPTION,
+    PARAMETRIZE_NESTING_OPTION,
+)
+
+if TYPE_CHECKING:
+    from typing import Callable
+
+    from sift_client.util.test_results import ReportContext
+    from sift_client.util.test_results.context_manager import NewStep
+
+# --- Report-tree type aliases ---------------------------------------------
+# The plugin juggles a few small tuple/dict shapes for the parent step tree;
+# naming them keeps the signatures below readable. They use ``typing`` generics
+# (not ``list``/``tuple``) because some are evaluated at runtime inside
+# ``StashKey[...]`` subscriptions, where builtin generics fail on Python 3.8.
+#
+# A hierarchy parent's identity is just a ``str`` (the ancestor node's
+# ``nodeid``); a parametrize parent's identity is a ``ParametrizeKey``: the
+# test's param-stripped node id followed by its outer-to-inner axis frames
+# (e.g. ``("pkg/test_m.py::TestC::test_a", "v=1")``).
+ParametrizeKey = Tuple[str, ...]
+# Outer-to-inner display-name axis path stashed per parametrized item
+# (``(originalname, "v=1", ...)``); the leaf is its last frame.
+ParametrizePath = Tuple[str, ...]
+# One collection-tree ancestor: ``(identity, display name, docstring, rendered)``.
+# ``rendered`` is True iff that layer's ``sift_*_step`` ini flag opens a step.
+HierarchyFrame = Tuple[str, str, Optional[str], bool]
+# Outer-to-inner ancestor frames stashed per item.
+HierarchyChain = Tuple[HierarchyFrame, ...]
+# A rendered parent to open, as returned by ``resolved_parents``.
+HierarchyParent = Tuple[str, str, Optional[str]]  # (identity, name, docstring)
+ParametrizeParent = Tuple[ParametrizeKey, str]  # (registry key, frame name)
+# A gated-in leaf's parents: its rendered hierarchy identities and parametrize keys.
+LeafParents = Tuple[List[str], List[ParametrizeKey]]
+
+parametrize_path_key = pytest.StashKey[ParametrizePath]()
+# Per-item stash slots; ``resolved_parents`` rebuilds either value when it is absent.
+hierarchy_key = pytest.StashKey[HierarchyChain]()
+# Chain entries are ``HierarchyFrame`` tuples (defined above). ``identity`` is the
+# node's ``nodeid``: two ancestors at the same depth with the same display name
+# but reached via different paths (e.g., ``proj_a/utils`` and ``proj_b/utils`` in
+# a monorepo) get distinct identities, so they never silently merge. Non-rendered
+# frames open no step; the next rendered descendant attaches to the nearest
+# rendered ancestor instead.
+
+# Module-level registries of open report-tree parent steps, keyed by identity so
+# each is created once and reused by every descendant in any execution order. The
+# leaf step for each test is created under its resolved parent (see
+# ``report.step_impl``), so no global ordering of test items is required. Parents
+# live OUTSIDE ``ReportContext.step_stack`` (created with ``push=False``) and are
+# closed early by ``release_finished_leaf``, or at session end by ``finalize_parents``.
+#
+# Hierarchy parents (packages / modules / classes) keyed by the ancestor node's
+# ``nodeid``:
+hierarchy_parents: dict[str, NewStep] = {}
+# Parametrize parents keyed by ``ParametrizeKey``, so sibling parametrizations of
+# one test share a parent while parametrizations under different
+# tests/classes/modules never collide:
+parametrize_parents: dict[ParametrizeKey, NewStep] = {}
+
+# Remaining descendant leaves per open-able parent, keyed exactly like the
+# registries above. Populated from the collected (and selected) items in
+# ``tally_expected_parents`` and decremented as each test finishes; when a count
+# reaches zero the parent's whole subtree is done and it is closed early (see
+# ``release_finished_leaf``) instead of waiting for session end.
+expected_hierarchy: dict[str, int] = {}
+expected_parametrize: dict[ParametrizeKey, int] = {}
+# Each gated-in leaf's parent identities, so ``release_finished_leaf`` — which
+# only receives a nodeid — knows which counters to decrement.
+leaf_parents: dict[str, LeafParents] = {}
+
+
+def build_parametrize_path(item: pytest.Item) -> ParametrizePath:
+    """Display-name frames for a parametrized item, ordered outermost first.
+
+    Frame 0 is the test's original name; the rest are ``name=value!r`` per axis.
+    ``callspec.params`` is walked in reverse so the TOP decorator ends up outermost.
+    """
+    callspec = getattr(item, "callspec", None)
+    if callspec is None or not callspec.params:
+        # Not parametrized: no frames at all.
+        return ()
+    test_name = getattr(item, "originalname", item.name)
+    # Reverse the stored axis order before rendering each frame.
+    axes = tuple(f"{axis}={val!r}" for axis, val in reversed(callspec.params.items()))
+    return (test_name, *axes)
+
+
+def build_hierarchy_chain(
+    item: pytest.Item | pytest.Collector,
+    config: pytest.Config,
+) -> HierarchyChain:
+    """Outer-to-inner ``(identity, name, docstring, rendered)`` for collection ancestors.
+
+    Walks ``item.parent`` upward and ALWAYS collects every ``pytest.Package``,
+    ``pytest.Module``, and ``pytest.Class`` ancestor; they all carry the identity
+    that keeps the report tree coherent across tests, so two same-named ancestors
+    reached via different paths (e.g., ``proj_a/utils`` and ``proj_b/utils`` in a
+    monorepo where the ``proj_*`` dirs are ``pytest.Dir`` nodes the walker skips)
+    cannot silently merge.
+
+    ``identity`` is ``node.nodeid``. ``rendered`` is True iff the layer's ini flag
+    is on (``sift_package_step`` / ``sift_module_step`` / ``sift_class_step``);
+    non-rendered frames carry identity but don't open a Sift step.
+
+    The ``node.obj`` access is a pytest property that imports the underlying
+    Python object and can raise *any* exception (ImportError, custom
+    metaclass errors, descriptor ``__doc__`` properties that throw). Guard
+    broadly so a misbehaving collector doesn't abort the whole collection
+    phase; that frame's docstring just becomes ``None``.
+    """
+    include_package = bool(PACKAGE_STEP_OPTION.resolve(config))
+    include_module = bool(MODULE_STEP_OPTION.resolve(config))
+    include_class = bool(CLASS_STEP_OPTION.resolve(config))
+
+    chain: list[HierarchyFrame] = []
+    # ``node.parent`` is typed as the internal ``_pytest.nodes.Node`` which
+    # isn't part of pytest's public API; widen to ``Any`` for the walk.
+    node: Any = item
+    while node is not None:
+        # Test ``Package`` before ``Module``: on pytest < 8 ``Package`` subclasses
+        # ``Module``, so the reverse order would apply the module flag to packages.
+        if isinstance(node, pytest.Class):
+            rendered = include_class
+        elif isinstance(node, pytest.Package):
+            rendered = include_package
+        elif isinstance(node, pytest.Module):
+            rendered = include_module
+        else:
+            node = node.parent
+            continue
+        try:
+            doc = (
+                (getattr(node, "obj", None) and getattr(node.obj, "__doc__", None)) or ""
+            ).strip() or None
+        except Exception:
+            doc = None
+        chain.append((node.nodeid, node.name, doc, rendered))
+        node = node.parent
+    return tuple(reversed(chain))
+
+
+def resolved_parents(
+    node: pytest.Item,
+    config: pytest.Config,
+) -> tuple[list[HierarchyParent], list[ParametrizeParent]]:
+    """Compute the rendered report-tree parents of ``node`` — the single source of truth.
+
+    Both ``get_or_create_parent_chain`` (which opens these parents) and the
+    early-close counters in ``tally_expected_parents`` (which count them) go
+    through here, so the two can never key on different identities. The result
+    is ``(hierarchy, parametrize)``, each ordered outer-to-inner:
+
+    * hierarchy: ``(identity, name, doc)`` per rendered package/module/class
+      ancestor, where ``identity`` is the node's ``nodeid`` (the registry key).
+    * parametrize: ``(registry key, frame name)`` for every parametrize frame but
+      the innermost (the leaf is the ``step`` fixture's job). Empty when
+      ``sift_parametrize_nesting`` is off or the item isn't parametrized.
+
+    Prefers the per-item stash written in ``pytest_itemcollected`` and recomputes
+    for items a later hook injected without going through it.
+    """
+    stash = node.stash
+    if hierarchy_key in stash:
+        chain = stash[hierarchy_key]
+    else:
+        chain = build_hierarchy_chain(node, config)
+    # Frames that are not rendered open no step, so they drop out here and the
+    # next rendered descendant hangs off the nearest rendered ancestor.
+    hierarchy = [frame[:3] for frame in chain if frame[3]]
+
+    if not PARAMETRIZE_NESTING_OPTION.resolve(config):
+        return hierarchy, []
+    if parametrize_path_key in stash:
+        path = stash[parametrize_path_key]
+    else:
+        path = build_parametrize_path(node)
+    # Each parent is keyed by the param-stripped test identity plus the frame
+    # prefix up to and including that parent, so sibling params share a parent
+    # while params under different tests never merge.
+    root_key = strip_param(node.nodeid) if path else ""
+    parametrize: list[ParametrizeParent] = [
+        ((root_key, *path[: depth + 1]), frame) for depth, frame in enumerate(path[:-1])
+    ]
+    return hierarchy, parametrize
+
+
+def strip_param(nodeid: str) -> str:
+    """Drop the trailing ``[param]`` from a nodeid, keeping ``file::Class::func``.
+
+    The parametrize id is a variation of the test, not its identity. Everything
+    after the first ``::`` is cut at its first ``[``: class/function names never
+    contain ``[``, so a param id holding ``::`` or nested brackets is removed
+    whole, while brackets in the file path (before the first ``::``) are kept.
+    """
+    path, sep, rest = nodeid.partition("::")
+    prefix, names = (f"{path}{sep}", rest) if sep else ("", path)
+    return prefix + names.split("[", 1)[0]
+
+
+def get_or_create_parent_chain(
+    node: pytest.Item,
+    config: pytest.Config,
+    request: pytest.FixtureRequest,
+) -> NewStep | None:
+    """Return the innermost open report-tree parent for ``node``, opening missing ones.
+
+    Fixture-side entry point: delegates to ``_resolve_parent_chain``, which walks
+    the rendered hierarchy ancestors and then the parametrize axes from
+    ``resolved_parents`` (outer-to-inner) and get-or-creates one parent step per
+    registry identity, so sibling items reuse parents regardless of run order.
+
+    The ``report_context`` fixture is looked up through ``request`` lazily and
+    cached, only when a parent actually has to be created, so excluded items
+    never trigger eager context setup. Returns ``None`` when no rendered parent
+    applies (the leaf becomes a report-root step).
+    """
+    resolved: list[ReportContext] = []
+
+    def lazy_context() -> ReportContext:
+        # First call resolves the fixture; later calls reuse the memoized value.
+        if resolved:
+            return resolved[0]
+        resolved.append(request.getfixturevalue("report_context"))
+        return resolved[0]
+
+    return _resolve_parent_chain(node, config, lazy_context)
+
+
+def resolve_parent_chain_in_context(
+    node: pytest.Item,
+    config: pytest.Config,
+    context: ReportContext,
+) -> NewStep | None:
+    """Resolve ``node``'s parent chain from an already-built ``ReportContext``.
+
+    For the collection-skip path (``pytest_runtest_makereport``), which holds the
+    session context but no ``FixtureRequest``; same create-once registry logic.
+    """
+    def held_context() -> ReportContext:
+        return context
+
+    return _resolve_parent_chain(node, config, held_context)
+
+
+def _resolve_parent_chain(
+    node: pytest.Item,
+    config: pytest.Config,
+    rc: Callable[[], ReportContext],
+) -> NewStep | None:
+    """Get-or-create every report-tree parent of ``node``; return the innermost.
+
+    Shared body of the two public resolvers. Parents come from
+    ``resolved_parents`` (hierarchy first, then parametrize axes, outer-to-inner);
+    each is looked up in its registry and, when missing, opened under the
+    previous parent with ``push=False`` and stored for reuse.
+
+    ``rc`` is called only when a parent actually needs creating, so a caller that
+    passes a lazy getter keeps the "no eager context setup" guarantee. Returns
+    ``None`` when the item has no rendered parent.
+    """
+    hierarchy, parametrize = resolved_parents(node, config)
+    # Flatten both parent kinds into one outer-to-inner plan of
+    # (registry, registry key, kind-specific ``new_step`` kwargs).
+    plan: list[tuple[dict[Any, NewStep], Any, dict[str, Any]]] = [
+        (hierarchy_parents, identity, {"name": name, "description": doc})
+        for identity, name, doc in hierarchy
+    ]
+    plan += [(parametrize_parents, key, {"name": frame}) for key, frame in parametrize]
+
+    parent_step: Any = None  # TestStep of the running innermost parent, or None (root).
+    deepest: NewStep | None = None
+    for registry, registry_key, step_kwargs in plan:
+        opened = registry.get(registry_key)
+        if opened is None:
+            # First descendant to need this parent: open it off the step stack
+            # and remember it so later descendants reuse the same step.
+            opened = rc().new_step(
+                assertion_as_fail_not_error=False,
+                parent=parent_step,
+                push=False,
+                **step_kwargs,
+            )
+            opened.__enter__()
+            registry[registry_key] = opened
+        parent_step = opened.current_step
+        deepest = opened
+
+    return deepest
+
+
+def close_parent(ns: NewStep) -> None:
+    """Exit one open report-tree parent step, overriding its end time if known.
+
+    No-op when ``ns`` has no current step. The end-time override is looked up in
+    ``REPORT_CONTEXT.parent_end_times`` by the step's ``step_path``. Any exception
+    from ``__exit__`` is reported as a ``SiftPytestStepDrainWarning`` instead of
+    propagating, so remaining parents can still be closed.
+    """
+    from sift_client.pytest_plugin import REPORT_CONTEXT, SiftPytestStepDrainWarning
+
+    closing = ns.current_step
+    if closing is None:
+        return
+    if REPORT_CONTEXT is not None:
+        ns._sift_end_time_override = REPORT_CONTEXT.parent_end_times.get(closing.step_path)
+    try:
+        ns.__exit__(None, None, None)
+    except Exception as error:
+        # Downgrade to a warning: one misbehaving parent must not stop the drain.
+        warnings.warn(
+            f"Sift plugin: closing parent step {closing.name!r} raised "
+            f"{type(error).__name__}: {error}",
+            SiftPytestStepDrainWarning,
+            stacklevel=2,
+        )
+
+
+def close_parents_innermost_first(parents: list[NewStep]) -> None:
+    """Sort ``parents`` in place by descending ``step_path`` depth and close each.
+
+    Depth is the number of ``.`` separators in the step path; a parent with no
+    current step sorts last. Closing deepest-first lets a child parent roll its
+    status and finish time up before its own parent resolves.
+    """
+    def depth(ns: NewStep) -> int:
+        return ns.current_step.step_path.count(".") if ns.current_step else -1
+
+    parents.sort(key=depth, reverse=True)
+    for ns in parents:
+        close_parent(ns)
+
+
+
+def finalize_parents() -> None:
+    """Drain every parent still in the registries, closing innermost-first.
+
+    Snapshots the open parametrize and hierarchy parents, clears both registries
+    plus the expected-count maps and ``leaf_parents``, then closes the snapshot.
+    Because state is cleared before closing, a second call is a no-op; this is
+    the session-end backstop for parents ``release_finished_leaf`` left open.
+    """
+    still_open = list(parametrize_parents.values()) + list(hierarchy_parents.values())
+    # Reset all module state first so a repeated call has nothing left to close.
+    for state in (
+        parametrize_parents, hierarchy_parents,
+        expected_hierarchy, expected_parametrize, leaf_parents,
+    ):
+        state.clear()
+    close_parents_innermost_first(still_open)
+
+
+def tally_expected_parents(session: pytest.Session) -> None:
+    """Rebuild the per-parent descendant counts used for mid-session early close.
+
+    Clears ``expected_hierarchy``, ``expected_parametrize`` and ``leaf_parents``,
+    then walks ``session.items``. Items rejected by ``gate_enabled`` are skipped,
+    as are items with no rendered parent (report-root leaves). Every remaining
+    item records its parent keys in ``leaf_parents`` and adds one to the expected
+    count of each of those parents.
+
+    Meant to run once collection and deselection are final (``pytest_collection_finish``).
+    """
+    for state in (expected_hierarchy, expected_parametrize, leaf_parents):
+        state.clear()
+    config = session.config
+    for item in session.items:
+        if not gate_enabled(item, config):
+            continue
+        hierarchy, parametrize = resolved_parents(item, config)
+        h_ids = [parent_id for parent_id, _name, _doc in hierarchy]
+        p_keys = [parent_key for parent_key, _frame in parametrize]
+        if not (h_ids or p_keys):
+            continue  # leaf is a report-root step; no parent to close
+        leaf_parents[item.nodeid] = (h_ids, p_keys)
+        # One more expected descendant for every parent above this leaf.
+        for counts, keys in ((expected_hierarchy, h_ids), (expected_parametrize, p_keys)):
+            for key in keys:
+                counts[key] = counts.get(key, 0) + 1
+
+
+def _decrement_parent_counts(
+    keys: list[Any],
+    expected: dict[Any, int],
+    registry: dict[Any, NewStep],
+    ready: list[NewStep],
+) -> None:
+    """Count one finished descendant against every parent in ``keys``.
+
+    Keys absent from ``expected`` are ignored. A parent with more descendants
+    pending just has its count lowered. When the last one finishes, the key is
+    removed from ``expected`` and ``registry``, and the popped step (if the
+    parent was ever opened) is appended to ``ready`` for the caller to close.
+    """
+    for key in keys:
+        left = expected.get(key)
+        if left is None:
+            continue
+        if left > 1:
+            expected[key] = left - 1
+            continue
+        # Last descendant done: forget the parent and queue it if it was opened.
+        del expected[key]
+        finished = registry.pop(key, None)
+        if finished is not None:
+            ready.append(finished)
+
+
+def release_finished_leaf(nodeid: str) -> None:
+    """Account for one finished item and close any parent whose subtree is done.
+
+    Pops the item's recorded parent keys from ``leaf_parents``; an unknown
+    ``nodeid`` (never tallied, or already released) is ignored. Each hierarchy
+    and parametrize parent of the item is then decremented via
+    ``_decrement_parent_counts``, and every parent that reached zero is closed
+    right away, innermost-first, so a child parent resolves before its own
+    parent. Several levels may close on the same leaf.
+
+    Parents left open here are drained later by ``finalize_parents``.
+    """
+    parents = leaf_parents.pop(nodeid, None)
+    if parents is not None:
+        hierarchy_ids, parametrize_keys = parents
+        closable: list[NewStep] = []
+        for keys, counts, registry in (
+            (hierarchy_ids, expected_hierarchy, hierarchy_parents),
+            (parametrize_keys, expected_parametrize, parametrize_parents),
+        ):
+            _decrement_parent_counts(keys, counts, registry, closable)
+        if closable:
+            close_parents_innermost_first(closable)
diff --git a/python/lib/sift_client/_internal/pytest_plugin/terminal.py b/python/lib/sift_client/_internal/pytest_plugin/terminal.py
new file mode 100644
index 000000000..4f1eee0dd
--- /dev/null
+++ b/python/lib/sift_client/_internal/pytest_plugin/terminal.py
@@ -0,0 +1,231 @@
+"""Terminal-summary formatting for the session-end Sift report panel.
+
+Row writers and colored count/measurement segments used by
+``pytest_terminal_summary``, plus the best-effort browser opener for
+``--sift-open-report``. Color is dropped automatically when the terminal has no
+markup (not a TTY or ``--color=no``), so captured/CI output stays plain text.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from sift_client._internal.pytest_plugin.modes import mode_label, sdk_version
+from sift_client.sift_types.test_report import TestStatus
+from sift_client.util.test_results.context_manager import _quiet_fork_stderr
+
+LABEL_WIDTH = 13
+
+
+def sift_kv(terminalreporter: Any, label: str, value: str, **value_markup: bool) -> None:
+    """Emit one two-space-indented ``label  value`` summary row.
+
+    The label is written bold and left-justified to ``LABEL_WIDTH``; ``value``
+    ends the line and alone receives ``value_markup`` (e.g. ``green=True``).
+    """
+    padded_label = label.ljust(LABEL_WIDTH)
+    terminalreporter.write("  ")
+    terminalreporter.write(padded_label, bold=True)
+    terminalreporter.write_line(value, **value_markup)
+
+
+# ``(status, label)`` pairs, in the order ``step_count_segments`` lists them.
+STEP_COUNT_ORDER: tuple[tuple[TestStatus, str], ...] = (
+    (TestStatus.PASSED, "passed"),
+    (TestStatus.FAILED, "failed"),
+    (TestStatus.ERROR, "error"),
+    (TestStatus.ABORTED, "aborted"),
+    (TestStatus.SKIPPED, "skipped"),
+    (TestStatus.IN_PROGRESS, "in progress"),
+)
+
+
+# Markup kwargs per status: green pass, red failure/error/abort, yellow skip.
+# Statuses missing here (e.g. IN_PROGRESS) fall back to ``{}``, i.e. plain text.
+STEP_STATUS_MARKUP: dict[TestStatus, dict[str, bool]] = {
+    TestStatus.PASSED: {"green": True},
+    TestStatus.FAILED: {"red": True},
+    TestStatus.ERROR: {"red": True},
+    TestStatus.ABORTED: {"red": True},
+    TestStatus.SKIPPED: {"yellow": True},
+}
+
+
+def step_count_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
+    """Return ``(text, markup)`` per status in ``STEP_COUNT_ORDER`` with a truthy count."""
+    present = [pair for pair in STEP_COUNT_ORDER if counts.get(pair[0], 0)]
+    return [
+        (f"{counts.get(status, 0)} {label}", STEP_STATUS_MARKUP.get(status, {}))
+        for status, label in present
+    ]
+
+
+def measurement_segments(counts: Any) -> list[tuple[str, dict[str, bool]]]:
+    """Return green ``N passed`` / red ``N failed`` segments for the truthy tallies."""
+    outcomes = ((True, "passed", "green"), (False, "failed", "red"))
+    return [
+        (f"{counts[passed]} {word}", {color: True})
+        for passed, word, color in outcomes
+        if counts.get(passed, 0)
+    ]
+
+
+def write_count_row(
+    terminalreporter: Any, label: str, segments: list[tuple[str, dict[str, bool]]]
+) -> None:
+    """Write a ``label  a · b · c`` row; each segment is written with its own markup."""
+    # Queue every write first: indent, bold padded label, then " · "-separated segments.
+    writes = [("  ", {}), (label.ljust(LABEL_WIDTH), {"bold": True})]
+    for position, (text, markup) in enumerate(segments):
+        writes += [(" · ", {}), (text, markup)] if position else [(text, markup)]
+    for text, markup in writes:
+        terminalreporter.write(text, **markup)
+    terminalreporter.write_line("")
+
+
+def report_panel_title(report: Any, terminalreporter: Any) -> str:
+    """Build the section-rule title ``Sift report · <name>``, capped to the terminal.
+
+    Returns plain ``Sift report`` when the report has no (or an empty) ``name``.
+    The width comes from ``terminalreporter._tw.fullwidth`` (80 when missing);
+    a title longer than ``max(len("Sift report"), width - 8)`` is cut and ends
+    with ``…`` so the separator line does not wrap.
+    """
+    prefix = "Sift report"
+    report_name = getattr(report, "name", None)
+    if not report_name:
+        return prefix
+    full_title = f"{prefix} · {report_name}"
+    writer = getattr(terminalreporter, "_tw", None)
+    width = getattr(writer, "fullwidth", 80)
+    # write_sep pads the title with separator characters and spaces; leave room.
+    budget = max(len(prefix), width - 8)
+    return full_title if len(full_title) <= budget else full_title[: budget - 1] + "…"
+
+
+def maybe_open_report(url: str) -> None:
+    """Try to open ``url`` in a browser (for ``--sift-open-report``).
+
+    Does nothing when the ``CI`` environment variable is non-empty or stdout is
+    not a TTY, so a committed setting cannot spawn a browser on a headless agent.
+    Any exception raised while opening is swallowed: this is a convenience only.
+    """
+    import sys
+    import webbrowser
+
+    headless = os.environ.get("CI") or not sys.stdout.isatty()
+    if not headless:
+        try:
+            # Opening forks/execs the platform opener while gRPC background
+            # threads are live; ``_quiet_fork_stderr`` wraps that fork the same
+            # way as the plugin's other fork sites to hide gRPC's prefork notice.
+            with _quiet_fork_stderr():
+                webbrowser.open(url)
+        except Exception:
+            # Headless / no browser available: never let this fail the session.
+            pass
+
+
+def write_disabled_summary(terminalreporter: Any) -> None:
+    """Write the ``Sift`` separator rule and the one-line ``--sift-disabled`` notice."""
+    terminalreporter.write_sep("=", "Sift", cyan=True, bold=True)
+    terminalreporter.write_line("Sift disabled — no test report created.")
+
+
+def write_report_summary(
+    terminalreporter: Any,
+    context: Any,
+    config: Any,
+    report_id: str | None,
+    report_url: str | None,
+    offline: bool,
+) -> None:
+    """Print the session-end report panel: outcome, tallies, provenance, action.
+
+    ``report_id`` / ``report_url`` come from ``resolve_report_link``. The action
+    row is a clickable link (online), the upload command (offline), or a replay
+    hint when the report never uploaded. The log file is named only when
+    ``context.log_file`` is set, so a session without a local log never prints a
+    command ending in ``None``.
+    """
+    log_file = getattr(context, "log_file", None)
+
+    failed = bool(getattr(context, "any_failures", False))
+    status_word, status_markup = (
+        ("FAILED", {"red": True, "bold": True})
+        if failed
+        else ("PASSED", {"green": True, "bold": True})
+    )
+    # Offline results live only in the local log until replayed, so the status
+    # row calls that out instead of repeating the version (already in the header).
+    status_context = (
+        f"{mode_label(config)} · not uploaded"
+        if offline
+        else f"{mode_label(config)} · sift-stack-py {sdk_version()}"
+    )
+
+    report = context.report
+
+    terminalreporter.write_sep(
+        "=", report_panel_title(report, terminalreporter), cyan=True, bold=True
+    )
+
+    # Identity row: the test case (test path or pytest invocation).
+    if report.test_case:
+        sift_kv(terminalreporter, "Test case", str(report.test_case))
+
+    # Status row: colored outcome, then compact mode context.
+    terminalreporter.write("  ")
+    terminalreporter.write(f"{'Status':<{LABEL_WIDTH}}", bold=True)
+    terminalreporter.write(status_word, **status_markup)
+    terminalreporter.write_line(f"      {status_context}")
+
+    # Step + measurement tallies (green pass, red failure, yellow skip).
+    write_count_row(
+        terminalreporter,
+        "Steps",
+        step_count_segments(context.step_status_counts) or [("no steps", {})],
+    )
+    measurements = measurement_segments(context.measurement_counts)
+    if measurements:
+        write_count_row(terminalreporter, "Measurements", measurements)
+
+    # Provenance row: test system and operator.
+    system = " · ".join(part for part in (report.test_system_name, report.system_operator) if part)
+    if system:
+        sift_kv(terminalreporter, "System", system)
+
+    # Local log file (write-through backup online, sole sink offline).
+    if log_file is not None:
+        sift_kv(terminalreporter, "Log file", str(log_file))
+
+    if offline:
+        if log_file is not None:
+            terminalreporter.write_sep("-", "to upload to Sift")
+            terminalreporter.write_line(f"  >> import-test-result-log {log_file}", cyan=True)
+        return
+
+    if not report_id:
+        # Incremental upload never mapped the report (the worker died before
+        # replaying the create). Only suggest a replay when a log file exists.
+        not_uploaded = "not uploaded"
+        if log_file is not None:
+            not_uploaded += f" — replay with: import-test-result-log {log_file}"
+        sift_kv(terminalreporter, "Report", not_uploaded, yellow=True)
+    elif report_url is not None:
+        sift_kv(terminalreporter, "Report", report_url, cyan=True)
+    else:
+        sift_kv(
+            terminalreporter,
+            "Report",
+            f"id {report_id}  (set sift_app_url for a clickable link)",
+        )
+
+    if report_id and getattr(context, "replay_incomplete", False) and log_file is not None:
+        sift_kv(
+            terminalreporter,
+            "",
+            f"may be incomplete — finish with: import-test-result-log {log_file}",
+            yellow=True,
+        )
diff --git a/python/lib/sift_client/_internal/rest.py b/python/lib/sift_client/_internal/rest.py
index ee0239b79..6a9d1c9d1 100644
--- a/python/lib/sift_client/_internal/rest.py
+++ b/python/lib/sift_client/_internal/rest.py
@@ -6,7 +6,7 @@
 from typing_extensions import NotRequired
 from urllib3.util import Retry
 
-from sift_client._internal.grpc_transport.transport import _clean_uri
+from sift_client._internal.urls import parse_host
 
 _DEFAULT_REST_RETRY = Retry(total=3, status_forcelist=[500, 502, 503, 504], backoff_factor=1)
 
@@ -33,7 +33,7 @@ class SiftRestConfig(TypedDict):
 def compute_uri(restconf: SiftRestConfig) -> str:
     uri = restconf["uri"]
     use_ssl = restconf.get("use_ssl", True)
-    clean_uri = _clean_uri(uri, use_ssl)
+    clean_uri = parse_host(uri)
 
     if use_ssl:
         return f"https://{clean_uri}"
diff --git a/python/lib/sift_client/_internal/urls.py b/python/lib/sift_client/_internal/urls.py
new file mode 100644
index 000000000..99dd1816f
--- /dev/null
+++ b/python/lib/sift_client/_internal/urls.py
@@ -0,0 +1,55 @@
+"""Helpers for turning Sift API endpoints into web-app (frontend) URLs.
+
+The Sift frontend can be hosted on several domains and the backend exposes no
+field for its own URL, so the frontend origin is derived client-side from the
+API host. This table mirrors the canonical mapping used by the Grafana
+datasource (sift-stack/sift-grafana-datasource,
+``src/components/sharelink/getFrontendHostnameDefaults.ts``). Hosts outside the
+table (on-prem and custom deployments) require an explicit override.
+"""
+
+from __future__ import annotations
+
+from urllib.parse import urlparse
+
+# Known API hosts (lowercase ``host[:port]``, no scheme) -> frontend origin (with scheme).
+_API_HOST_TO_FRONTEND_ORIGIN: dict[str, str] = {
+    "api.siftstack.com": "https://app.siftstack.com",
+    "gov.api.siftstack.com": "https://gov.siftstack.com",
+}
+
+
+def parse_origin(url: str) -> str:
+    """Reduce a URL or bare host to its ``scheme://host[:port]`` origin.
+
+    Input without ``://`` is treated as a bare host and given the ``https`` scheme.
+    """
+    with_scheme = f"https://{url}" if "://" not in url else url
+    parts = urlparse(with_scheme)
+    return "://".join((parts.scheme, parts.netloc)).rstrip("/")
+
+
+def parse_host(url: str) -> str:
+    """Return the ``host[:port]`` (netloc) of a URL; a bare host is read as ``https``."""
+    has_scheme = "://" in url
+    return urlparse(url if has_scheme else f"https://{url}").netloc
+
+
+def frontend_origin_for_api(api_base_url: str, override: str | None = None) -> str | None:
+    """Return the Sift web-app origin for a given API base URL.
+
+    Args:
+        api_base_url: REST API base URL (e.g. ``https://api.siftstack.com``); its
+            host is matched case-insensitively against the built-in table.
+        override: Explicit frontend origin (host or full URL) used instead of the
+            derived value, for on-prem or custom deployments.
+
+    Returns:
+        The frontend origin, or ``None`` if there is no override and the host is unknown.
+    """
+    if override:
+        return parse_origin(override)
+    if not api_base_url:
+        return None
+    # Hostnames are case-insensitive (RFC 3986 §3.2.2); the table keys are lowercase.
+    return _API_HOST_TO_FRONTEND_ORIGIN.get(parse_host(api_base_url).lower())
diff --git a/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py
new file mode 100644
index 000000000..ab95ddea8
--- /dev/null
+++ b/python/lib/sift_client/_tests/_internal/low_level_wrappers/test_incremental_replay.py
@@ -0,0 +1,143 @@
+"""Unit tests for incremental log-replay resume, with no live backend.
+
+These pin the resume-tick behavior of
+``TestResultsLowLevelClient.import_log_file(incremental=True)``: the
+CreateTestReport line is uploaded on an earlier tick, so a resuming tick rebuilds
+replay state from scratch and must apply the remaining lines without an
+in-memory report. The real gRPC create/update calls are stubbed, so these run
+offline -- unlike the end-to-end resume test, which needs the integration server.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+from sift_client._internal.low_level_wrappers.test_results import (
+    # Aliased so pytest doesn't try to collect the `Test`-prefixed client as a suite.
+    TestResultsLowLevelClient as ResultsLowLevelClient,
+)
+from sift_client.sift_types.test_report import (
+    TestReport,
+    TestReportCreate,
+    TestReportUpdate,
+    TestStatus,
+    TestStep,
+    TestStepCreate,
+    TestStepType,
+)
+
+T0 = datetime(2026, 1, 1, tzinfo=timezone.utc)
+
+
+def _make_report(id_: str) -> TestReport:
+    """Build a FAILED, non-archived ``TestReport`` stub carrying the given id."""
+    stub_fields = {"name": "n", "test_system_name": "s", "test_case": "c"}
+    return TestReport(
+        id_=id_,
+        status=TestStatus.FAILED,
+        start_time=T0,
+        end_time=T0,
+        metadata={},
+        is_archived=False,
+        **stub_fields,
+    )
+
+
+def _make_step(id_: str) -> TestStep:
+    """Build a PASSED ACTION ``TestStep`` stub at path ``"1"`` under ``real-report``."""
+    fixed = {"step_path": "1", "start_time": T0, "end_time": T0}
+    return TestStep(
+        id_=id_,
+        test_report_id="real-report",
+        name="step",
+        step_type=TestStepType.ACTION,
+        status=TestStatus.PASSED,
+        **fixed,
+    )
+
+
+def _report_create() -> TestReportCreate:
+    """Build a minimal IN_PROGRESS ``TestReportCreate`` for the tests below."""
+    labels = {"name": "n", "test_system_name": "s", "test_case": "c"}
+    return TestReportCreate(
+        status=TestStatus.IN_PROGRESS,
+        start_time=T0,
+        end_time=T0,
+        **labels,
+    )
+
+
+@pytest.mark.asyncio
+async def test_resume_applies_trailing_report_update(tmp_path):
+    """Resume whose remaining chunk is the final UpdateTestReport must apply it.
+
+    Pre-fix this raised "UpdateTestReport found before CreateTestReport"; the
+    status update then never landed and the report stayed IN_PROGRESS.
+    """
+    log_file = tmp_path / "resume_report_update.jsonl"
+    client = ResultsLowLevelClient(grpc_client=MagicMock())
+
+    # Build the log offline via the simulate path: CreateTestReport + UpdateTestReport.
+    report = await client.create_test_report(test_report=_report_create(), log_file=log_file)
+    update = TestReportUpdate(status=TestStatus.FAILED)
+    update.resource_id = report.id_
+    await client.update_test_report(update=update, log_file=log_file)
+
+    # Seed the tracking state as if an earlier tick had uploaded line 1 (the
+    # CreateTestReport) and mapped the logged report id to "real-report".
+    LogTracking(last_uploaded_line=1, id_map={report.id_: "real-report"}).save(log_file)
+
+    # Swap in an AsyncMock so the update issued during import is captured, not sent.
+    client.update_test_report = AsyncMock(return_value=_make_report("real-report"))
+
+    result = await client.import_log_file(log_file, incremental=True)
+    # Exactly one update must be issued, addressed to the mapped id with status FAILED.
+    client.update_test_report.assert_awaited_once()
+    sent = client.update_test_report.await_args.kwargs["request"]
+    assert sent.test_report.test_report_id == "real-report"
+    assert sent.test_report.status == TestStatus.FAILED.value
+    assert result.report is not None
+    assert result.report.id_ == "real-report"
+
+
+@pytest.mark.asyncio
+async def test_resume_with_only_steps_does_not_require_report(tmp_path):
+    """A resume tick carrying only steps must not demand an in-memory report.
+
+    Pre-fix this raised "No CreateTestReport found in log file" (the field-report
+    trace), aborting replay of the remaining step lines.
+    """
+    log_file = tmp_path / "resume_steps_only.jsonl"
+    client = ResultsLowLevelClient(grpc_client=MagicMock())
+    # Log two calls through the client: the report create, then one step create.
+    report = await client.create_test_report(test_report=_report_create(), log_file=log_file)
+    await client.create_test_step(
+        test_step=TestStepCreate(
+            test_report_id=report.id_,
+            name="s1",
+            step_type=TestStepType.ACTION,
+            step_path="1",
+            status=TestStatus.PASSED,
+            start_time=T0,
+            end_time=T0,
+        ),
+        log_file=log_file,
+    )
+    # Mark line 1 as already uploaded and map the logged report id to "real-report".
+    LogTracking(last_uploaded_line=1, id_map={report.id_: "real-report"}).save(log_file)
+    # Swap in an AsyncMock so the step create issued during import is captured.
+    client.create_test_step = AsyncMock(return_value=_make_step("real-step"))
+
+    result = await client.import_log_file(log_file, incremental=True)
+
+    client.create_test_step.assert_awaited_once()
+    sent = client.create_test_step.await_args.kwargs["request"]
+    # The step's report ID was remapped from the simulated ID to the real one.
+    assert sent.test_step.test_report_id == "real-report"
+    # The report was created on the earlier tick, so this resume tick has no report.
+    assert result.report is None
+    assert len(result.steps) == 1
diff --git a/python/lib/sift_client/_tests/conftest.py b/python/lib/sift_client/_tests/conftest.py
index 5683182e5..0b939ae39 100644
--- a/python/lib/sift_client/_tests/conftest.py
+++ b/python/lib/sift_client/_tests/conftest.py
@@ -78,10 +78,14 @@ def ci_pytest_tag(sift_client):
     return tag
 
 
-# Import the Sift test results fixtures the way we recommend to users.
-from sift_client.util.test_results import *  # noqa: F403
-
-
 def pytest_configure(config: pytest.Config) -> None:
-    """Enable the Sift connection-check mode for the fixtures used in this test suite since we run w/ mock client in non-integration tests."""
-    config.option.sift_test_results_check_connection = True
+    """Pick a Sift plugin mode based on whether integration tests are running.
+
+    Integration runs (``-m integration``) stay online so CI exercises the JSONL
+    write + import worker replay path that production users hit. Every other
+    run, including ``-m "not integration"``, defaults to ``--sift-disabled``.
+    """
+    markexpr = config.option.markexpr or ""
+    # A bare substring test would also match ``-m "not integration"``.
+    if "integration" not in markexpr or "not integration" in markexpr:
+        config.option.sift_disabled = True
diff --git a/python/lib/sift_client/_tests/pytest_plugin/__init__.py b/python/lib/sift_client/_tests/pytest_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
new file mode 100644
index 000000000..74c498fd1
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/_step_status_capture.py
@@ -0,0 +1,186 @@
+"""Read step status sequences from a Sift offline-mode log file.
+
+The contract suite drives each scenario through an inner pytester session
+run with ``--sift-offline``, which causes the real plugin + ``ReportContext``
+to write every test-result API call to a JSONL log. This module parses
+that log into a per-step status timeline that ``test_pass_fail.py`` asserts
+against, with no test-only ``ReportContext`` fake required.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from sift_client._internal.low_level_wrappers._test_results_log import iter_log_data_lines
+from sift_client.sift_types.test_report import TestStatus
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@dataclass
+class CapturedStep:
+    step_id: str
+    name: str
+    step_path: str
+    parent_step_id: str | None
+    statuses: list[TestStatus] = field(default_factory=list)  # log order, create entry first
+    error_messages: list[str] = field(default_factory=list)  # non-empty messages, log order
+    # RFC3339 strings. ``start_time`` is the create entry's ``startTime``; ``end_time`` is
+    # the last non-null ``endTime`` from the create entry or a status-carrying update.
+    start_time: str | None = None
+    end_time: str | None = None
+
+
+_PROTO_STATUS_NAMES = {  # proto enum name as it appears in the log's JSON -> ``TestStatus``
+    "TEST_STATUS_UNSPECIFIED": TestStatus.UNSPECIFIED,
+    "TEST_STATUS_DRAFT": TestStatus.DRAFT,
+    "TEST_STATUS_PASSED": TestStatus.PASSED,
+    "TEST_STATUS_FAILED": TestStatus.FAILED,
+    "TEST_STATUS_ABORTED": TestStatus.ABORTED,
+    "TEST_STATUS_ERROR": TestStatus.ERROR,
+    "TEST_STATUS_IN_PROGRESS": TestStatus.IN_PROGRESS,
+    "TEST_STATUS_SKIPPED": TestStatus.SKIPPED,
+}
+
+
+def _status(name: str | None) -> TestStatus:
+    """Translate a proto status name; ``None`` and unknown names map to ``UNSPECIFIED``."""
+    fallback = TestStatus.UNSPECIFIED
+    return fallback if name is None else _PROTO_STATUS_NAMES.get(name, fallback)
+
+
+def parse_log(log_path: Path) -> dict[str, CapturedStep]:
+    """Build ``{step_id: CapturedStep}`` from the offline JSONL log.
+
+    Entries are read in file order. A ``CreateTestStep`` entry with a response
+    id starts a new ``CapturedStep``; an ``UpdateTestStep`` entry that names a
+    known step and carries a status extends that step's timeline.
+    """
+    captured: dict[str, CapturedStep] = {}
+    for request_type, response_id, json_str in iter_log_data_lines(log_path):
+        body = json.loads(json_str).get("testStep", {})
+        message = body.get("errorInfo", {}).get("errorMessage")
+        if request_type == "CreateTestStep" and response_id:
+            captured[response_id] = CapturedStep(
+                step_id=response_id,
+                name=body.get("name", ""),
+                step_path=body.get("stepPath", ""),
+                parent_step_id=body.get("parentStepId") or None,
+                statuses=[_status(body.get("status"))],
+                error_messages=[message] if message else [],
+                start_time=body.get("startTime"),
+                end_time=body.get("endTime"),
+            )
+            continue
+        if request_type != "UpdateTestStep":
+            continue
+        target = captured.get(body.get("testStepId") or "")
+        if target is not None and body.get("status") is not None:
+            target.statuses.append(_status(body["status"]))
+            if message:
+                target.error_messages.append(message)
+            if body.get("endTime") is not None:
+                target.end_time = body["endTime"]
+    return captured
+
+
+_active_log: Path | None = None  # log file the query helpers read; chosen via ``set_log``
+_cached: dict[str, CapturedStep] | None = None  # parse of ``_active_log``; None until queried
+
+
+def set_log(path: Path) -> None:
+    """Select ``path`` as the log for subsequent queries; the parse cache is dropped."""
+    global _cached, _active_log
+    _cached = None
+    _active_log = path
+
+
+def _steps() -> dict[str, CapturedStep]:
+    """Steps of the active log, parsed on first use; no log or a missing file gives ``{}``."""
+    global _cached
+    if _cached is not None:
+        return _cached
+    log = _active_log
+    _cached = parse_log(log) if log is not None and log.exists() else {}
+    return _cached
+
+
+def steps_by_name(name: str) -> list[CapturedStep]:  # every captured step called ``name``
+    return list(filter(lambda captured: captured.name == name, _steps().values()))
+
+
+def test_step(name: str) -> CapturedStep | None:
+    """Return the step the autouse ``step`` fixture created for test ``name``.
+
+    Several steps can share a name (e.g. when the makereport hook records an
+    inline step for a collection-time skip on top of the autouse step). The
+    autouse step has the fewest ``.`` separators in its ``step_path``; the
+    first such match is returned, or ``None`` when no step has that name.
+    """
+    matches = steps_by_name(name)
+    depths = [m.step_path.count(".") for m in matches]
+    return matches[depths.index(min(depths))] if matches else None
+
+
+def final_status(name: str) -> TestStatus | None:  # newest status of the test's step, if any
+    found = test_step(name)
+    return None if found is None or not found.statuses else found.statuses[-1]
+
+
+def final_error_message(name: str) -> str | None:  # newest error message of the test's step
+    found = test_step(name)
+    return None if found is None or not found.error_messages else found.error_messages[-1]
+
+
+def log_events(log_path: Path) -> list[tuple[str, str, TestStatus]]:
+    """List ``(request_type, step_name, status)`` tuples in the order the log wrote them.
+
+    ``load_steps`` groups entries into one record per step; this keeps each
+    step entry in write order, so tests can assert *when* a step resolved —
+    e.g. that a container's terminal ``UpdateTestStep`` precedes a later
+    sibling's ``CreateTestStep`` (proof it closed mid-session, not at the end).
+    ``UpdateTestStep`` entries carry only an id, so the name comes from the
+    earlier ``CreateTestStep``. A missing log file yields an empty list.
+    """
+    names_by_id: dict[str, str] = {}
+    timeline: list[tuple[str, str, TestStatus]] = []
+    if not log_path.exists():
+        return timeline
+    for request_type, response_id, json_str in iter_log_data_lines(log_path):
+        body = json.loads(json_str).get("testStep", {})
+        if request_type == "CreateTestStep" and response_id:
+            step_name = names_by_id[response_id] = body.get("name", "")
+        elif request_type == "UpdateTestStep":
+            step_name = names_by_id.get(body.get("testStepId"), "")
+        else:
+            continue
+        timeline.append((request_type, step_name, _status(body.get("status"))))
+    return timeline
+
+
+def load_steps(log_path: Path) -> list[dict]:
+    """Return the offline log's steps as plain dicts, one per captured step.
+
+    Every record has ``id``, ``name``, ``parent_step_id`` and ``step_path``
+    (the shape ``test_hierarchy.py`` expects for its ``_by_name`` and
+    ``_ancestor_names`` walkers) plus ``statuses``, ``start_time`` and
+    ``end_time``. The list is empty if the log was never created (e.g. every
+    item in the inner session was ``sift_exclude``-d, so the plugin's
+    ``report_context`` fixture never fired).
+    """
+    captured = parse_log(log_path) if log_path.exists() else {}
+    return [
+        {
+            "id": step.step_id,
+            "name": step.name,
+            "parent_step_id": step.parent_step_id,
+            "step_path": step.step_path,
+            "statuses": step.statuses,
+            "start_time": step.start_time,
+            "end_time": step.end_time,
+        }
+        for step in captured.values()
+    ]
diff --git a/python/lib/sift_client/_tests/pytest_plugin/conftest.py b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
new file mode 100644
index 000000000..ba775e04b
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/conftest.py
@@ -0,0 +1,67 @@
+"""Shared helpers for the pytest-plugin test suite.
+
+The tests in this directory drive inner pytester sessions to exercise the
+plugin's behavior in isolation. The fixtures below produce the boilerplate
+conftests those inner sessions need:
+
+- ``write_plugin_conftest``: minimal conftest that loads the plugin
+- ``write_probe_conftest``: conftest that loads the plugin and runs a probe
+  block inside ``pytest_configure``, useful for inspecting internal state
+  without running tests against a real backend
+
+The offline-log tests (``test_hierarchy.py``, ``test_pass_fail.py``) drive the
+inner session in-process via ``pytester.runpytest_inprocess(...)``. This is
+fast because the outer session already preloads the plugin (``pyproject.toml``
+sets ``addopts = "... -p sift_client.pytest_plugin ..."``), so the numpy C
+extensions the plugin pulls in are imported once for the whole outer process
+and reused by every inner run — no per-test interpreter spawn, and no
+``cannot load module more than once per process`` re-init guard to trip.
+
+Tests that need true process isolation (fresh env vars, credential and
+connection resolution, ini parsing) still use ``pytester.runpytest_subprocess(...)``
+so the inner session starts from a clean interpreter.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import Callable
+
+import pytest
+
+_SIFT_ENV_VARS = ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI", "SIFT_DISABLED", "SIFT_APP_URL")
+
+
+@pytest.fixture
+def clear_sift_env(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Unset every variable listed in ``_SIFT_ENV_VARS`` for the duration of the test."""
+    for env_name in _SIFT_ENV_VARS:
+        monkeypatch.delenv(env_name, raising=False)
+
+
+@pytest.fixture
+def write_plugin_conftest(pytester: pytest.Pytester) -> Callable[[], None]:
+    """Provide a zero-argument writer for a conftest that only loads the Sift plugin."""
+
+    def _write_conftest() -> None:
+        pytester.makeconftest('pytest_plugins = ["sift_client.pytest_plugin"]')
+
+    return _write_conftest
+
+
+@pytest.fixture
+def write_probe_conftest(pytester: pytest.Pytester) -> Callable[[str], None]:
+    """Provide a writer for a conftest that runs ``probe_body`` inside ``pytest_configure``.
+
+    ``probe_body`` is python source executed at config time with ``config``
+    in scope; have it ``print(...)`` what you want to inspect and match the
+    output with ``result.stdout.fnmatch_lines``.
+    """
+
+    def _write_probe(probe_body: str) -> None:
+        # Re-indent the probe so it becomes the body of the generated hook.
+        plugin_line = 'pytest_plugins = ["sift_client.pytest_plugin"]\n\n'
+        hook_body = textwrap.indent(textwrap.dedent(probe_body), "    ")
+        pytester.makeconftest(plugin_line + "def pytest_configure(config):\n" + hook_body)
+
+    return _write_probe
diff --git a/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
new file mode 100644
index 000000000..cbd748c53
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/step_status_states.md
@@ -0,0 +1,108 @@
+# Pytest-plugin step-status: test scenarios
+
+Reference for the pass/fail scenarios covered by
+[`test_pass_fail.py`](test_pass_fail.py). Each row pairs a scenario with the
+`TestStatus` the plugin records, and maps to the user-facing contract in
+[`docs/guides/pytest_plugin/pass_fail_behavior.md`](../../../../docs/guides/pytest_plugin/pass_fail_behavior.md).
+
+`TestStatus` values come from `sift_client.sift_types.test_report.TestStatus`:
+`PASSED`, `FAILED`, `ERROR`, `SKIPPED`, `ABORTED`, `IN_PROGRESS`. Hard exits map
+to `ABORTED`, resolved during fixture teardown: from the call-phase report when
+there is one (`SystemExit`), or, when a `KeyboardInterrupt` aborts the session
+before that report, from setup having completed with no call outcome. The status
+reaches the report only because pytest runs finalizers as it unwinds; a step
+keeps the `IN_PROGRESS` it was created with only if the process is killed before
+those finalizers run.
+
+## Case ID scheme
+
+Each scenario has a stable case ID of the form `PREFIX-NN`. Tests in
+`test_pass_fail.py` reference their case ID in a leading comment so a test can
+be traced back to its row here without rereading the scenario:
+
+| Prefix  | Section                                  |
+| ------- | ---------------------------------------- |
+| `CALL`  | Call-phase exit paths                    |
+| `SKIP`  | Skip paths                               |
+| `XFAIL` | xfail / xpass                            |
+| `PHASE` | Setup / teardown phases                  |
+| `COLL`  | Collection / fixture-resolution failures |
+| `API`   | Plugin-API exit paths                    |
+
+
+## Call-phase exit paths
+
+| Case      | Scenario                        | Trigger                              | Outcome                                                                                                  |
+| --------- | ------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------- |
+| `CALL-01` | Test passes                     | function body returns cleanly        | `PASSED`                                                                                                 |
+| `CALL-02` | Assert failure in call phase    | `assert 1 == 2`                      | `FAILED`                                                                                                 |
+| `CALL-03` | Generic exception in call phase | `raise ValueError("boom")`           | `ERROR`                                                                                                  |
+| `CALL-04` | `pytest.fail("...")` from body  | `pytest.fail("intentional failure")` | `FAILED`                                                                                                 |
+| `CALL-05` | `SystemExit` from the test body | `sys.exit(1)`                        | `ABORTED`                                                                                                |
+| `CALL-06` | `KeyboardInterrupt` in body     | `raise KeyboardInterrupt`            | `ABORTED` — the session aborts before a call-phase report, but fixture teardown still runs, so the cut-off step resolves to `ABORTED` rather than staying `IN_PROGRESS` |
+| `CALL-07` | Substep raises non-Assertion exception | `with step.substep(...): raise ValueError("boom")` | Substep `ERROR`, test step `FAILED` (child-failed signal outranks the propagating exception) |
+
+## Skip paths
+
+| Case      | Scenario                         | Trigger                                      | Outcome                                                                  |
+| --------- | -------------------------------- | -------------------------------------------- | ------------------------------------------------------------------------ |
+| `SKIP-01` | Collection-time skip             | `@pytest.mark.skip(reason=...)`              | `SKIPPED` — only the makereport hook records a step; no autouse step ran |
+| `SKIP-02` | Conditional collection-time skip | `@pytest.mark.skipif(True, reason=...)`      | `SKIPPED` — same route as `@pytest.mark.skip`                            |
+| `SKIP-03` | Runtime skip in body             | `pytest.skip("...")`                         | Outer step `SKIPPED`; no duplicate nested step                           |
+| `SKIP-04` | Skip raised inside a fixture     | `@pytest.fixture` calls `pytest.skip("...")` | Outer step `SKIPPED` (setup-phase skip); no duplicate nested step        |
+
+## xfail / xpass
+
+| Case       | Scenario                                  | Trigger                                                    | Outcome                                                  |
+| ---------- | ----------------------------------------- | ---------------------------------------------------------- | -------------------------------------------------------- |
+| `XFAIL-01` | xfail-marked test that fails              | `@pytest.mark.xfail` + `assert 1 == 2`                     | `PASSED` — test fulfilled the xfail expectation          |
+| `XFAIL-02` | Strict xfail that unexpectedly passes     | `@pytest.mark.xfail(strict=True)` + `assert True`          | `FAILED` — mark no longer matches reality                |
+| `XFAIL-03` | Non-strict xfail that unexpectedly passes | `@pytest.mark.xfail()` + `assert True`                     | `PASSED` — `strict=False` doesn't insist on the failure  |
+| `XFAIL-04` | `xfail(raises=...)` with wrong exception  | `@pytest.mark.xfail(raises=ValueError)` + `raise KeyError` | `FAILED` — `raises=` mismatch is a real test failure     |
+| `XFAIL-05` | `xfail(run=False)`                        | `@pytest.mark.xfail(run=False)` (body never executed)      | `SKIPPED` — the test never ran                           |
+
+## Setup / teardown phases
+
+| Case       | Scenario                                     | Trigger                                                            | Outcome                                                                                                                          |
+| ---------- | -------------------------------------------- | ------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
+| `PHASE-01` | Setup-phase fixture failure (RuntimeError)   | `@pytest.fixture` raises before `yield`; test body never runs      | `ERROR` — plugin reads the setup-phase report and maps `failed` → `ERROR` (a `phase=setup` annotation is a planned follow-up)    |
+| `PHASE-02` | Teardown-phase fixture failure               | `@pytest.fixture` raises after `yield`; test body passed           | `FAILED` — plugin upgrades a passed step when the teardown report shows `failed` (a `phase=teardown` annotation is a planned follow-up) |
+| `PHASE-03` | Call-phase fail **plus** teardown-phase fail | `assert 1 == 2` in body AND `@pytest.fixture` raises after `yield` | `FAILED` — call-phase failure dominates; surfacing the teardown error alongside is a planned follow-up                           |
+
+## Collection / fixture-resolution failures
+
+| Case      | Scenario        | Trigger                            | Outcome                                                                                                            |
+| --------- | --------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `COLL-01` | Missing fixture | `def test_x(nonexistent_fixture):` | `ERROR` — missing fixture surfaces as a setup-phase failure (a `phase=setup` annotation is a planned follow-up)    |
+
+## Plugin-API exit paths (in-test mutations)
+
+| Case     | Scenario                          | Trigger                                                                   | Outcome                                                                                                                     |
+| -------- | --------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
+| `API-01` | Manual status override            | `step.current_step.update({"status": TestStatus.FAILED})`                 | `FAILED`                                                                                                                    |
+| `API-02` | `report_outcome(result=False)`    | `step.report_outcome("the_check", False, "did not match")`                | `FAILED`                                                                                                                    |
+| `API-03` | `measure(...)` out-of-bounds      | `step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})`     | `FAILED`                                                                                                                    |
+| `API-04` | Failed measurement on a substep   | `with step.substep(...) as s: s.measure(... out-of-bounds)`               | `FAILED` — propagates from substep to parent                                                                                |
+| `API-05` | Manually-skipped substep          | `with step.substep(...) as s: s.current_step.update({"status": SKIPPED})` | Parent step `PASSED` — skip does not propagate as a failure                                                                 |
+| `API-06` | Hard exit inside a nested substep | `with step.substep(...) as s: with s.substep(...): sys.exit(1)`           | Every open step on the unwind path records `ABORTED`; a sibling substep that closed before the abort keeps its prior status |
+
+## Out of scope
+
+Scenarios deliberately not covered by this suite:
+
+- **Timeout** — needs `pytest-timeout` or a manual signal harness.
+- **Signal (SIGKILL / SIGTERM)** — cannot be caught from inside the process;
+  needs a subprocess-level harness.
+- **`pytest.exit("...")`** — niche; the "aborts subsequent tests" behavior
+  is hard to characterize cleanly because each `pytester` invocation is
+  its own session.
+- **`os._exit()`** — bypasses Python cleanup entirely; can't be tested
+  in-process because it would kill the outer pytest run. Guaranteed
+  data-loss case alongside `SIGKILL` (`SystemExit` still unwinds; see `CALL-05`).
+- **Parametrize-level marks** (`pytest.param(..., marks=pytest.mark.xfail / skip)`)
+  — routes through a different selection path but produces the same
+  `report.outcome`, so behavior matches the function-level marks already
+  covered above.
+- **Import error / syntax error / `conftest.py` error** — these fail
+  collection entirely; no `item` is produced and no plugin hook fires, so
+  no Sift step is recorded.
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
new file mode 100644
index 000000000..a61035b90
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_configuration.py
@@ -0,0 +1,429 @@
+"""Tests for the plugin's CLI/ini configuration surface.
+
+Covers flag parsing, ini-key resolution, CLI-over-ini precedence, the
+defaults that apply when nothing is set, and the marker-based gate that
+governs the autouse fixtures. Credentials are tested in
+``test_credentials.py``.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestIniConfiguration:
+    """Ini keys (pyproject.toml / pytest.ini) and the CLI flags that override them."""
+
+    def test_ini_log_file_none(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """`sift_log_file = "none"` in the ini makes the resolver return `None`."""
+        probe = """
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
+            """
+        ini = """
+            [tool.pytest.ini_options]
+            sift_log_file = "none"
+            """
+        write_probe_conftest(probe)
+        pytester.makepyprojecttoml(ini)
+        pytester.makepyfile("def test_noop(): pass")
+        # ``--co`` suffices: the probe prints from ``pytest_configure``.
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines(["RESOLVED: None"])
+
+    def test_python_false_disables_log_file(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """Setting `config.option.sift_log_file = False` turns logging off.
+
+        Conftests opt their subtree out of log-file mode this way (see
+        lib/sift_client/_tests/util/conftest.py). Regression test: the
+        resolver once confused Python `False` with `None` and silently kept
+        the temp-file default.
+        """
+        # The probe assigns the option directly, as such a conftest would.
+        probe = """
+            config.option.sift_log_file = False
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
+            """
+        write_probe_conftest(probe)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines(["RESOLVED: None"])
+
+    def test_ini_log_file_path(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """A path in the `sift_log_file` ini key is what the resolver returns."""
+        log_path = tmp_path / "sift-run.jsonl"
+        probe = """
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
+            """
+        ini = f"""
+            [tool.pytest.ini_options]
+            sift_log_file = "{log_path}"
+            """
+        write_probe_conftest(probe)
+        pytester.makepyprojecttoml(ini)
+        pytester.makepyfile("def test_noop(): pass")
+        expected = f"RESOLVED: {log_path}"
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines([expected])
+
+    def test_ini_offline_true(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """`sift_offline = true` in the ini makes `is_offline` report True."""
+        probe = """
+            from sift_client._internal.pytest_plugin.modes import is_offline
+            print("OFFLINE:", is_offline(config))
+            """
+        ini = """
+            [tool.pytest.ini_options]
+            sift_offline = true
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyprojecttoml(ini)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines(["OFFLINE: True"])
+
+    def test_ini_disabled_true(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """`sift_disabled = true` in the ini makes `is_disabled` report True."""
+        probe = """
+            from sift_client._internal.pytest_plugin.modes import is_disabled
+            print("DISABLED:", is_disabled(config))
+            """
+        ini = """
+            [tool.pytest.ini_options]
+            sift_disabled = true
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyprojecttoml(ini)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines(["DISABLED: True"])
+
+    def test_ini_git_metadata_false(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """`sift_git_metadata = false` in the ini is read back as `False`."""
+        probe = """
+            print("INI_GIT:", config.getini("sift_git_metadata"))
+            """
+        ini = """
+            [tool.pytest.ini_options]
+            sift_git_metadata = false
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyprojecttoml(ini)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines(["INI_GIT: False"])
+
+    def test_cli_overrides_ini(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """A CLI flag wins over the matching ini key."""
+        cli_path = tmp_path / "cli-wins.jsonl"
+        probe = """
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
+            """
+        ini = """
+            [tool.pytest.ini_options]
+            sift_log_file = "none"
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyprojecttoml(ini)
+        pytester.makepyfile("def test_noop(): pass")
+        flag = f"--sift-log-file={cli_path}"
+        outcome = pytester.runpytest_subprocess("-s", "--co", flag)
+        outcome.stdout.fnmatch_lines([f"RESOLVED: {cli_path}"])
+
+    def test_cli_offline_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """Passing ``--sift-offline`` on the CLI makes `is_offline` report True."""
+        probe = """
+            from sift_client._internal.pytest_plugin.modes import is_offline
+            print("OFFLINE:", is_offline(config))
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co", "--sift-offline")
+        outcome.stdout.fnmatch_lines(["OFFLINE: True"])
+
+    def test_cli_disabled_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """Passing ``--sift-disabled`` on the CLI makes `is_disabled` report True."""
+        probe = """
+            from sift_client._internal.pytest_plugin.modes import is_disabled
+            print("DISABLED:", is_disabled(config))
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co", "--sift-disabled")
+        outcome.stdout.fnmatch_lines(["DISABLED: True"])
+
+    def test_cli_no_git_metadata_flag(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """Passing ``--no-sift-git-metadata`` sets the git_metadata option to False.
+
+        Guards the negation flag's ``dest`` binding: the flag name differs from
+        the ini key, so a broken ``dest`` would silently fall back to the ini
+        default while every other test in this file still passed.
+        """
+        probe = """
+            print("CLI_GIT:", config.getoption("sift_git_metadata"))
+            """
+
+        write_probe_conftest(probe)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co", "--no-sift-git-metadata")
+        outcome.stdout.fnmatch_lines(["CLI_GIT: False"])
+
+    def test_defaults_when_neither_set(
+        self,
+        pytester: pytest.Pytester,
+        write_probe_conftest: Callable[[str], None],
+    ) -> None:
+        """With no ini key and no CLI flag, each setting reports its default."""
+        probe = """
+            from sift_client._internal.pytest_plugin.modes import is_disabled, is_offline
+            from sift_client._internal.pytest_plugin.report import resolve_log_file
+            print("RESOLVED:", resolve_log_file(config))
+            print("OFFLINE:", is_offline(config))
+            print("DISABLED:", is_disabled(config))
+            print("INI_GIT:", config.getini("sift_git_metadata"))
+            """
+        expected = [
+            "RESOLVED: True",
+            "OFFLINE: False",
+            "DISABLED: False",
+            "INI_GIT: True",
+        ]
+
+        write_probe_conftest(probe)
+        pytester.makepyfile("def test_noop(): pass")
+        outcome = pytester.runpytest_subprocess("-s", "--co")
+        outcome.stdout.fnmatch_lines(expected)
+
+
+# Conftest for the autouse-gate tests' inner sessions. It loads the plugin but
+# replaces the real session-scoped `report_context` with a `MagicMock`, so the
+# inner runs don't try to talk to a Sift backend; the gate tests only need to
+# observe whether `step` resolves to a real value or to None.
+_GATE_INNER_CONFTEST = textwrap.dedent(
+    """
+    from unittest.mock import MagicMock
+
+    import pytest
+
+    pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+    @pytest.fixture(scope="session")
+    def report_context():
+        yield MagicMock()
+    """
+)
+
+
+class TestAutouseGate:
+    """`sift_include` / `sift_exclude` markers and the `sift_autouse` ini gate."""
+
+    def test_default_ini_true_activates(self, pytester: pytest.Pytester) -> None:
+        """With the `sift_autouse` ini key absent, the autouse fixtures stay active."""
+        inner_test = """
+            def test_inner(step):
+                assert step is not None
+            """
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(inner_test)
+        # The inner test asserts on `step` itself, so one pass proves the gate.
+        outcome = pytester.runpytest_subprocess()
+        outcome.assert_outcomes(passed=1)
+
+    def test_default_ini_false_skips(self, pytester: pytest.Pytester) -> None:
+        """With `sift_autouse = false`, the autouse fixtures no-op by default."""
+        inner_test = """
+            def test_inner(step):
+                assert step is None
+            """
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        pytester.makepyfile(inner_test)
+        # The inner test asserts `step` is None, so one pass proves the gate closed.
+        outcome = pytester.runpytest_subprocess()
+        outcome.assert_outcomes(passed=1)
+
+    def test_sift_include_marker_forces_on(self, pytester: pytest.Pytester) -> None:
+        """`@pytest.mark.sift_include` overrides ini-false to enable the gate."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_inner(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_sift_exclude_marker_forces_off(self, pytester: pytest.Pytester) -> None:
+        """`@pytest.mark.sift_exclude` overrides ini-true to disable the gate."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_exclude
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_exclude_beats_include(self, pytester: pytest.Pytester) -> None:
+        """When both markers are present, `sift_exclude` wins (safer default)."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            @pytest.mark.sift_exclude
+            def test_inner(step):
+                assert step is None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_module_pytestmark_inherits(self, pytester: pytest.Pytester) -> None:
+        """Module-level `pytestmark = pytest.mark.sift_include` covers every test in the module."""
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            pytestmark = pytest.mark.sift_include
+
+            def test_inner_a(step):
+                assert step is not None
+
+            def test_inner_b(step):
+                assert step is not None
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=2)
+
+    def test_bulk_apply_via_conftest_hook(self, pytester: pytest.Pytester) -> None:
+        """A subtree opts in via `pytest_collection_modifyitems`; siblings stay off.
+
+        Regression test for this repo's wiring pattern: the project default is
+        autouse-off, the integration subtree's conftest bulk-applies
+        `sift_include`, and sibling subtrees remain disabled. Verifies the
+        per-directory mechanism works in a single pytest invocation.
+        """
+        pytester.makeconftest(_GATE_INNER_CONFTEST)
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_autouse = false
+            """
+        )
+        included = pytester.mkdir("included_subtree")
+        (included / "conftest.py").write_text(
+            textwrap.dedent(
+                """
+                from pathlib import Path
+
+                import pytest
+
+                _HERE = Path(__file__).parent
+
+
+                def pytest_collection_modifyitems(config, items):
+                    for item in items:
+                        try:
+                            item.path.relative_to(_HERE)
+                        except ValueError:
+                            continue
+                        item.add_marker(pytest.mark.sift_include)
+                """
+            )
+        )
+        (included / "test_included.py").write_text(
+            "def test_included(step):\n    assert step is not None\n"
+        )
+        untouched = pytester.mkdir("untouched_subtree")
+        (untouched / "test_untouched.py").write_text(
+            "def test_untouched(step):\n    assert step is None\n"
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=2)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
new file mode 100644
index 000000000..3f6d22a6e
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_credentials.py
@@ -0,0 +1,117 @@
+"""Tests for the default ``sift_client`` fixture's credential resolution.
+
+Covers the env-var-then-ini fallback for URIs, the env-only handling of
+``SIFT_API_KEY``, and the error path that names missing credentials.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    import pytest
+
+
+class TestCredentials:
+    """The default ``sift_client`` fixture's resolution of env vars and ini keys."""
+
+    def test_uris_from_ini(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """URI env vars unset: `sift_client` uses the ini gRPC URI and the env API key."""
+        monkeypatch.setenv("SIFT_API_KEY", "env-key")
+        monkeypatch.delenv("SIFT_GRPC_URI", raising=False)
+        monkeypatch.delenv("SIFT_REST_URI", raising=False)
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            sift_offline = true
+
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_credentials_loaded(sift_client):
+                cfg = sift_client.grpc_client._config
+                assert cfg.api_key == "env-key"
+                assert "ini-grpc:1234" in cfg.uri
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_env_var_overrides_ini_uri(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """When both `SIFT_GRPC_URI` and the ini key set a gRPC URI, the env var wins."""
+        monkeypatch.setenv("SIFT_API_KEY", "env-key")
+        monkeypatch.setenv("SIFT_GRPC_URI", "env-grpc:9999")
+        monkeypatch.delenv("SIFT_REST_URI", raising=False)
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            sift_offline = true
+
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_env_wins(sift_client):
+                assert "env-grpc:9999" in sift_client.grpc_client._config.uri
+            """
+        )
+        result = pytester.runpytest_subprocess()
+        result.assert_outcomes(passed=1)
+
+    def test_api_key_ignored_from_ini(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An ini `sift_api_key` is ignored: the run fails and names `SIFT_API_KEY`."""
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            monkeypatch.delenv(name, raising=False)
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.pytest.ini_options]
+            sift_api_key = "should-be-ignored"
+            sift_grpc_uri = "ini-grpc:1234"
+            sift_rest_uri = "https://ini-rest"
+            """
+        )
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0  # the inner session must not succeed
+        combined = "\n".join(result.outlines + result.errlines)
+        assert "SIFT_API_KEY" in combined, combined
+
+    def test_missing_credentials_named_in_error(
+        self,
+        pytester: pytest.Pytester,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """With no credentials set, the run fails and the output names all three variables."""
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            monkeypatch.delenv(name, raising=False)
+        write_plugin_conftest()
+        pytester.makepyfile("def test_should_not_run(): pass")
+        result = pytester.runpytest_subprocess()
+        assert result.ret != 0  # the inner session must not succeed
+        combined = "\n".join(result.outlines + result.errlines)
+        for name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            assert name in combined, combined
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
new file mode 100644
index 000000000..263ac03ac
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_disabled.py
@@ -0,0 +1,168 @@
+"""Tests for ``--sift-disabled`` mode.
+
+Disabled mode skips Sift entirely. Autouse fixtures yield stub objects so
+test code that calls ``step.measure(...)`` keeps working without any Sift
+configuration; ``measure*`` evaluates bounds locally and returns the real
+pass/fail boolean. Nothing reaches Sift and no log file is written.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestDisabledMode:
+    def test_in_bounds_passes_out_of_bounds_fails(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """`measure*` / `report_outcome` return the locally evaluated pass/fail boolean."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_passes_in_bounds(step):
+                assert step.measure(name="v", value=5.0, bounds={"min": 4.8, "max": 5.2})
+
+            def test_fails_out_of_bounds(step):
+                assert step.measure(name="v", value=99.0, bounds={"max": 5.2}) is False
+
+            def test_substep_and_report_outcome(step):
+                with step.substep(name="inner") as inner:
+                    assert inner.report_outcome(name="ok", result=True) is True
+
+            def test_string_bounds(step):
+                assert step.measure(name="fw", value="1.0", bounds="1.0") is True
+                assert step.measure(name="fw", value="1.0", bounds="2.0") is False
+
+            def test_measure_avg(step):
+                assert step.measure_avg(
+                    name="bus", values=[4.97, 5.01, 5.03], bounds={"min": 4.9, "max": 5.1}
+                ) is True
+
+            def test_measure_all_outlier(step):
+                assert step.measure_all(
+                    name="p", values=[10.1, 10.2, 99.9], bounds={"max": 11.0}
+                ) is False
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=6)  # one per inner test function above
+
+    def test_disabled_does_not_require_credentials(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Disabled mode never reads SIFT_* env vars; runs cleanly without them."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_supersedes_offline(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``--sift-disabled`` wins when combined with ``--sift-offline``.
+
+        Disabled is the "skip Sift entirely" hammer; passing it alongside
+        offline shouldn't error. The session runs without credentials, without
+        a log file, and without the offline-mode replay machinery.
+        """
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_runs(step):
+                assert step.measure(name="v", value=5.0, bounds={"max": 10.0}) is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled", "--sift-offline")
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_yields_stub_fixtures(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """`report_context` / `step` are real instances with `is_simulated` set."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            from sift_client.util.test_results import ReportContext
+            from sift_client.util.test_results.context_manager import NewStep
+
+            def test_types(step, report_context):
+                assert isinstance(report_context, ReportContext)
+                assert report_context.is_simulated is True
+                assert report_context.report.is_simulated is True
+                assert step.current_step.is_simulated is True
+                assert isinstance(step, NewStep)
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
+
+    def test_disabled_writes_no_log_file_even_when_path_pinned(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Disabled mode skips the log-file pipeline even when a path is pinned."""
+        log_path = tmp_path / "should-not-exist.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        result = pytester.runpytest_subprocess("--sift-disabled", f"--sift-log-file={log_path}")
+        result.assert_outcomes(passed=1)
+        assert not log_path.exists(), f"log file unexpectedly created at {log_path}"
+
+    def test_disabled_skips_client_has_connection_and_sift_client(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """Disabled mode never resolves ``client_has_connection`` or ``sift_client``.
+
+        The plugin's ``report_context`` short-circuits to the stub before
+        consulting either fixture. Overrides that raise on resolution stay
+        un-triggered, so the inner test passes cleanly.
+        """
+        pytester.makeconftest(
+            """
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                raise AssertionError("sift_client should not resolve in disabled mode")
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                raise AssertionError(
+                    "client_has_connection should not resolve in disabled mode"
+                )
+            """
+        )
+        pytester.makepyfile(
+            """
+            def test_runs(step):
+                assert step.measure(name="v", value=5.0, bounds={"max": 10.0}) is True
+            """
+        )
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
new file mode 100644
index 000000000..18b03c194
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_hierarchy.py
@@ -0,0 +1,1369 @@
+"""Tests for the plugin's hierarchy-step nesting behavior.
+
+Covers every layer the plugin opens parent steps for — packages, modules,
+classes (including nested), parametrize axes — plus the ini opt-out flags,
+failure-cleanup semantics, and the drain helper.
+
+Each test spins up an inner pytest run via ``pytester`` configured with
+``--sift-offline`` and a known log path. The plugin writes every test-result
+API call to that JSONL log, and the outer test parses it via
+``_step_status_capture.load_steps`` to reconstruct the step tree.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from textwrap import dedent
+from types import SimpleNamespace
+from typing import TYPE_CHECKING
+
+import pytest
+
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
+from sift_client.sift_types.test_report import TestStatus
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def _parse_ts(ts: str) -> datetime:
+    """Convert a protobuf-JSON RFC3339 timestamp into an aware UTC ``datetime``.
+
+    Works on Python 3.8-3.14: ``datetime.fromisoformat`` only accepts ``Z`` and
+    arbitrary fractional digits from 3.11, so the whole-second part goes through
+    ``strptime`` and the fraction (protobuf emits 0/3/6/9 digits) is added by hand.
+    """
+    # Drop a trailing ``Z`` or ``+hh:mm`` suffix, then split off the fraction.
+    seconds, _sep, fraction = ts.rstrip("Z").split("+", 1)[0].partition(".")
+    stamp = datetime.strptime(seconds, "%Y-%m-%dT%H:%M:%S")
+    # Pad or truncate the fraction to six digits (microseconds); empty means zero.
+    micros = int(fraction.ljust(6, "0")[:6]) if fraction else 0
+    # All Sift timestamps are UTC; tag it so comparisons stay unambiguous.
+    return stamp.replace(microsecond=micros, tzinfo=timezone.utc)
+
+
+_INNER_CONFTEST = 'pytest_plugins = ["sift_client.pytest_plugin"]\n'  # registers the plugin only
+
+
+def _base_ini_lines(log_path: Path) -> list[str]:
+    """Build the ini lines every inner pytester run starts from."""
+    # Offline mode with the log pinned to ``log_path``; git metadata switched off.
+    lines = ["[pytest]"]
+    lines.append("sift_offline = true")
+    lines.append(f"sift_log_file = {log_path}")
+    lines.append("sift_git_metadata = false")
+    return lines
+
+
+@pytest.fixture
+def log_file(pytester: pytest.Pytester) -> Path:
+    pytester.makeconftest(_INNER_CONFTEST)  # register the plugin for the inner run
+    sift_log = pytester.path / "sift.log"
+    pytester.makefile(".ini", pytest="".join(f"{ln}\n" for ln in _base_ini_lines(sift_log)))
+    return sift_log
+
+
+def _by_name(steps: list[dict]) -> dict[str, list[dict]]:
+    grouped: dict[str, list[dict]] = {step["name"]: [] for step in steps}  # first-seen order
+    for step in steps:
+        grouped[step["name"]].append(step)
+    return grouped
+
+
+def _ancestor_names(steps: list[dict], leaf: dict) -> list[str]:
+    """Collect step names from ``leaf`` up to the root by following ``parent_step_id``."""
+    index = {step["id"]: step for step in steps}
+    names: list[str] = []
+    node: dict | None = leaf
+    while node is not None:
+        names.append(node["name"])
+        # A falsy parent id marks the root; an id absent from ``steps`` also ends the walk.
+        node = index.get(node["parent_step_id"]) if node["parent_step_id"] else None
+    return names
+
+
+def test_class_methods_cluster_under_class_step(pytester: pytest.Pytester, log_file: Path) -> None:
+    """Two methods of one class parent to a single shared class step."""
+    pytester.makepyfile(
+        test_klass=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(capture.load_steps(log_file))
+    assert len(grouped["TestFoo"]) == 1
+    parent_id = grouped["TestFoo"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == parent_id
+    assert grouped["test_b"][0]["parent_step_id"] == parent_id
+
+
+def test_collection_skipped_method_nests_under_its_class(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A method skipped via ``@pytest.mark.skip`` still nests under its class step.
+
+    ``@pytest.mark.skip`` is evaluated before the autouse fixtures run, so the
+    skipped item's step comes from the makereport hook rather than the ``step``
+    fixture. The report-tree parents live off the step stack, so that inline step
+    must still resolve and attach to the class parent rather than the report root.
+    ``-p no:randomly`` pins the order so the non-skipped sibling opens the class first.
+    """
+    pytester.makepyfile(
+        test_skip_nest=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                def test_run(self):
+                    pass
+
+                @pytest.mark.skip(reason="x")
+                def test_skipped(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    run.assert_outcomes(passed=1, skipped=1)
+    grouped = _by_name(capture.load_steps(log_file))
+    assert len(grouped["TestFoo"]) == 1
+    parent_id = grouped["TestFoo"][0]["id"]
+    assert grouped["test_run"][0]["parent_step_id"] == parent_id
+    assert grouped["test_skipped"][0]["parent_step_id"] == parent_id
+    assert grouped["test_skipped"][0]["statuses"][-1] == TestStatus.SKIPPED
+
+
+def test_nested_classes_produce_nested_steps(pytester: pytest.Pytester, log_file: Path) -> None:
+    """A nested class opens its own step beneath the outer class's step."""
+    pytester.makepyfile(
+        test_nested=dedent(
+            """
+            class TestOuter:
+                class TestInner:
+                    def test_a(self):
+                        pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=1)
+    recorded = capture.load_steps(log_file)
+    grouped = _by_name(recorded)
+    assert len(grouped["TestOuter"]) == 1
+    assert len(grouped["TestInner"]) == 1
+    assert _ancestor_names(recorded, grouped["test_a"][0]) == [
+        "test_a",
+        "TestInner",
+        "TestOuter",
+        "test_nested.py",
+    ]
+
+
+def test_class_parametrize_nests_under_class(pytester: pytest.Pytester, log_file: Path) -> None:
+    """Parametrize leaves nest under the function step, which nests under the class step."""
+    pytester.makepyfile(
+        test_cp=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(capture.load_steps(log_file))
+    owner_id = grouped["TestFoo"][0]["id"]
+    function_id = grouped["test_a"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == owner_id
+    assert grouped["v=1"][0]["parent_step_id"] == function_id
+    assert grouped["v=2"][0]["parent_step_id"] == function_id
+
+
+def test_two_sibling_classes_in_module(pytester: pytest.Pytester, log_file: Path) -> None:
+    """Two classes in one module each open one step directly under the module step."""
+    pytester.makepyfile(
+        test_sib=dedent(
+            """
+            class TestA:
+                def test_x(self):
+                    pass
+
+            class TestB:
+                def test_y(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(capture.load_steps(log_file))
+    module_id = grouped["test_sib.py"][0]["id"]
+    assert grouped["TestA"][0]["parent_step_id"] == module_id
+    assert grouped["TestB"][0]["parent_step_id"] == module_id
+    # Sanity: each class is opened exactly once (no duplicate parents).
+    assert len(grouped["TestA"]) == 1
+    assert len(grouped["TestB"]) == 1
+
+
+def test_mixed_class_and_free_function(pytester: pytest.Pytester, log_file: Path) -> None:
+    """A class and a free function in one module both hang off the module step."""
+    pytester.makepyfile(
+        test_mix=dedent(
+            """
+            class TestA:
+                def test_x(self):
+                    pass
+
+            def test_free():
+                pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(capture.load_steps(log_file))
+    module_id = grouped["test_mix.py"][0]["id"]
+    # Class method parents to TestA; free function parents directly to module.
+    assert grouped["TestA"][0]["parent_step_id"] == module_id
+    assert grouped["test_x"][0]["parent_step_id"] == grouped["TestA"][0]["id"]
+    assert grouped["test_free"][0]["parent_step_id"] == module_id
+
+
+def test_class_with_all_excluded_methods_no_class_step(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """When every method is ``sift_exclude``-marked, neither class nor methods get steps."""
+    pytester.makepyfile(
+        test_excl=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.sift_exclude
+                def test_a(self):
+                    pass
+
+                @pytest.mark.sift_exclude
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2)
+    grouped = _by_name(capture.load_steps(log_file))
+    assert "TestFoo" not in grouped
+    assert "test_a" not in grouped
+    assert "test_b" not in grouped
+
+
+def test_sift_exclude_on_class_propagates(pytester: pytest.Pytester, log_file: Path) -> None:
+    """A class-level ``sift_exclude`` marker suppresses the class step and every method step."""
+    pytester.makepyfile(
+        test_clsexcl=dedent(
+            """
+            import pytest
+
+            @pytest.mark.sift_exclude
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    for excluded in ("TestFoo", "test_a", "test_b"):  # every method, not only the first
+        assert excluded not in by_name
+
+
+def test_class_docstring_becomes_step_description(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_doc=dedent(
+            '''
+            class TestFoo:
+                """Class docstring."""
+
+                def test_a(self):
+                    pass
+            '''
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # NOTE(review): despite the test name, the description text is not asserted.
+    # Per the original note, the capture layer only records name/parent/path/id
+    # for a created step, so this only checks that the class with a docstring
+    # still appears in the leaf's ancestor chain. TODO: assert the description.
+    leaf = by_name["test_a"][0]
+    assert _ancestor_names(steps, leaf)[:3] == ["test_a", "TestFoo", "test_doc.py"]
+
+
+def test_two_class_chains_keep_parametrize_isolated(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Parametrize parents of two sibling classes never leak into each other's chain."""
+    pytester.makepyfile(
+        test_trans=dedent(
+            """
+            import pytest
+
+            class TestA:
+                @pytest.mark.parametrize("v", [1])
+                def test_x(self, v):
+                    pass
+
+            class TestB:
+                @pytest.mark.parametrize("w", [2])
+                def test_y(self, w):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2)
+    recorded = capture.load_steps(log_file)
+    grouped = _by_name(recorded)
+    # Each class opens exactly once; parametrize parents under the right class.
+    assert len(grouped["TestA"]) == 1
+    assert len(grouped["TestB"]) == 1
+    # Each parametrize leaf hangs off its own function step.
+    assert grouped["v=1"][0]["parent_step_id"] == grouped["test_x"][0]["id"]
+    assert grouped["w=2"][0]["parent_step_id"] == grouped["test_y"][0]["id"]
+    # Confirm full chain: leaves trace up through correct class.
+    lineage_v = _ancestor_names(recorded, grouped["v=1"][0])
+    lineage_w = _ancestor_names(recorded, grouped["w=2"][0])
+    assert "TestA" in lineage_v
+    assert "TestB" not in lineage_v
+    assert "TestB" in lineage_w
+    assert "TestA" not in lineage_w
+
+
+# ---------------------------------------------------------------------------
+# Failure-cleanup tests
+# ---------------------------------------------------------------------------
+
+
+class _FakeParent:
+    """Test double for an open ``NewStep`` parent held in the plugin registries."""
+
+    def __init__(self, name: str, step_path: str, *, raises: str | None = None) -> None:
+        self.closed = False  # flipped to True by a successful ``__exit__``
+        self._raises = raises  # message for the RuntimeError ``__exit__`` raises, if set
+        self.current_step = SimpleNamespace(name=name, step_path=step_path)
+
+    def __exit__(self, *_exc_info: object) -> None:
+        if self._raises is not None:
+            raise RuntimeError(self._raises)
+        self.closed = True
+
+
+@pytest.fixture
+def clean_parent_registries():
+    """Isolate the module-level parent registries and ``REPORT_CONTEXT`` for one test.
+
+    The ``finalize_parents`` resilience test pokes the globals directly, so they
+    are emptied for the test and put back afterwards. Registries and
+    ``finalize_parents`` live in ``_internal.pytest_plugin.steps``; ``REPORT_CONTEXT``
+    is the public session global on ``sift_client.pytest_plugin``. Yields ``steps``.
+    """
+    from sift_client import pytest_plugin
+    from sift_client._internal.pytest_plugin import steps
+
+    # Snapshot copies, so clearing the live dicts below does not lose their contents.
+    prior_hierarchy = dict(steps.hierarchy_parents)
+    prior_parametrize = dict(steps.parametrize_parents)
+    prior_context = pytest_plugin.REPORT_CONTEXT
+    for registry in (steps.hierarchy_parents, steps.parametrize_parents):
+        registry.clear()
+    pytest_plugin.REPORT_CONTEXT = None  # skip the end_time override lookup
+    try:
+        yield steps
+    finally:
+        # Restore in place: the registries stay the same dict objects throughout.
+        steps.hierarchy_parents.clear()
+        steps.hierarchy_parents.update(prior_hierarchy)
+        steps.parametrize_parents.clear()
+        steps.parametrize_parents.update(prior_parametrize)
+        pytest_plugin.REPORT_CONTEXT = prior_context
+
+
+def test_finalize_parents_continues_past_failing_exit(clean_parent_registries) -> None:
+    """A parent whose ``__exit__`` raises is reported as a warning; the others still close."""
+    from sift_client.pytest_plugin import SiftPytestStepDrainWarning
+
+    registries = clean_parent_registries
+    healthy = _FakeParent("good", "1")
+    registries.hierarchy_parents["good"] = healthy
+    # The second parent raises RuntimeError("boom") from ``__exit__``.
+    registries.parametrize_parents[("t", "bad")] = _FakeParent("bad", "1.1", raises="boom")
+
+    with pytest.warns(SiftPytestStepDrainWarning, match="boom"):
+        registries.finalize_parents()
+
+    assert healthy.closed
+    # Registries cleared regardless of the per-parent failure.
+    assert registries.hierarchy_parents == {}
+    assert registries.parametrize_parents == {}
+
+
+def test_failing_test_in_class_does_not_orphan_class_step(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A failing class method must not stop its class step from being cleaned up.
+
+    The remaining methods of that class must still parent to the same class
+    step, and a later class in the module must open as a sibling (not nested
+    under an orphan).
+    """
+    pytester.makepyfile(
+        test_fail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    run = pytester.runpytest_inprocess("-v")
+    run.assert_outcomes(passed=2, failed=1)
+    grouped = _by_name(capture.load_steps(log_file))
+    # Each class step is recorded exactly once.
+    assert len(grouped["TestFoo"]) == 1
+    assert len(grouped["TestBar"]) == 1
+    first_class_id = grouped["TestFoo"][0]["id"]
+    second_class_id = grouped["TestBar"][0]["id"]
+    module_id = grouped["test_fail.py"][0]["id"]
+    assert grouped["test_a"][0]["parent_step_id"] == first_class_id
+    assert grouped["test_b"][0]["parent_step_id"] == first_class_id
+    assert grouped["test_c"][0]["parent_step_id"] == second_class_id
+    # Both classes are siblings under the same module — TestBar didn't get
+    # nested under an orphan TestFoo.
+    assert grouped["TestFoo"][0]["parent_step_id"] == module_id
+    assert grouped["TestBar"][0]["parent_step_id"] == module_id
+
+
+def test_failing_parametrized_method_in_class_closes_full_chain(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A failing parametrized class method must not orphan its parametrize parents."""
+    pytester.makepyfile(
+        test_pfail=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    if v == 1:
+                        raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2, failed=1)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    foo_id = by_name["TestFoo"][0]["id"]
+    test_a_id = by_name["test_a"][0]["id"]
+    # Both parametrize leaves parent to the same test_a; test_b parents
+    # directly to TestFoo (no parametrize parent leaked across methods).
+    assert by_name["v=1"][0]["parent_step_id"] == test_a_id
+    assert by_name["v=2"][0]["parent_step_id"] == test_a_id
+    assert by_name["test_b"][0]["parent_step_id"] == foo_id
+
+
+# ---------------------------------------------------------------------------
+# Opt-out flag tests
+# ---------------------------------------------------------------------------
+
+
+def _write_ini(pytester: pytest.Pytester, log_file: Path, **overrides: object) -> None:
+    """Write ``pytest.ini``: the ``log_file`` fixture's offline/log/git-metadata
+    defaults first, then one ``key = value`` line per ``sift_*`` override.
+    """
+    ini_lines = _base_ini_lines(log_file)
+    ini_lines.extend(f"{key} = {value}" for key, value in overrides.items())
+    ini_text = "\n".join(ini_lines) + "\n"
+    pytester.makefile(".ini", pytest=ini_text)
+
+
+def test_sift_class_step_false_skips_class_steps(pytester: pytest.Pytester, log_file: Path) -> None:
+    _write_ini(pytester, log_file, sift_class_step="false")
+    pytester.makepyfile(
+        test_noclass=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    assert "TestFoo" not in by_name
+    mod_id = by_name["test_noclass.py"][0]["id"]
+    assert by_name["test_a"][0]["parent_step_id"] == mod_id
+    assert by_name["test_b"][0]["parent_step_id"] == mod_id
+
+
+def test_sift_module_step_false_skips_module_step(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    _write_ini(pytester, log_file, sift_module_step="false")
+    pytester.makepyfile(
+        test_nomod=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    assert "test_nomod.py" not in by_name
+    # TestFoo attaches to the report root (no parent recorded by the fake).
+    assert by_name["TestFoo"][0]["parent_step_id"] is None
+    assert by_name["test_a"][0]["parent_step_id"] == by_name["TestFoo"][0]["id"]
+
+
+def test_sift_parametrize_nesting_false_keeps_flat_leaves(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    _write_ini(pytester, log_file, sift_parametrize_nesting="false")
+    pytester.makepyfile(
+        test_flat=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [1, 2])
+            def test_a(v):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # No parametrize parent step.
+    assert "test_a" not in by_name
+    assert "v=1" not in by_name
+    # Leaves use the bracket-mangled pytest names.
+    assert "test_a[1]" in by_name
+    assert "test_a[2]" in by_name
+    mod_id = by_name["test_flat.py"][0]["id"]
+    assert by_name["test_a[1]"][0]["parent_step_id"] == mod_id
+    assert by_name["test_a[2]"][0]["parent_step_id"] == mod_id
+
+
+def test_sift_module_step_false_still_drains_across_modules(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """sift_module_step=false must not merge same-named classes across modules.
+
+    The hierarchy chain always includes the module ancestor for identity
+    (even when it's not rendered as a step), so two modules each declaring
+    ``class TestFoo`` produce two distinct ``TestFoo`` frames in the diff.
+    """
+    _write_ini(pytester, log_file, sift_module_step="false")
+    pytester.makepyfile(
+        test_a=dedent(
+            """
+            class TestFoo:
+                def test_x(self):
+                    pass
+            """
+        ),
+        test_b=dedent(
+            """
+            class TestFoo:
+                def test_y(self):
+                    pass
+            """
+        ),
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # Two distinct TestFoo class steps — one per module — not a shared frame.
+    assert len(by_name["TestFoo"]) == 2
+    foo_ids = {s["id"] for s in by_name["TestFoo"]}
+    # Each test method parents to a different TestFoo id.
+    test_x_parent = by_name["test_x"][0]["parent_step_id"]
+    test_y_parent = by_name["test_y"][0]["parent_step_id"]
+    assert test_x_parent in foo_ids
+    assert test_y_parent in foo_ids
+    assert test_x_parent != test_y_parent
+
+
+def test_package_step_default_opens_for_init_dirs(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Default: a directory with ``__init__.py`` produces a parent package step."""
+    pytester.mkpydir("pkg_a")
+    (pytester.path / "pkg_a" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    assert "pkg_a" in by_name
+    pkg_id = by_name["pkg_a"][0]["id"]
+    mod = by_name["test_x.py"][0]
+    assert mod["parent_step_id"] == pkg_id
+
+
+def test_same_named_packages_in_different_dirs_do_not_merge(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Two packages with the same display name but different paths must stay distinct.
+
+    The hierarchy diff compares on ``nodeid`` (identity), not just the
+    display name — so a ``utils`` package under ``proj_a/`` and another
+    under ``proj_b/`` (where ``proj_a/`` and ``proj_b/`` are bare
+    directories that pytest treats as ``pytest.Dir`` nodes and the chain
+    walker skips) produce two distinct ``utils`` parent steps in the report
+    tree, not a silent merge.
+    """
+    (pytester.path / "proj_a" / "utils").mkdir(parents=True)
+    (pytester.path / "proj_a" / "utils" / "__init__.py").touch()
+    (pytester.path / "proj_a" / "utils" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    (pytester.path / "proj_b" / "utils").mkdir(parents=True)
+    (pytester.path / "proj_b" / "utils" / "__init__.py").touch()
+    (pytester.path / "proj_b" / "utils" / "test_y.py").write_text(
+        dedent(
+            """
+            def test_two():
+                pass
+            """
+        )
+    )
+    # ``importlib`` import mode is required so two packages with the same
+    # name on disk don't collide during sys.path-based import.
+    result = pytester.runpytest_inprocess("-v", "--import-mode=importlib")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # Two distinct ``utils`` package steps — one per project.
+    assert len(by_name["utils"]) == 2
+    utils_ids = {s["id"] for s in by_name["utils"]}
+    # Each module step parents to a different ``utils`` instance.
+    parent_x = by_name["test_x.py"][0]["parent_step_id"]
+    parent_y = by_name["test_y.py"][0]["parent_step_id"]
+    assert parent_x in utils_ids
+    assert parent_y in utils_ids
+    assert parent_x != parent_y
+
+
+def test_sift_package_step_false_skips_package_steps(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """With ``sift_package_step=false`` the directory step is suppressed."""
+    _write_ini(pytester, log_file, sift_package_step="false")
+    pytester.mkpydir("pkg_a")
+    (pytester.path / "pkg_a" / "test_x.py").write_text(
+        dedent(
+            """
+            def test_one():
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    assert "pkg_a" not in by_name
+    # The module step still opens and is now the top-level frame.
+    assert by_name["test_x.py"][0]["parent_step_id"] is None
+
+
+def test_all_three_flags_false_matches_legacy_behavior(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    _write_ini(
+        pytester,
+        log_file,
+        sift_module_step="false",
+        sift_class_step="false",
+        sift_parametrize_nesting="false",
+    )
+    pytester.makepyfile(
+        test_legacy=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # No module, class, or parametrize parents — just bracket-mangled leaves.
+    assert "test_legacy.py" not in by_name
+    assert "TestFoo" not in by_name
+    assert "test_a" not in by_name
+    assert "test_a[1]" in by_name
+    assert "test_a[2]" in by_name
+    assert by_name["test_a[1]"][0]["parent_step_id"] is None
+    assert by_name["test_a[2]"][0]["parent_step_id"] is None
+
+
+# ---------------------------------------------------------------------------
+# Parametrize nesting
+# ---------------------------------------------------------------------------
+
+
+def test_single_parametrize_clusters_under_originalname(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_rail=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [3.3, 5.0])
+            def test_rail(v):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # Module step + one shared `test_rail` parent + two leaves.
+    assert len(by_name["test_rail.py"]) == 1
+    assert len(by_name["test_rail"]) == 1
+    assert len(by_name["v=3.3"]) == 1
+    assert len(by_name["v=5.0"]) == 1
+    test_rail_id = by_name["test_rail"][0]["id"]
+    assert by_name["v=3.3"][0]["parent_step_id"] == test_rail_id
+    assert by_name["v=5.0"][0]["parent_step_id"] == test_rail_id
+
+
+def test_stacked_parametrize_nests_outer_to_inner(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_iso=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("voltage", ["high", "low"])
+            @pytest.mark.parametrize("component", ["motor", "ducer"])
+            def test_iso(voltage, component):
+                pass
+            """
+        )
+    )
+    outcome = pytester.runpytest_inprocess("-v")
+    outcome.assert_outcomes(passed=4)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # One `test_iso` parent, two `voltage='…'` parents, four `component='…'` leaves.
+    expected_counts = {
+        "test_iso": 1,
+        "voltage='high'": 1,
+        "voltage='low'": 1,
+        "component='motor'": 2,  # one per voltage
+        "component='ducer'": 2,
+    }
+    for step_name, count in expected_counts.items():
+        assert len(by_name[step_name]) == count
+    root_id = by_name["test_iso"][0]["id"]
+    voltage_steps = [by_name["voltage='high'"][0], by_name["voltage='low'"][0]]
+    for voltage_step in voltage_steps:  # both hang directly off `test_iso`
+        assert voltage_step["parent_step_id"] == root_id
+    # Each component leaf parents to one of the voltage parents.
+    voltage_ids = {v["id"] for v in voltage_steps}
+    for leaf in by_name["component='motor'"] + by_name["component='ducer'"]:
+        assert leaf["parent_step_id"] in voltage_ids
+
+
+def test_fixture_parametrization_participates(pytester: pytest.Pytester, log_file: Path) -> None:
+    pytester.makepyfile(
+        test_widget=dedent(
+            """
+            import pytest
+
+            @pytest.fixture(params=["a", "b"])
+            def widget(request):
+                return request.param
+
+            def test_widget(widget):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    assert len(by_name["test_widget"]) == 1
+    parent_id = by_name["test_widget"][0]["id"]
+    assert by_name["widget='a'"][0]["parent_step_id"] == parent_id
+    assert by_name["widget='b'"][0]["parent_step_id"] == parent_id
+
+
+def test_module_boundary_isolates_parametrize_stack(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_a=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("v", [1, 2])
+            def test_one(v):
+                pass
+            """
+        ),
+        test_b=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("w", ["x", "y"])
+            def test_two(w):
+                pass
+            """
+        ),
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=4)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # Each module step contains its own `test_one`/`test_two` parametrize subtree.
+    mod_a = by_name["test_a.py"][0]
+    mod_b = by_name["test_b.py"][0]
+    assert by_name["test_one"][0]["parent_step_id"] == mod_a["id"]
+    assert by_name["test_two"][0]["parent_step_id"] == mod_b["id"]
+
+
+def test_leaf_parent_chain_terminates_at_report(pytester: pytest.Pytester, log_file: Path) -> None:
+    pytester.makepyfile(
+        test_chain=dedent(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("a", [1])
+            @pytest.mark.parametrize("b", ["x"])
+            def test_chain(a, b):
+                pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1)
+    steps = capture.load_steps(log_file)
+    leaf = next(s for s in steps if s["name"].startswith("b="))
+    chain = _ancestor_names(steps, leaf)
+    # leaf b=… → a=… → test_chain → test_chain.py (module step) → root
+    assert chain == ["b='x'", "a=1", "test_chain", "test_chain.py"]
+
+
+# ---------------------------------------------------------------------------
+# Order independence
+# ---------------------------------------------------------------------------
+
+
+def test_interleaved_execution_does_not_duplicate_parents(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Sibling methods need not run contiguously to share one class parent.
+
+    A conftest hook interleaves the two classes' methods
+    (``A::a1, B::b1, A::a2, B::b2``) — the order the removed sort used to
+    forbid, and the order pytest's own fixture-scope reordering can produce.
+    Each class must still open exactly once and every method parent to the
+    right class.
+    """
+    # Overwrite the conftest with one that registers the plugin AND reorders
+    # items so the two classes interleave. The log_file fixture's pytest.ini
+    # (offline + log path) still applies.
+    pytester.makeconftest(
+        dedent(
+            """
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+            def pytest_collection_modifyitems(config, items):
+                a = [i for i in items if "TestA::" in i.nodeid]
+                b = [i for i in items if "TestB::" in i.nodeid]
+                interleaved = []
+                for x, y in zip(a, b):
+                    interleaved.append(x)
+                    interleaved.append(y)
+                items[:] = interleaved
+            """
+        )
+    )
+    pytester.makepyfile(
+        test_inter=dedent(
+            """
+            class TestA:
+                def test_a1(self):
+                    pass
+
+                def test_a2(self):
+                    pass
+
+            class TestB:
+                def test_b1(self):
+                    pass
+
+                def test_b2(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=4)
+    steps = capture.load_steps(log_file)
+    by_name = _by_name(steps)
+    # Each class opens exactly once despite the interleaved run order.
+    assert len(by_name["TestA"]) == 1
+    assert len(by_name["TestB"]) == 1
+    a_id = by_name["TestA"][0]["id"]
+    b_id = by_name["TestB"][0]["id"]
+    assert by_name["test_a1"][0]["parent_step_id"] == a_id
+    assert by_name["test_a2"][0]["parent_step_id"] == a_id
+    assert by_name["test_b1"][0]["parent_step_id"] == b_id
+    assert by_name["test_b2"][0]["parent_step_id"] == b_id
+
+
+# ---------------------------------------------------------------------------
+# Parent status resolution
+# ---------------------------------------------------------------------------
+
+
+def test_parent_status_passed_when_all_children_pass(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    pytester.makepyfile(
+        test_ok=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert by_name["TestFoo"][0]["statuses"][-1] == TestStatus.PASSED
+    assert by_name["test_ok.py"][0]["statuses"][-1] == TestStatus.PASSED
+
+
+def test_parent_status_failed_propagates_up_and_isolates_siblings(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A failing leaf marks its class and the module FAILED, but a sibling class
+    whose tests all pass stays PASSED.
+    """
+    pytester.makepyfile(
+        test_fail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2, failed=1)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert by_name["TestFoo"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["test_fail.py"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["TestBar"][0]["statuses"][-1] == TestStatus.PASSED
+
+
+def test_parent_status_failure_propagates_through_parametrize(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """One failing parametrization fails the whole chain: parametrize parent →
+    class → module.
+    """
+    pytester.makepyfile(
+        test_pfail=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.parametrize("v", [1, 2])
+                def test_a(self, v):
+                    if v == 1:
+                        raise AssertionError("boom")
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=1, failed=1)
+    by_name = _by_name(capture.load_steps(log_file))
+    assert by_name["test_a"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["TestFoo"][0]["statuses"][-1] == TestStatus.FAILED
+    assert by_name["test_pfail.py"][0]["statuses"][-1] == TestStatus.FAILED
+
+
+def test_parent_opens_in_progress_and_resolves_exactly_once(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A parent is created IN_PROGRESS and gets exactly one terminal status once
+    its last child finishes — it is not resolved after an earlier child and then reopened.
+
+    This locks in the "stay in-progress until every child is done, then resolve
+    once" behavior: a parent emits a CreateTestStep (IN_PROGRESS) and a single
+    UpdateTestStep (terminal), so its status timeline is exactly two entries.
+    """
+    pytester.makepyfile(
+        test_once=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    # Created in-progress, resolved once — no intermediate churn, no reopen.
+    assert by_name["TestFoo"][0]["statuses"] == [TestStatus.IN_PROGRESS, TestStatus.PASSED]
+    assert by_name["test_once.py"][0]["statuses"] == [TestStatus.IN_PROGRESS, TestStatus.PASSED]
+
+
+# ---------------------------------------------------------------------------
+# Parent timing
+# ---------------------------------------------------------------------------
+
+
+def test_parent_timing_spans_its_children(pytester: pytest.Pytester, log_file: Path) -> None:
+    """The [start, end] window of a parent encloses its subtree: start is at or
+    before the first child's start, end equals the last child's finish.
+    """
+    pytester.makepyfile(
+        test_span=dedent(
+            """
+            import time
+
+            class TestFoo:
+                def test_a(self):
+                    time.sleep(0.02)
+
+                def test_b(self):
+                    time.sleep(0.02)
+            """
+        )
+    )
+    outcome = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    outcome.assert_outcomes(passed=2)
+    by_name = _by_name(capture.load_steps(log_file))
+    class_step, module_step = by_name["TestFoo"][0], by_name["test_span.py"][0]
+    leaves = [by_name[leaf_name][0] for leaf_name in ("test_a", "test_b")]
+    first_leaf_start = min(_parse_ts(leaf["start_time"]) for leaf in leaves)
+    last_leaf_end = max(_parse_ts(leaf["end_time"]) for leaf in leaves)
+    class_start, class_end = (_parse_ts(class_step[k]) for k in ("start_time", "end_time"))
+
+    # Parent opened before (or with) its earliest child, and start precedes end.
+    assert class_start <= first_leaf_start
+    assert class_start <= class_end
+    # Parent end is exactly the latest descendant finish — not a session-end stamp.
+    assert class_end == last_leaf_end
+    # The module parent spans the class and rolls the same finish up a level.
+    assert _parse_ts(module_step["start_time"]) <= class_start
+    assert _parse_ts(module_step["end_time"]) == last_leaf_end
+
+
+def test_parent_end_time_reflects_a_later_child_under_interleaving(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """When a parent's children run non-contiguously, its end_time tracks the
+    LAST child to finish — even one that runs after a different parent's child.
+
+    Execution order is pinned to ``a1, b1, a2`` via a conftest hook, so
+    ``TestA``'s second child (``a2``) closes after ``TestB``'s child. ``TestA``
+    must end at ``a2``'s finish, not ``a1``'s.
+    """
+    pytester.makeconftest(
+        dedent(
+            """
+            pytest_plugins = ["sift_client.pytest_plugin"]
+            import pytest
+
+            _ORDER = ["test_a1", "test_b1", "test_a2"]
+
+            @pytest.hookimpl(trylast=True)
+            def pytest_collection_modifyitems(config, items):
+                # trylast so this runs after any reordering plugin and wins.
+                items.sort(key=lambda i: _ORDER.index(i.name) if i.name in _ORDER else 99)
+            """
+        )
+    )
+    pytester.makepyfile(
+        test_il=dedent(
+            """
+            import time
+
+            class TestA:
+                def test_a1(self):
+                    pass
+
+                def test_a2(self):
+                    time.sleep(0.02)
+
+            class TestB:
+                def test_b1(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=3)
+    by_name = _by_name(capture.load_steps(log_file))
+    a_end = by_name["TestA"][0]["end_time"]
+    a1_end = by_name["test_a1"][0]["end_time"]
+    a2_end = by_name["test_a2"][0]["end_time"]
+    # TestA ends at its later child (a2), not the one that happened to run first.
+    assert a_end == a2_end
+    assert a_end != a1_end
+
+
+# ---------------------------------------------------------------------------
+# Early close — parents resolve as soon as their descendants finish
+# ---------------------------------------------------------------------------
+
+
+def _index(
+    events: list[tuple],
+    request_type: str,
+    name: str,
+    *,
+    terminal: bool = False,
+    status: TestStatus | None = None,
+) -> int:
+    """Index of the first log event matching ``request_type`` and ``name``.
+
+    ``status`` matches that exact status; ``terminal`` matches any resolved
+    (non-``IN_PROGRESS``) status. Raises ``AssertionError`` when nothing matches.
+    """
+    for i, (rt, nm, st) in enumerate(events):
+        if rt != request_type or nm != name:
+            continue
+        if status is not None:
+            if st == status:
+                return i
+        elif not terminal or st != TestStatus.IN_PROGRESS:
+            return i
+    raise AssertionError(f"no {request_type} for {name!r} (status={status}, terminal={terminal})")
+
+
+_INTERLEAVE_CONFTEST = """
+pytest_plugins = ["sift_client.pytest_plugin"]
+import pytest
+
+_ORDER = ["test_a1", "test_b1", "test_a2"]
+
+@pytest.hookimpl(trylast=True)
+def pytest_collection_modifyitems(config, items):
+    # trylast so this wins over any reordering plugin; pins A::a1, B::b1, A::a2.
+    items.sort(key=lambda i: _ORDER.index(i.name) if i.name in _ORDER else 99)
+"""
+
+
+def test_parent_closes_mid_session_not_at_end(pytester: pytest.Pytester, log_file: Path) -> None:
+    """A container resolves as soon as its last child finishes — before the next
+    container even opens — rather than all flipping at session end.
+    """
+    pytester.makepyfile(
+        test_mid=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=3)
+    events = capture.log_events(log_file)
+    # TestFoo reaches a terminal status before TestBar is even created.
+    assert _index(events, "UpdateTestStep", "TestFoo", terminal=True) < _index(
+        events, "CreateTestStep", "TestBar"
+    )
+
+
+def test_failing_parent_resolves_failed_mid_session(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """Early close carries status too: a class with a failing test resolves FAILED
+    as soon as its subtree finishes, before the next class opens.
+    """
+    pytester.makepyfile(
+        test_midfail=dedent(
+            """
+            class TestFoo:
+                def test_a(self):
+                    raise AssertionError("boom")
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=1, failed=1)
+    events = capture.log_events(log_file)
+    foo_failed = _index(events, "UpdateTestStep", "TestFoo", status=TestStatus.FAILED)
+    assert foo_failed < _index(events, "CreateTestStep", "TestBar")
+
+
+def test_close_is_completion_driven_not_order_driven(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A single-child container closes the moment that child finishes, even though
+    a sibling container's test (collected earlier) runs afterward.
+
+    Order is pinned to ``a1, b1, a2``: ``TestB`` (only child ``b1``) must resolve
+    before ``test_a2`` runs, proving close is driven by descendant completion, not
+    by reaching some position in the item list.
+    """
+    pytester.makeconftest(_INTERLEAVE_CONFTEST)
+    pytester.makepyfile(
+        test_cd=dedent(
+            """
+            class TestA:
+                def test_a1(self):
+                    pass
+
+                def test_a2(self):
+                    pass
+
+            class TestB:
+                def test_b1(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v")
+    result.assert_outcomes(passed=3)
+    events = capture.log_events(log_file)
+    # TestB resolves before test_a2 is even created.
+    assert _index(events, "UpdateTestStep", "TestB", terminal=True) < _index(
+        events, "CreateTestStep", "test_a2"
+    )
+
+
+def test_excluded_sibling_does_not_stall_parent_close(
+    pytester: pytest.Pytester, log_file: Path
+) -> None:
+    """A ``sift_exclude``-d method is not counted toward its class's descendants,
+    so the class still closes promptly once its included tests finish.
+
+    If the excluded test inflated the count, ``TestFoo`` could never reach zero
+    and would only resolve at the session-end drain — i.e. after ``TestBar`` is
+    created. Asserting it resolves *before* ``TestBar`` proves the gate filter.
+    """
+    pytester.makepyfile(
+        test_excl_close=dedent(
+            """
+            import pytest
+
+            class TestFoo:
+                @pytest.mark.sift_exclude
+                def test_a(self):
+                    pass
+
+                def test_b(self):
+                    pass
+
+            class TestBar:
+                def test_c(self):
+                    pass
+            """
+        )
+    )
+    result = pytester.runpytest_inprocess("-v", "-p", "no:randomly")
+    result.assert_outcomes(passed=3)
+    events = capture.log_events(log_file)
+    assert _index(events, "UpdateTestStep", "TestFoo", terminal=True) < _index(
+        events, "CreateTestStep", "TestBar"
+    )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_offline.py b/python/lib/sift_client/_tests/pytest_plugin/test_offline.py
new file mode 100644
index 000000000..f0470bad3
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_offline.py
@@ -0,0 +1,135 @@
+"""Tests for ``--sift-offline`` mode.
+
+Offline mode routes every create/update through the JSONL log file without
+contacting Sift. The session-start ping is skipped, the import worker is not
+spawned, and missing ``SIFT_*`` env vars are tolerated (placeholders are
+filled). Offline + ``--sift-log-file=none`` is rejected as a
+usage error since the log file is the sole sink in this mode.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestOfflineMode:
+    def test_offline_runs_without_network(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An offline session with ``clear_sift_env`` applied still passes both inner tests."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_in_bounds(step):
+                assert step.measure(name="v", value=5.0, bounds={"min": 4.8, "max": 5.2})
+
+            def test_out_of_bounds(step):
+                assert step.measure(name="v", value=10.0, bounds={"max": 5.2}) is False
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-offline")
+        run.assert_outcomes(passed=2)
+
+    def test_log_file_none_incompatible_with_offline(
+        self,
+        pytester: pytest.Pytester,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Passing ``--sift-log-file=none`` together with ``--sift-offline`` must error out."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_should_not_run(): pass")
+        run = pytester.runpytest_subprocess("--sift-offline", "--sift-log-file=none")
+        assert run.ret != 0
+        output = "\n".join(run.outlines + run.errlines)
+        assert "incompatible with --sift-offline" in output, output
+
+    def test_offline_yields_real_fixtures(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Offline mode yields real fixtures, though log-file entities report ``is_simulated``."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            from sift_client.util.test_results import ReportContext
+            from sift_client.util.test_results.context_manager import NewStep
+
+            def test_types(step, report_context):
+                assert isinstance(report_context, ReportContext)
+                assert isinstance(step, NewStep)
+                assert report_context.client._simulate is False
+                # log-file mode synthesizes responses, so entities are flagged simulated.
+                assert report_context.is_simulated is True
+                assert step.current_step.is_simulated is True
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-offline")
+        run.assert_outcomes(passed=1)
+
+    def test_offline_writes_jsonl_to_pinned_log_file(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An offline run fills the file named by ``--sift-log-file`` with create entries."""
+        pinned_log = tmp_path / "run.jsonl"
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            def test_one(step):
+                assert step.measure(
+                    name="v", value=5.0, bounds={"min": 4.8, "max": 5.2}
+                ) is True
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={pinned_log}")
+        run.assert_outcomes(passed=1)
+        assert pinned_log.exists(), f"offline mode did not create {pinned_log}"
+        text = pinned_log.read_text()
+        assert text.strip(), "log file is empty"
+        # Non-blank lines have the shape ``[Operation:uuid] {json}``; a successful
+        # session writes at least one report-create and one step-create line.
+        entries = [entry for entry in text.splitlines() if entry.strip()]
+        assert any(entry.startswith("[CreateTestReport:") for entry in entries), text
+        assert any(entry.startswith("[CreateTestStep:") for entry in entries), text
+
+    def test_offline_skips_client_has_connection(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """``client_has_connection`` is never resolved in offline mode.
+
+        The conftest replaces the fixture with one that raises as soon as it is
+        resolved. The inner test passing cleanly therefore shows the override
+        never ran, i.e. the offline path skipped the ping check.
+        """
+        pytester.makeconftest(
+            """
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                raise AssertionError(
+                    "client_has_connection should not resolve in offline mode"
+                )
+            """
+        )
+        pytester.makepyfile("def test_runs(step): pass")
+        run = pytester.runpytest_subprocess("--sift-offline")
+        run.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_online.py b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
new file mode 100644
index 000000000..19a666d04
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_online.py
@@ -0,0 +1,140 @@
+"""Tests for online mode (the default).
+
+Online mode requires connectivity to Sift. The plugin pings via
+``client_has_connection`` at session start and aborts via ``pytest.exit`` on
+failure, so the message prints once before any test runs. Missing
+``SIFT_API_KEY`` / ``SIFT_GRPC_URI`` / ``SIFT_REST_URI`` env vars are reported
+as a usage error so the failure is actionable.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestOnlineMode:
+    def test_ping_failure_aborts(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+    ) -> None:
+        """A failing ping in online mode aborts the session with nothing having run."""
+        pytester.makeconftest(
+            """
+            import pytest
+            from unittest.mock import MagicMock
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                client = MagicMock()
+                client.ping.ping.side_effect = ConnectionError("unreachable")
+                return client
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_should_not_run():
+                assert True
+
+            @pytest.mark.sift_include
+            def test_should_not_run_either():
+                assert True
+            """
+        )
+        run = pytester.runpytest_subprocess()
+        assert run.ret != 0
+        output = "\n".join(run.outlines + run.errlines)
+        # ``pytest.exit`` fires during the first gated test's setup, so the message
+        # is printed a single time (not per test) and no test executes.
+        assert output.count("Sift ping failed") == 1, output
+        run.assert_outcomes()
+
+    def test_missing_env_vars_named_in_error(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The default ``sift_client`` fixture's failure output names every missing env var."""
+        write_plugin_conftest()
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_should_not_run():
+                pass
+            """
+        )
+        run = pytester.runpytest_subprocess()
+        assert run.ret != 0
+        output = "\n".join(run.outlines + run.errlines)
+        for env_name in ("SIFT_API_KEY", "SIFT_GRPC_URI", "SIFT_REST_URI"):
+            assert env_name in output, output
+
+    def test_online_resolves_client_has_connection_once(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+    ) -> None:
+        """``client_has_connection`` is resolved a single time per online session.
+
+        The conftest override increments a counter stored in a file, which this
+        outer test reads once the inner session has ended. Inner outcomes are
+        deliberately not asserted: a real ``ReportContext`` built on a
+        ``MagicMock`` client crashes downstream when Pydantic sees mock IDs, and
+        the ping path under test runs before that construction.
+        """
+        counter_file = tmp_path / "ping_calls.txt"
+        pytester.makeconftest(
+            f"""
+            from pathlib import Path
+            from unittest.mock import MagicMock
+
+            import pytest
+
+            pytest_plugins = ["sift_client.pytest_plugin"]
+
+            _COUNTER = Path({str(counter_file)!r})
+
+
+            @pytest.fixture(scope="session")
+            def sift_client():
+                return MagicMock()
+
+
+            @pytest.fixture(scope="session")
+            def client_has_connection():
+                prior = int(_COUNTER.read_text()) if _COUNTER.exists() else 0
+                _COUNTER.write_text(str(prior + 1))
+                return True
+            """
+        )
+        pytester.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.sift_include
+            def test_a(): pass
+
+            @pytest.mark.sift_include
+            def test_b(): pass
+            """
+        )
+        pytester.runpytest_subprocess()
+        assert counter_file.exists(), "client_has_connection was not resolved"
+        assert counter_file.read_text() == "1", (
+            f"expected session-scoped fixture to resolve once, got {counter_file.read_text()}"
+        )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
new file mode 100644
index 000000000..112ef4055
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_pass_fail.py
@@ -0,0 +1,638 @@
+"""Contract suite: maps each pytest exit path to the ``TestStatus`` the
+Sift pytest plugin is required to record on the outer step.
+
+Each scenario writes a tiny inner test file and runs it through pytester in
+``--sift-offline`` mode, so no fake client is needed: the plugin writes every
+create/update to a JSONL log, and the ``_step_status_capture`` helpers read
+that log back so this outer test can assert on what the plugin produced.
+
+Assertions encode the contract from
+``docs/guides/pytest_plugin/pass_fail_behavior.md``. Tests for scenarios the
+plugin does not yet handle correctly are expected to **fail today** — they
+are the punch list. ``lib/sift_client/_tests/pytest_plugin/step_status_states.md``
+tracks each scenario's observed-today behavior next to the target so the
+remaining gaps are visible without running the suite.
+"""
+
+from __future__ import annotations
+
+import textwrap
+
+import pytest
+
+from sift_client._tests.pytest_plugin import _step_status_capture as capture
+from sift_client.sift_types.test_report import TestStatus
+
+pytest_plugins = ["pytester"]
+
+
+_INNER_CONFTEST_SRC = '''
+"""Auto-generated conftest. Loading the Sift plugin is the only thing the
+inner session needs. ``--sift-offline`` on the CLI causes the plugin's
+default ``sift_client`` fixture to construct a placeholder client and the
+real ``ReportContext`` writes every API call to the JSONL log without
+contacting Sift.
+"""
+
+pytest_plugins = ["sift_client.pytest_plugin"]
+'''
+
+
+@pytest.fixture
+def inner(pytester):
+    """Write ``_INNER_CONFTEST_SRC`` as the inner session's conftest; return ``pytester``."""
+    pytester.makeconftest(_INNER_CONFTEST_SRC)
+    return pytester
+
+
+# Prepended to every inner test file. Pytest skips marker-based ``skip`` items
+# before any autouse fixture runs, which would leave ``REPORT_CONTEXT`` unset
+# and the plugin's inline-skip recording inert. A single passing item up-front
+# forces ``report_context`` to initialize so the makereport hook can record
+# the skip into the same session's JSONL.
+_WARMUP = "def test_sift_warmup(): pass\n\n"
+
+
+def _run(pytester, body: str) -> None:
+    """Write ``body`` after the warmup item and run it offline, logging to ``sift.log``."""
+    jsonl_path = pytester.path / "sift.log"
+    pytester.makepyfile(_WARMUP + textwrap.dedent(body))
+    capture.set_log(jsonl_path)
+    pytester.runpytest_inprocess(
+        "--sift-offline",
+        f"--sift-log-file={jsonl_path}",
+        "--no-sift-git-metadata",
+        # Disable pytest-randomly so the inner items run in definition order and
+        # ``test_sift_warmup`` precedes a marker-skipped ``test_x`` (see ``_WARMUP``).
+        # The flag is a no-op when pytest-randomly is not installed.
+        "-p",
+        "no:randomly",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Call-phase exit paths
+# ---------------------------------------------------------------------------
+
+
+def test_pass_maps_to_passed(inner):
+    # Case: CALL-01. A test body that finishes without raising records PASSED.
+    _run(
+        inner,
+        """
+        def test_x():
+            assert True
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.PASSED
+
+
+def test_assert_failure_maps_to_failed(inner):
+    # Case: CALL-02
+    _run(
+        inner,
+        """
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+    # error_info should hold the short assertion text shown in the UI, not
+    # the frames of a full traceback.
+    error_text = capture.final_error_message("test_x")
+    assert error_text is not None
+    assert "assert 1 == 2" in error_text
+    assert "Traceback (most recent call last)" not in error_text
+
+
+def test_generic_exception_maps_to_error(inner):
+    # Case: CALL-03. A non-assertion exception (here ValueError) records ERROR.
+    _run(
+        inner,
+        """
+        def test_x():
+            raise ValueError("boom")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ERROR
+
+
+def test_system_exit_maps_to_aborted(inner):
+    # Case: CALL-05. ``sys.exit`` raised from the test body records ABORTED.
+    _run(
+        inner,
+        """
+        import sys
+        def test_x():
+            sys.exit(1)
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.ABORTED
+
+
+def test_pytest_fail_maps_to_failed(inner):
+    # Case: CALL-04. An explicit ``pytest.fail(...)`` call records FAILED.
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.fail("intentional failure")
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_pytest_fail_if_step_failed_fails_without_error_info(inner):
+    # The out-of-bounds measurement marks the step failed, and
+    # step.pytest_fail_if_step_failed() then fails the test via pytest.fail, so the
+    # step is FAILED with no assertion message in error_info (the helper's purpose).
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="b", value=99.0, bounds={"min": 0.0, "max": 2.0})
+            step.pytest_fail_if_step_failed()
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+    assert capture.final_error_message("test_x") is None
+
+
+def test_pytest_fail_if_step_failed_fails_on_failed_substep(inner):
+    # report_outcome(..., False, ...) fails a substep without any out-of-bounds
+    # measurement on the step itself, yet the report still marks the step FAILED.
+    # pytest_fail_if_step_failed must therefore fail the test so both verdicts agree.
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.report_outcome("check", False, "deliberately failing")
+            step.pytest_fail_if_step_failed()
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_pytest_fail_if_step_failed_passes_when_in_bounds(inner):
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="a", value=1.0, bounds={"min": 0.0, "max": 2.0})
+            step.pytest_fail_if_step_failed()
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.PASSED  # helper must not fail here
+
+
+def test_keyboard_interrupt_resolves_step_to_aborted(inner):
+    # Case: CALL-06
+    # The interrupt ends the session before the call-phase makereport fires, so
+    # the plugin never observes it directly. With setup done and no call outcome
+    # seen, the step must resolve to ABORTED: not stay IN_PROGRESS (a finalized
+    # report should not carry a step that still reads as in-progress) and not
+    # be coerced to PASSED.
+    try:
+        _run(
+            inner,
+            """
+            def test_x():
+                raise KeyboardInterrupt
+            """,
+        )
+    except KeyboardInterrupt:
+        pass
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.ABORTED
+
+
+def test_substep_exception_records_error_with_failed_parent(inner):
+    # Case: CALL-07
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner"):
+                raise ValueError("boom")
+        """,
+    )
+    # ERROR belongs to the substep where the exception originated. The test
+    # step only sees that a child failed and resolves to FAILED, even though the
+    # same ValueError also propagated through its own scope.
+    child = next(iter(capture.steps_by_name("inner")), None)
+    parent = capture.test_step("test_x")
+    assert child is not None
+    assert parent is not None
+    assert child.statuses[-1] == TestStatus.ERROR
+    assert parent.statuses[-1] == TestStatus.FAILED
+
+
+def test_substep_assert_failure_records_message_with_failed(inner):
+    # Case: CALL-02 (substep). A substep inherits assertion_as_fail_not_error
+    # from the autouse step (False under pytest), so a failed assertion in a
+    # substep resolves to FAILED and records the concise assertion message.
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner"):
+                assert 1 == 2
+        """,
+    )
+    child = next(iter(capture.steps_by_name("inner")), None)
+    assert child is not None
+    assert child.statuses[-1] == TestStatus.FAILED
+    assert child.error_messages
+    last_error = child.error_messages[-1]
+    assert "assert 1 == 2" in last_error
+    assert "Traceback (most recent call last)" not in last_error
+
+
+# ---------------------------------------------------------------------------
+# Skip paths
+# ---------------------------------------------------------------------------
+
+
+def test_pytest_skip_in_body_maps_to_skipped(inner):
+    # Case: SKIP-03
+    _run(
+        inner,
+        """
+        import pytest
+        def test_x():
+            pytest.skip("not today")
+        """,
+    )
+    # A runtime skip raised in the body resolves the outer step to SKIPPED, and
+    # the makereport hook must not add a second, nested step for the same test.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.SKIPPED
+    extras = [found for found in capture.steps_by_name("test_x") if found is not recorded]
+    assert not extras, f"expected no duplicate nested step; got {len(extras)}"
+
+
+def test_pytest_mark_skip_records_skipped(inner):
+    # Case: SKIP-01
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skip(reason="collection-time skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # Marker skip: pytest raises the skip before fixture setup, so the autouse step
+    # fixture never runs. Only the makereport hook creates a step, with status SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_pytest_mark_skipif_records_skipped(inner):
+    # Case: SKIP-02. A `skipif` mark whose condition is true records SKIPPED.
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.skipif(True, reason="conditional skip")
+        def test_x():
+            assert False
+        """,
+    )
+    # `skipif` with a truthy condition follows the same path as
+    # `@pytest.mark.skip`; only the makereport hook records a step.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+def test_skip_inside_fixture_setup(inner):
+    # Case: SKIP-04
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def skipping_fixture():
+            pytest.skip("environment not ready")
+
+        def test_x(skipping_fixture):
+            assert True
+        """,
+    )
+    # A skip raised during fixture setup resolves the outer step to SKIPPED, and
+    # the makereport hook must not add a second, nested step for the same test.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.SKIPPED
+    extras = [found for found in capture.steps_by_name("test_x") if found is not recorded]
+    assert not extras, f"expected no duplicate nested step; got {len(extras)}"
+
+
+# ---------------------------------------------------------------------------
+# xfail / xpass
+# ---------------------------------------------------------------------------
+
+
+def test_xfail_marked_test_that_fails(inner):
+    # Case: XFAIL-01
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="known issue")
+        def test_x():
+            assert 1 == 2
+        """,
+    )
+    # An xfail-marked test that fails as expected fulfills the contract, so the
+    # outer step is PASSED; the makereport hook must not add a duplicate step.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.PASSED
+    extras = [found for found in capture.steps_by_name("test_x") if found is not recorded]
+    assert not extras, f"expected no duplicate nested step; got {len(extras)}"
+
+
+def test_xfail_strict_unexpected_pass(inner):
+    # Case: XFAIL-02
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(strict=True, reason="should fail")
+        def test_x():
+            assert True
+        """,
+    )
+    # A strict xfail that passes has to surface as FAILED: either the bug was
+    # fixed (remove the mark) or the test stopped exercising what it claimed.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.FAILED
+
+
+def test_xfail_non_strict_unexpected_pass(inner):
+    # Case: XFAIL-03
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(reason="might pass sometimes")
+        def test_x():
+            assert True
+        """,
+    )
+    # A non-strict xfail does not insist on the failure, so an unexpected pass
+    # is recorded as PASSED.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.PASSED
+
+
+def test_xfail_raises_mismatch(inner):
+    # Case: XFAIL-04
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(raises=ValueError, reason="expected ValueError")
+        def test_x():
+            raise KeyError("wrong exception")
+        """,
+    )
+    # A `raises=` mismatch is a real test failure: the mark required a specific
+    # exception type and a different one was thrown.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.FAILED
+
+
+def test_xfail_run_false(inner):
+    # Case: XFAIL-05. The test is marked ``xfail(run=False)``.
+    _run(
+        inner,
+        """
+        import pytest
+        @pytest.mark.xfail(run=False, reason="never run")
+        def test_x():
+            assert False
+        """,
+    )
+    # The test never ran; outer step is SKIPPED.
+    assert capture.final_status("test_x") == TestStatus.SKIPPED
+
+
+# ---------------------------------------------------------------------------
+# Setup-phase / teardown-phase fixture failures
+# ---------------------------------------------------------------------------
+
+
+def test_setup_phase_fixture_failure(inner):
+    # Case: PHASE-01
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_setup():
+            raise RuntimeError("setup boom")
+
+        def test_x(bad_setup):
+            assert True
+        """,
+    )
+    # The fixture raises during setup, so the test body never executes. The
+    # outer step must surface that as ERROR; a silently green step would hide
+    # the failure.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.ERROR
+
+
+def test_teardown_phase_fixture_failure(inner):
+    # Case: PHASE-02
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert True
+        """,
+    )
+    # The fixture raises after `yield`, which fails the teardown phase. The
+    # outer step must end as FAILED for the teardown failure instead of
+    # keeping the call-phase pass.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.FAILED
+
+
+def test_call_fail_plus_teardown_fail(inner):
+    # Case: PHASE-03
+    _run(
+        inner,
+        """
+        import pytest
+
+        @pytest.fixture
+        def bad_teardown():
+            yield
+            raise RuntimeError("teardown boom")
+
+        def test_x(bad_teardown):
+            assert 1 == 2
+        """,
+    )
+    # The call-phase failure dominates the outer step status. The contract also
+    # requires the teardown error to be surfaced somewhere on the step
+    # (mechanism TBD; see pass_fail_behavior.md), so only the status is asserted
+    # for now; tighten this once a surfacing mechanism is chosen.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.FAILED
+
+
+# ---------------------------------------------------------------------------
+# Collection-phase failures
+# ---------------------------------------------------------------------------
+
+
+def test_missing_fixture_maps_to_error(inner):
+    # Case: COLL-01
+    _run(
+        inner,
+        """
+        def test_x(nonexistent_fixture):
+            assert True
+        """,
+    )
+    # An unresolved fixture fails in the setup phase, so the test never
+    # executes. The outer step must surface as ERROR rather than a misleading
+    # green pass.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.ERROR
+
+
+# ---------------------------------------------------------------------------
+# Plugin-API exit paths (in-test mutations)
+# ---------------------------------------------------------------------------
+
+
+def test_manual_status_update_to_failed(inner):
+    # Case: API-01. A FAILED written via ``current_step.update`` must be the final status.
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            step.current_step.update({"status": TestStatus.FAILED})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_report_outcome_false_maps_to_failed(inner):
+    # Case: API-02. ``report_outcome`` called with a False result.
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.report_outcome("the_check", False, "did not match")
+        """,
+    )
+    # Outer step sees a failed substep and rolls up to FAILED.
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_measure_out_of_bounds_maps_to_failed(inner):
+    # Case: API-03. An out-of-bounds ``step.measure`` records FAILED with no assert.
+    _run(
+        inner,
+        """
+        def test_x(step):
+            step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    assert capture.final_status("test_x") == TestStatus.FAILED
+
+
+def test_substep_failure_propagates_to_parent(inner):
+    # Case: API-04
+    _run(
+        inner,
+        """
+        def test_x(step):
+            with step.substep(name="inner") as inner_step:
+                inner_step.measure(name="m", value=10.0, bounds={"min": 0.0, "max": 5.0})
+        """,
+    )
+    # `test_measure_out_of_bounds_maps_to_failed` covers a failed measurement
+    # on the function step itself; here the same failure sits on a nested
+    # substep and must propagate up to the parent.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.FAILED
+
+
+def test_skipped_substep_does_not_fail_parent(inner):
+    # Case: API-05
+    _run(
+        inner,
+        """
+        from sift_client.sift_types.test_report import TestStatus
+        def test_x(step):
+            with step.substep(name="optional_check") as cal:
+                cal.current_step.update(
+                    {"status": TestStatus.SKIPPED},
+                    log_file=step.report_context.log_file,
+                )
+        """,
+    )
+    # A substep manually resolved to SKIPPED must not count as a failure for
+    # its parent. The outer step has no measurements of its own and resolves
+    # to PASSED.
+    recorded = capture.test_step("test_x")
+    assert recorded is not None
+    assert recorded.statuses[-1] == TestStatus.PASSED
+
+
+def test_abort_inside_substep_marks_every_open_step_aborted(inner):
+    # Case: API-06
+    _run(
+        inner,
+        """
+        import sys
+        def test_x(step):
+            with step.substep(name="completed_sub"):
+                pass
+            with step.substep(name="outer_sub") as outer_sub:
+                with outer_sub.substep(name="inner_sub"):
+                    sys.exit(1)
+        """,
+    )
+    # SystemExit unwinds the substep stack on its way out. Each step still open
+    # when the abort fired (inner substep, outer substep, test step) must record
+    # ABORTED, while the sibling substep that closed cleanly beforehand must
+    # keep its PASSED status.
+    test_record = capture.test_step("test_x")
+    assert test_record is not None
+    assert test_record.statuses[-1] == TestStatus.ABORTED
+    enclosing = next(iter(capture.steps_by_name("outer_sub")), None)
+    innermost = next(iter(capture.steps_by_name("inner_sub")), None)
+    finished = next(iter(capture.steps_by_name("completed_sub")), None)
+    assert enclosing is not None
+    assert innermost is not None
+    assert finished is not None
+    assert enclosing.statuses[-1] == TestStatus.ABORTED
+    assert innermost.statuses[-1] == TestStatus.ABORTED
+    assert finished.statuses[-1] == TestStatus.PASSED
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py b/python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py
new file mode 100644
index 000000000..a4c723b47
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_report_fields.py
@@ -0,0 +1,272 @@
+"""Tests for [tool.sift.pytest.report] and the report-content env-var overrides.
+
+Report-content fields are configured under ``[tool.sift.pytest.report]`` in
+pyproject.toml and overridden per-run via ``SIFT_REPORT_*`` env vars. These
+tests drive offline-mode inner sessions and inspect the JSONL
+``CreateTestReport`` line, which serializes every report field with its proto
+type intact.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Callable
+
+from google.protobuf import json_format
+from sift.metadata.v1.metadata_pb2 import MetadataValue
+
+from sift_client.util.metadata import metadata_proto_to_dict
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+def _create_report_dict(log_text: str) -> dict:
+    """Decode the JSON object on the first ``[CreateTestReport:...]`` line of ``log_text``."""
+    tagged = [ln for ln in log_text.splitlines() if ln.startswith("[CreateTestReport:")]
+    if not tagged:
+        raise AssertionError(f"no CreateTestReport line in log:\n{log_text}")
+    return json.loads(tagged[0][tagged[0].index("{") :])
+
+
+def _metadata_pairs(report: dict) -> dict[str, str | float | bool]:
+    """Flatten the report's ``metadata`` entries into a plain ``{key: value}`` dict.
+
+    Every entry is parsed from its JSON form back into a ``MetadataValue``
+    proto, and the resulting list goes through ``metadata_proto_to_dict`` so
+    this helper never has to inspect the individual value slots itself.
+    """
+    entries = report.get("metadata", [])
+    return metadata_proto_to_dict([json_format.ParseDict(raw, MetadataValue()) for raw in entries])
+
+
+class TestReportFields:
+    def test_toml_resolves_every_field(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Each ``[tool.sift.pytest.report]`` field set in TOML lands in the logged report."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            test_case        = "case-from-toml"
+            test_system_name = "rig-7"
+            system_operator  = "ci-bot"
+            serial_number    = "SN-001"
+            part_number      = "PN-9000"
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        created = _create_report_dict(jsonl_log.read_text())
+        assert created["testCase"] == "case-from-toml"
+        assert created["testSystemName"] == "rig-7"
+        assert created["systemOperator"] == "ci-bot"
+        assert created["serialNumber"] == "SN-001"
+        assert created["partNumber"] == "PN-9000"
+
+    def test_test_case_template_renders(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Placeholders inside a ``test_case`` template are substituted before logging."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            test_case = "case-{rootdir}-{count}"
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        rendered = _create_report_dict(jsonl_log.read_text())["testCase"]
+        assert rendered.startswith("case-"), rendered
+        assert rendered.endswith("-1"), rendered
+
+    def test_default_target_single_test_is_function(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """With exactly one collected test, the default ``test_case`` is its nodeid.
+
+        The expected value is ``<rootdir name>/test_demo.py::test_one``: the
+        function's nodeid prefixed with the rootdir (project) directory name.
+        No target path is passed on the command line in this run.
+        """
+        write_plugin_conftest()
+        pytester.makepyfile(test_demo="def test_one(step): pass")
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        # pytester.path is the directory the inner session runs from.
+        expected = f"{pytester.path.name}/test_demo.py::test_one"
+        actual = _create_report_dict(jsonl_log.read_text())["testCase"]
+        assert actual == expected, actual
+
+    def test_default_target_single_test_strips_param(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """For a lone parametrized test the default key omits the ``[param]`` suffix."""
+        write_plugin_conftest()
+        # A single parameter value keeps the collected-item count at one.
+        module_src = (
+            "import pytest\n@pytest.mark.parametrize('v', [12])\ndef test_p(step, v): pass\n"
+        )
+        pytester.makepyfile(test_demo=module_src)
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        # pytester.path is the directory the inner session runs from.
+        expected = f"{pytester.path.name}/test_demo.py::test_p"
+        actual = _create_report_dict(jsonl_log.read_text())["testCase"]
+        assert actual == expected, actual
+
+    def test_default_target_single_file(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Two tests in one module default to ``<rootdir name>/<module file>``."""
+        write_plugin_conftest()
+        pytester.makepyfile(test_demo="def test_a(step): pass\ndef test_b(step): pass")
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=2)
+        actual = _create_report_dict(jsonl_log.read_text())["testCase"]
+        assert actual == f"{pytester.path.name}/test_demo.py", actual
+
+    def test_default_target_multiple_files_common_dir(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Tests spread over two files in ``suite/`` default to ``<rootdir name>/suite``."""
+        write_plugin_conftest()
+        suite_dir = pytester.mkdir("suite")
+        for stem in ("a", "b"):
+            (suite_dir / f"test_{stem}.py").write_text(f"def test_{stem}(step): pass\n")
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=2)
+        actual = _create_report_dict(jsonl_log.read_text())["testCase"]
+        assert actual == f"{pytester.path.name}/suite", actual
+
+    def test_default_target_whole_tree_is_project(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Modules sitting directly in the rootdir default to the bare rootdir name."""
+        write_plugin_conftest()
+        # Both modules live directly in pytester.path, so they share no deeper directory.
+        pytester.makepyfile(test_a="def test_a(step): pass", test_b="def test_b(step): pass")
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=2)
+        actual = _create_report_dict(jsonl_log.read_text())["testCase"]
+        assert actual == pytester.path.name, actual
+
+    def test_env_overrides_toml(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        monkeypatch: pytest.MonkeyPatch,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The ``SIFT_REPORT_SYSTEM_OPERATOR`` env var beats the TOML ``system_operator``."""
+        monkeypatch.setenv("SIFT_REPORT_SYSTEM_OPERATOR", "env-wins")
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            system_operator = "ci-bot"
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        created = _create_report_dict(jsonl_log.read_text())
+        assert created["systemOperator"] == "env-wins"
+
+    def test_metadata_table_typed_values(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Values under ``[tool.sift.pytest.report.metadata]`` keep their TOML types."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report.metadata]
+            build_id = "v1.2.3"
+            lane     = 2
+            verbose  = true
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        metadata = _metadata_pairs(_create_report_dict(jsonl_log.read_text()))
+        assert metadata.get("build_id") == "v1.2.3"
+        # Ints and floats share the proto's numeric slot, so compare by value.
+        assert metadata.get("lane") == 2
+        assert metadata.get("verbose") is True
+        # The auto-recorded ``pytest_command`` entry sits next to the user-supplied keys.
+        assert "pytest_command" in metadata
+
+    def test_loader_warns_on_bad_toml(
+        self,
+        tmp_path: Path,
+        recwarn: pytest.WarningsRecorder,
+    ) -> None:
+        """``load_tool_sift`` returns ``{}`` and warns when pyproject.toml cannot be parsed.
+
+        The loader is called directly with a stand-in config object instead of
+        through an inner pytest session: pytest itself aborts when its own
+        ``pyproject.toml`` is unparseable, so the loader's warning path is only
+        reachable when the loader finds the file on its own.
+        """
+        from types import SimpleNamespace
+
+        from sift_client._internal.pyproject_config import load_tool_sift
+
+        broken_toml = tmp_path / "pyproject.toml"
+        # The closing quote is missing, so the file is not valid TOML.
+        broken_toml.write_text('[tool.sift]\ngrpc_uri = "unterminated\n')
+        config_stub = SimpleNamespace(inipath=broken_toml, rootpath=tmp_path)
+        loaded = load_tool_sift(config_stub)  # type: ignore[arg-type]
+
+        assert loaded == {}
+        seen = [str(w.message) for w in recwarn.list]
+        assert any("[tool.sift]" in m and "Failed to read" in m for m in seen), seen
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_report_name.py b/python/lib/sift_client/_tests/pytest_plugin/test_report_name.py
new file mode 100644
index 000000000..5808c5a78
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_report_name.py
@@ -0,0 +1,120 @@
+"""Tests for report display-name templating.
+
+The report ``name`` is rendered from a template set under
+``[tool.sift.pytest.report] name`` and defaults to ``"{target} {timestamp}"``.
+The full pytest invocation is preserved on the report's metadata under
+``pytest_command``. These tests drive offline-mode inner sessions and inspect
+the JSONL ``CreateTestReport`` line for the rendered values.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+def _create_report_line(content: str) -> str:
+    """Pick out the first line of ``content`` that starts with ``[CreateTestReport:``."""
+    found = next((ln for ln in content.splitlines() if ln.startswith("[CreateTestReport:")), None)
+    if found is None:
+        raise AssertionError(f"no CreateTestReport line in log:\n{content}")
+    return found
+
+
+class TestReportName:
+    def test_toml_template(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A ``name`` template from ``[tool.sift.pytest.report]`` is rendered into the report."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            name = "TomlReport-{count}"
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        created_line = _create_report_line(jsonl_log.read_text())
+        assert '"name":"TomlReport-1"' in created_line, created_line
+
+    def test_full_command_preserved_in_metadata(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The logged report carries a ``pytest_command`` entry reflecting the invocation."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        created_line = _create_report_line(jsonl_log.read_text())
+        # The key must be present, and a flag from this run must appear on the same line.
+        assert '"pytest_command"' in created_line, created_line
+        assert "--sift-offline" in created_line, created_line
+
+    def test_git_placeholders_render_empty_outside_repo(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``{git_branch}`` is a known placeholder that renders as an empty string here.
+
+        The expected name ``R--1`` relies on the inner session's directory not
+        being inside a git work tree; the absence of the invalid-template
+        warning shows the placeholder was recognized rather than rejected.
+        """
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            name = "R-{git_branch}-{count}"
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        output = "\n".join(run.outlines + run.errlines)
+        assert "Invalid sift_report_name template" not in output, output
+        created_line = _create_report_line(jsonl_log.read_text())
+        assert '"name":"R--1"' in created_line, created_line
+
+    def test_invalid_template_falls_back_and_warns(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A template with an unknown placeholder warns, yet the run and report still succeed."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_one(step): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            name = "{nope}"
+            """
+        )
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        output = "\n".join(run.outlines + run.errlines)
+        assert "Invalid sift_report_name template" in output, output
+        # The helper raises if no CreateTestReport line was logged, so this call is the check.
+        _create_report_line(jsonl_log.read_text())
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py b/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
new file mode 100644
index 000000000..0bb46c76f
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py
@@ -0,0 +1,39 @@
+"""Guard rail that pins the docs settings table to the ``PLUGIN_OPTIONS`` registry.
+
+If you add or change a setting in ``lib/sift_client/pytest_plugin.py`` without
+regenerating the Markdown table in ``docs/guides/pytest_plugin/configuration.md``,
+this test fails with the up-to-date block to paste in.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import pytest
+
+
+# python/lib/sift_client/_tests/pytest_plugin/test_settings_reference.py -> python/
+_REPO_PYTHON_DIR = Path(__file__).resolve().parents[4]
+_DOCS_PATH = _REPO_PYTHON_DIR / "docs/guides/pytest_plugin/configuration.md"
+
+
+def test_settings_reference_docs_in_sync(pytestconfig: pytest.Config) -> None:
+    """The Markdown table under '## Settings reference' matches the registry verbatim."""
+    import pytest  # runtime import: the module-level one is TYPE_CHECKING-only
+
+    if not _DOCS_PATH.exists():
+        pytest.skip(f"{_DOCS_PATH} not present in this checkout")
+    from sift_client._internal.pytest_plugin.options import render_settings_reference
+
+    rendered = render_settings_reference()
+    # Decode as UTF-8 explicitly: Path.read_text() defaults to the locale encoding,
+    # which can fail or mis-decode non-ASCII Markdown on non-UTF-8 hosts.
+    content = _DOCS_PATH.read_text(encoding="utf-8")
+    if rendered not in content:
+        pytest.fail(
+            "Settings reference is out of sync with the PLUGIN_OPTIONS registry. Replace the "
+            "table under '## Settings reference' in "
+            "docs/guides/pytest_plugin/configuration.md with:\n\n" + rendered
+        )
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py b/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
new file mode 100644
index 000000000..0845f143b
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_terminal_output.py
@@ -0,0 +1,195 @@
+"""Tests for the plugin's terminal output (session header + report footer).
+
+Driven through inner pytester sessions. Online output is exercised by the
+``SiftClient.app_url`` unit tests (``_tests/test_urls.py``) since a live link
+needs a real backend; here we cover the deterministic disabled/offline footers
+and the ``-q`` suppression both share.
+"""
+
+from __future__ import annotations
+
+from collections import Counter
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Callable
+
+from sift_client._internal.low_level_wrappers._test_results_log import LogTracking
+from sift_client._internal.pytest_plugin.report import resolve_real_report_id
+from sift_client._internal.pytest_plugin.terminal import (
+    measurement_segments,
+    step_count_segments,
+)
+from sift_client.sift_types.test_report import TestStatus
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+class TestStepCountSegments:
+    def test_lists_nonzero_statuses_in_order_with_color(self) -> None:
+        # One (label, style) segment per counted status: passed, failed, skipped.
+        tally = Counter({TestStatus.PASSED: 4, TestStatus.FAILED: 2, TestStatus.SKIPPED: 1})
+        green, red, yellow = {"green": True}, {"red": True}, {"yellow": True}
+        expected = [("4 passed", green), ("2 failed", red), ("1 skipped", yellow)]
+        segments = step_count_segments(tally)
+        assert segments == expected
+
+    def test_error_and_aborted_are_red(self) -> None:
+        # ERROR and ABORTED are both expected to carry the red style.
+        red = {"red": True}
+        tally = Counter({TestStatus.ERROR: 1, TestStatus.ABORTED: 1})
+        expected = [("1 error", red), ("1 aborted", red)]
+        assert step_count_segments(tally) == expected
+
+    def test_empty_is_empty(self) -> None:
+        assert step_count_segments(Counter()) == []
+
+
+class TestMeasurementSegments:
+    def test_passed_green_failed_red(self) -> None:
+        # The ``True`` count is expected as "passed" (green), the ``False`` count as "failed" (red).
+        outcomes = Counter({True: 2, False: 1})
+        expected = [("2 passed", {"green": True}), ("1 failed", {"red": True})]
+        assert measurement_segments(outcomes) == expected
+
+    def test_empty_is_empty(self) -> None:
+        assert measurement_segments(Counter()) == []
+
+
+class TestResolveRealReportId:
+    """``resolve_real_report_id`` maps the footer to the real server report id."""
+
+    @staticmethod
+    def _context(report_id: str, simulated: bool, log_file: Path | None) -> SimpleNamespace:
+        # Stand-in context exposing only the attributes set below:
+        # ``report.id_``, ``report.is_simulated`` and ``log_file``.
+        report = SimpleNamespace(id_=report_id, is_simulated=simulated)
+        return SimpleNamespace(report=report, log_file=log_file)
+
+    def test_synchronous_online_uses_report_id_directly(self) -> None:
+        # No log file, non-simulated report (``--sift-log-file=false`` path):
+        # the id already on the report is expected back unchanged.
+        context = self._context("real-123", simulated=False, log_file=None)
+        assert resolve_real_report_id(context) == "real-123"
+
+    def test_incremental_resolves_via_sidecar(self, tmp_path: Path) -> None:
+        # A simulated id is expected to be translated through the id map that
+        # LogTracking saved for this log file.
+        log_file = tmp_path / "run.jsonl"
+        log_file.write_text("")
+        LogTracking(id_map={"sim-1": "real-1"}).save(log_file)
+        context = self._context("sim-1", simulated=True, log_file=log_file)
+        assert resolve_real_report_id(context) == "real-1"
+
+    def test_empty_report_id_returns_none(self) -> None:
+        # An unset/empty id must not produce a ``/test-results/`` link.
+        context = self._context("", simulated=False, log_file=None)
+        assert resolve_real_report_id(context) is None
+
+    def test_incremental_unmapped_returns_none(self, tmp_path: Path) -> None:
+        # Worker died before mapping the report: no sidecar entry. Only the
+        # empty log file is written; LogTracking.save() is never called here.
+        log_file = tmp_path / "run.jsonl"
+        log_file.write_text("")
+        context = self._context("sim-1", simulated=True, log_file=log_file)
+        assert resolve_real_report_id(context) is None
+        # (Contrast with the sidecar test above, which saves a mapping first.)
+
+
+class TestHeader:
+    def test_header_shows_version_and_mode(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A disabled-mode run prints a header line naming ``sift-stack-py`` and the mode."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        run.assert_outcomes(passed=1)
+        run.stdout.fnmatch_lines(["*sift-stack-py*disabled mode*"])
+
+    def test_header_suppressed_under_quiet(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """With ``-q`` added, no stdout line mentions ``sift-stack-py`` at all."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        run = pytester.runpytest_subprocess("--sift-disabled", "-q")
+        run.assert_outcomes(passed=1)
+        run.stdout.no_fnmatch_line("*sift-stack-py*")
+
+
+class TestDisabledFooter:
+    def test_footer_notes_no_report(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        run.assert_outcomes(passed=1)  # the measuring test itself still passes
+        run.stdout.fnmatch_lines(["*Sift disabled*no test report created*"])  # footer line
+
+    def test_footer_suppressed_under_quiet(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        run = pytester.runpytest_subprocess("--sift-disabled", "-q")
+        run.assert_outcomes(passed=1)  # same run as above, plus -q
+        run.stdout.no_fnmatch_line("*Sift disabled*")  # no footer line on stdout
+
+
+class TestOfflineFooter:
+    def test_footer_shows_log_path_and_replay_command(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """The offline footer names the log file and prints the command that replays it."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        jsonl_log = tmp_path / "run.jsonl"
+        run = pytester.runpytest_subprocess("--sift-offline", f"--sift-log-file={jsonl_log}")
+        run.assert_outcomes(passed=1)
+        # fnmatch_lines matches these patterns against stdout in the order listed.
+        footer_patterns = [
+            "*Test case*",
+            "*Status*offline*not uploaded*",
+            "*Steps*passed*",
+            "*Measurements*1 passed*",
+            "*System*",
+            f"*Log file*{jsonl_log}",
+            "*to upload to Sift*",
+            f"*import-test-result-log {jsonl_log}",
+        ]
+        run.stdout.fnmatch_lines(footer_patterns)
+
+    def test_sift_open_report_flag_is_accepted_offline(
+        self,
+        pytester: pytest.Pytester,
+        tmp_path: Path,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """An offline run given ``--sift-open-report`` still completes with the test passing."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(step): step.measure(name='v', value=1.0)")
+        jsonl_log = tmp_path / "run.jsonl"
+        flags = ("--sift-offline", f"--sift-log-file={jsonl_log}", "--sift-open-report")
+        run = pytester.runpytest_subprocess(*flags)
+        # Passing is the whole check: the flag must not break an offline session.
+        run.assert_outcomes(passed=1)
diff --git a/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py b/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
new file mode 100644
index 000000000..435170ed5
--- /dev/null
+++ b/python/lib/sift_client/_tests/pytest_plugin/test_typo_detector.py
@@ -0,0 +1,113 @@
+"""Tests for the unknown-setting warnings fired in ``pytest_configure``.
+
+The plugin scans ``SIFT_*`` env vars and ``[tool.sift.pytest.*]`` keys at
+session start and emits a ``SiftPytestPluginWarning`` for anything not
+declared in the central ``PLUGIN_OPTIONS`` registry. A typo (``SIFT_REPORT_SERIALNUM``
+instead of ``SIFT_REPORT_SERIAL_NUMBER``) would otherwise silently no-op.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+if TYPE_CHECKING:
+    import pytest
+
+
+class TestTypoDetector:
+    def test_unknown_env_var_warns(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A misspelled ``SIFT_*`` variable is reported together with the nearest valid name."""
+        monkeypatch.setenv("SIFT_REPORT_SERIALNUM", "SN-1")  # missing underscore
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(): pass")
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        output = "\n".join(run.outlines + run.errlines)
+        assert "Unknown SIFT_* env var `SIFT_REPORT_SERIALNUM`" in output, output
+        assert "did you mean `SIFT_REPORT_SERIAL_NUMBER`" in output, output
+
+    def test_known_env_var_silent(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        monkeypatch: pytest.MonkeyPatch,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A declared ``SIFT_*`` variable produces no unknown-variable warning."""
+        monkeypatch.setenv("SIFT_REPORT_SERIAL_NUMBER", "SN-1")
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)  # a run that never started would also print no warning
+        assert "Unknown SIFT_*" not in "\n".join(result.outlines + result.errlines)
+
+    def test_unknown_toml_key_warns(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """A misspelled ``[tool.sift.pytest.report]`` key is reported with a suggested fix."""
+        write_plugin_conftest()
+        pytester.makepyfile("def test_runs(): pass")
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report]
+            serial_numbr = "SN-1"
+            """
+        )
+        run = pytester.runpytest_subprocess("--sift-disabled")
+        output = "\n".join(run.outlines + run.errlines)
+        assert "Unknown sift config key" in output, output
+        assert "pytest.report.serial_numbr" in output, output
+        assert "did you mean" in output, output
+        assert "serial_number" in output, output
+
+    def test_unknown_toml_outside_pytest_scope_silent(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """``[tool.sift.X]`` outside ``tool.sift.pytest`` is not the plugin's concern.
+
+        Other Sift tools may use ``tool.sift.<other-subtree>`` (this repo's build-time
+        ``[tool.sift.extras]`` is one example), so the detector only walks ``tool.sift.pytest``.
+        The inner run must pass too, or the negative check below would hold vacuously.
+        """
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.something_else]
+            anything = "goes"
+            """
+        )
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)  # a run that never started would also print no warning
+        assert "Unknown sift config key" not in "\n".join(result.outlines + result.errlines)
+
+    def test_metadata_subtree_keys_are_user_defined(
+        self,
+        pytester: pytest.Pytester,
+        clear_sift_env: None,
+        write_plugin_conftest: Callable[[], None],
+    ) -> None:
+        """Keys under ``[tool.sift.pytest.report.metadata]`` don't trigger warnings."""
+        write_plugin_conftest()
+        pytester.makepyprojecttoml(
+            """
+            [tool.sift.pytest.report.metadata]
+            anything_at_all = "value"
+            another_thing   = 42
+            """
+        )
+        pytester.makepyfile("def test_runs(): pass")
+        result = pytester.runpytest_subprocess("--sift-disabled")
+        result.assert_outcomes(passed=1)  # a run that never started would also print no warning
+        assert "Unknown sift config key" not in "\n".join(result.outlines + result.errlines)
diff --git a/python/lib/sift_client/_tests/resources/test_test_results.py b/python/lib/sift_client/_tests/resources/test_test_results.py
index d0ccf4d1b..ce6d7707a 100644
--- a/python/lib/sift_client/_tests/resources/test_test_results.py
+++ b/python/lib/sift_client/_tests/resources/test_test_results.py
@@ -715,6 +715,75 @@ def test_import_log_file_round_trip(self, sift_client, nostromo_run, tmp_path):
             replayed_m = replayed_measurements_by_name[direct_m.name]
             compare_test_measurement_fields(replayed_m, direct_m)
 
+    def test_incremental_import_resumes_after_report_created(
+        self, sift_client, nostromo_run, tmp_path
+    ):
+        """Incremental replay must survive a resume after the report was created.
+
+        Regression: a resume tick rebuilds replay state from scratch, so the
+        CreateTestReport line (already uploaded on an earlier tick) is skipped and
+        the in-memory report is None. The replay must still apply the remaining
+        lines -- including the final UpdateTestReport -- rather than raising
+        "No CreateTestReport found" and leaving the report stuck IN_PROGRESS.
+        """
+        t0 = datetime.now(timezone.utc)
+        log_file = tmp_path / "incremental_resume.jsonl"
+
+        # Build a complete simulation log (no real resources created yet).
+        report = sift_client.test_results.create(
+            {
+                "status": TestStatus.IN_PROGRESS,
+                "name": "Incremental Resume Report",
+                "test_system_name": "IR System",
+                "test_case": "IR Case",
+                "start_time": t0,
+                "end_time": t0 + timedelta(seconds=30),
+                "run_id": nostromo_run.id_,
+            },
+            log_file=log_file,
+        )
+        step = sift_client.test_results.create_step(
+            TestStepCreate(
+                test_report_id=report.id_,
+                name="IR Step 1",
+                step_type=TestStepType.ACTION,
+                step_path="1",
+                status=TestStatus.IN_PROGRESS,
+                start_time=t0,
+                end_time=t0 + timedelta(seconds=10),
+            ),
+            log_file=log_file,
+        )
+        sift_client.test_results.update_step(
+            step,
+            {"status": TestStatus.FAILED},
+            log_file=log_file,
+        )
+        sift_client.test_results.update(
+            test_report=report,
+            update=TestReportUpdate(status=TestStatus.FAILED),
+            log_file=log_file,
+        )
+
+        all_lines = log_file.read_text().splitlines()
+        assert all_lines[0].startswith("[CreateTestReport:")
+
+        # First tick: only the CreateTestReport is present. This creates the real
+        # report and advances the tracking cursor past line 1.
+        log_file.write_text(all_lines[0] + "\n")
+        first = sift_client.test_results.import_log_file(log_file, incremental=True)
+        real_report_id = first.report.id_
+        assert real_report_id is not None
+
+        # Later tick: the rest of the log is now available. Resuming past the
+        # CreateTestReport line must not raise, and the final UpdateTestReport must
+        # land so the report ends FAILED rather than IN_PROGRESS.
+        log_file.write_text("\n".join(all_lines) + "\n")
+        sift_client.test_results.import_log_file(log_file, incremental=True)
+
+        refetched = sift_client.test_results.get(test_report_id=real_report_id)
+        assert refetched.status == TestStatus.FAILED
+
     @pytest.mark.asyncio
     async def test_malformed_log_line_skipped(self, tmp_path):
         """Malformed lines raise a ValueError during iteration."""
diff --git a/python/lib/sift_client/_tests/test_urls.py b/python/lib/sift_client/_tests/test_urls.py
new file mode 100644
index 000000000..be9febd52
--- /dev/null
+++ b/python/lib/sift_client/_tests/test_urls.py
@@ -0,0 +1,74 @@
+"""Tests for web-app URL derivation (``_internal/urls.py`` and ``SiftClient.app_url``)."""
+
+from __future__ import annotations
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.urls import frontend_origin_for_api
+
+
+class TestFrontendOriginForApi:
+    @pytest.mark.parametrize(
+        ("api_base_url", "expected"),
+        [
+            ("https://api.siftstack.com", "https://app.siftstack.com"),
+            ("https://gov.api.siftstack.com", "https://gov.siftstack.com"),
+            # Bare host (no scheme) resolves the same as the full URL.
+            ("api.siftstack.com", "https://app.siftstack.com"),
+        ],
+    )
+    def test_known_hosts(self, api_base_url: str, expected: str) -> None:
+        assert frontend_origin_for_api(api_base_url) == expected
+
+    def test_unknown_host_returns_none(self) -> None:
+        assert frontend_origin_for_api("https://api.acme.example.com") is None
+
+    def test_empty_returns_none(self) -> None:
+        assert frontend_origin_for_api("") is None
+
+    def test_override_wins_over_derivation(self) -> None:
+        # Override applies even for a known host.
+        assert (
+            frontend_origin_for_api("https://api.siftstack.com", override="https://app.acme.test")
+            == "https://app.acme.test"
+        )
+
+    def test_override_normalizes_bare_host(self) -> None:
+        assert (
+            frontend_origin_for_api("https://api.acme.example.com", override="sift.acme.test")
+            == "https://sift.acme.test"
+        )
+
+
+class TestSiftClientAppUrl:
+    def _client(self, rest_url: str, app_url: str | None = None) -> SiftClient:
+        return SiftClient(
+            connection_config=SiftConnectionConfig(
+                api_key="k",
+                grpc_url="grpc-api.siftstack.com:443",
+                rest_url=rest_url,
+            ),
+            app_url=app_url,
+        )
+
+    def test_derives_from_known_rest_host(self) -> None:
+        assert self._client("https://api.siftstack.com").app_url == "https://app.siftstack.com"
+
+    def test_unknown_host_without_override_is_none(self) -> None:
+        assert self._client("https://api.acme.example.com").app_url is None
+
+    def test_override_used_for_unknown_host(self) -> None:
+        client = self._client("https://api.acme.example.com", app_url="https://sift.acme.test")
+        assert client.app_url == "https://sift.acme.test"
+
+    def test_override_from_connection_config(self) -> None:
+        client = SiftClient(
+            connection_config=SiftConnectionConfig(
+                api_key="k",
+                grpc_url="grpc-api.siftstack.com:443",
+                rest_url="https://api.acme.example.com",
+                app_url="https://sift.acme.test",
+            )
+        )
+        assert client.app_url == "https://sift.acme.test"
diff --git a/python/lib/sift_client/_tests/util/conftest.py b/python/lib/sift_client/_tests/util/conftest.py
index 45279cca6..9e255da8a 100644
--- a/python/lib/sift_client/_tests/util/conftest.py
+++ b/python/lib/sift_client/_tests/util/conftest.py
@@ -1,14 +1,35 @@
-import pytest
+from pathlib import Path
 
+import pytest
 
-def pytest_addoption(parser: pytest.Parser) -> None:
-    existing_options = [opt.names() for opt in parser._anonymous.options]
-    # Flatten the list of lists into a single list of strings
-    flat_options = [item for sublist in existing_options for item in sublist]
-    if not any("--sift-test-results-log-file" in name for name in flat_options):
-        parser.addoption("--sift-test-results-log-file", action="store_true", default=False)
+_HERE = Path(__file__).parent
 
 
 def pytest_configure(config: pytest.Config) -> None:
     """Configure the pytest configuration to disable the Sift test results log file."""
-    config.option.sift_test_results_log_file = False
+    config.option.sift_log_file = False
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: "list[pytest.Item]") -> None:
+    """Bulk-apply ``@pytest.mark.sift_include`` to integration tests under util/.
+
+    The project-wide default in ``pyproject.toml`` is ``sift_autouse = false``
+    so unit tests pay nothing for the globally-loaded Sift plugin.
+    Integration tests in this subtree still need the autouse fixtures, so this
+    hook flips the gate back on for any test already marked
+    ``@pytest.mark.integration``. Unit tests in the same directory (e.g.
+    ``test_cel_utils.py``) are left alone.
+
+    ``pytest_collection_modifyitems`` receives all items in the session (pytest
+    does not auto-scope it to the conftest's directory), so we filter by path
+    explicitly. ``Path.relative_to`` is the 3.8-compatible form of the path
+    containment check (``Path.is_relative_to`` arrived in 3.9).
+    """
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)  # raises ValueError when not under _HERE
+        except ValueError:
+            continue
+        if item.get_closest_marker("integration") is None:
+            continue  # unit test in this directory: leave the gate at its default
+        item.add_marker(pytest.mark.sift_include)
diff --git a/python/lib/sift_client/_tests/util/test_report_context.py b/python/lib/sift_client/_tests/util/test_report_context.py
new file mode 100644
index 000000000..73d738a7d
--- /dev/null
+++ b/python/lib/sift_client/_tests/util/test_report_context.py
@@ -0,0 +1,101 @@
+"""Tier 1 tests for `ReportContext.__exit__`'s replay-worker handling.
+
+Each test substitutes the `import-test-result-log` argv with a tiny Python
+`-c` invocation that produces a controlled end-state (clean exit / hang /
+non-zero exit), then enters and exits a `ReportContext` against a
+simulate-mode `SiftClient`. This validates that real subprocess outcomes
+route to the right branch of `__exit__` without depending on the real
+replay binary or a Sift backend.
+"""
+
+from __future__ import annotations
+
+import sys
+import warnings
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client.errors import SiftWarning
+from sift_client.util.test_results import ReportContext
+
+
+def _make_simulate_client() -> SiftClient:
+    """Build a SiftClient flagged for in-process simulation.
+
+    Constructor URLs are placeholders; nothing dials them because every
+    test-results write short-circuits through the simulate path.
+    """
+    client = SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key="test",
+            grpc_url="test.invalid:0",
+            rest_url="http://test.invalid",
+        )
+    )
+    client._simulate = True
+    return client
+
+
+def _make_context(command: list[str], *, timeout: float = 0.5) -> ReportContext:
+    """Build a ReportContext whose replay subprocess is the provided command.
+
+    ``log_file=True`` triggers the temp-file path so ``_open_import_proc`` fires
+    on ``__enter__``. The substitute argv is swapped in via the public-ish
+    ``_build_replay_command`` hook so the production Popen kwargs stay
+    exercised. ``timeout`` overrides the worker grace window so tests don't
+    wait the full production timeout for the timeout branch to trigger.
+    """
+    rc = ReportContext(_make_simulate_client(), name="test", log_file=True)
+    rc._build_replay_command = lambda: command  # type: ignore[method-assign]
+    rc._import_proc_timeout = timeout
+    return rc
+
+
+def test_worker_clean_exit_is_silent() -> None:
+    """Worker exits with code 0 → __exit__ emits no SiftWarning (case 1)."""
+    rc = _make_context([sys.executable, "-c", "pass"])
+    with warnings.catch_warnings(record=True) as recorded:
+        warnings.simplefilter("always")
+        with rc:
+            pass
+    sift_warnings = [w for w in recorded if issubclass(w.category, SiftWarning)]
+    assert sift_warnings == []
+    assert rc._import_proc is not None
+    assert rc._import_proc.returncode == 0
+
+
+def test_worker_timeout_kills_and_warns() -> None:
+    """Worker still running at session end → kill + SiftWarning, no raise (case 2)."""
+    rc = _make_context([sys.executable, "-c", "import time; time.sleep(30)"], timeout=0.2)
+    with pytest.warns(SiftWarning) as recorded:
+        with rc:
+            pass
+    assert rc._import_proc is not None
+    # `kill()` + `wait()` were called; process is dead.
+    assert rc._import_proc.poll() is not None
+    messages = "\n".join(str(w.message) for w in recorded)
+    assert "did not exit in 0.2s" in messages
+    # Recovery must resume from the tracking cursor, not batch-replay (which would
+    # duplicate already-uploaded entries), so the hint carries --incremental.
+    assert "import-test-result-log --incremental" in messages
+
+
+def test_worker_nonzero_exit_warns_stderr_no_raise() -> None:
+    """Worker exits non-zero with stderr → SiftWarning with stderr + replay hint, no raise (case 3)."""
+    rc = _make_context(
+        [
+            sys.executable,
+            "-c",
+            "import sys; sys.stderr.write('rpc deadline exceeded'); sys.exit(2)",
+        ]
+    )
+    with pytest.warns(SiftWarning) as recorded:
+        with rc:
+            pass
+    assert rc._import_proc is not None
+    assert rc._import_proc.returncode == 2
+    messages = "\n".join(str(w.message) for w in recorded)
+    assert "exited with code 2" in messages
+    assert "rpc deadline exceeded" in messages
+    assert "import-test-result-log --incremental" in messages
diff --git a/python/lib/sift_client/_tests/util/test_test_results_utils.py b/python/lib/sift_client/_tests/util/test_test_results_utils.py
index 256803769..c41587314 100644
--- a/python/lib/sift_client/_tests/util/test_test_results_utils.py
+++ b/python/lib/sift_client/_tests/util/test_test_results_utils.py
@@ -385,6 +385,46 @@ def test_report_outcome(self, report_context, step):
         if not initial_any_failures:
             report_context.any_failures = False
 
+    def test_measurements_passed_property(self, report_context, step):
+        """``step.measurements_passed`` counts only direct ``measure*`` calls
+        on this step, and stays True when only a substep or ``report_outcome``
+        records a failure.
+        """
+        current_step_path = step.current_step.step_path
+        initial_open_step_result = report_context.open_step_results.get(current_step_path, True)
+        initial_any_failures = report_context.any_failures
+
+        # No measurements yet, vacuously True.
+        assert step.measurements_passed is True
+
+        # In-bounds measurement keeps it True.
+        step.measure(name="ok", value=1.0, bounds={"min": 0.0, "max": 2.0})
+        assert step.measurements_passed is True
+
+        # A failing report_outcome doesn't flip measurements_passed because
+        # it isn't a direct measure() call on this step.
+        step.report_outcome("substep-fail", False, "deliberately failing")
+        assert step.measurements_passed is True
+
+        # Out-of-bounds measurement flips ``measurements_passed`` False.
+        step.measure(name="bad", value=99.0, bounds={"min": 0.0, "max": 2.0})
+        assert step.measurements_passed is False
+
+        # measure_avg / measure_all go through ``measure`` internally. NOTE(review):
+        # the flag is already False from "bad" above, so this can't isolate measure_avg.
+        step.measure_avg(
+            name="bad_avg",
+            values=[50.0, 60.0, 70.0],  # mean 60 is well outside [0, 2]
+            bounds={"min": 0.0, "max": 2.0},
+        )
+        assert step.measurements_passed is False
+
+        # Restore state.
+        if initial_open_step_result:
+            report_context.open_step_results[current_step_path] = True
+        if not initial_any_failures:
+            report_context.any_failures = False
+
     def test_bad_assert(self, report_context, step):
         # Capture current state of report context's failures so we can keep things passed at a high level if the test's induced failures happen as expected.
         current_step_path = step.current_step.step_path
@@ -423,7 +463,11 @@ def test_bad_assert(self, report_context, step):
         assert parent_step.status == TestStatus.FAILED
         assert substep.status == TestStatus.FAILED
         assert nested_substep.status == TestStatus.FAILED
-        assert nested_substep.error_info is None
+        # The assertion-as-fail path records the concise assertion message (no
+        # traceback frames) on error_info while keeping the FAILED status.
+        assert nested_substep.error_info is not None
+        assert "AssertionError" in nested_substep.error_info.error_message
+        assert "Traceback (most recent call last)" not in nested_substep.error_info.error_message
         assert nested_substep_2.status == TestStatus.ERROR
         assert "AssertionError" in nested_substep_2.error_info.error_message
         assert sibling_substep.status == TestStatus.PASSED
diff --git a/python/lib/sift_client/client.py b/python/lib/sift_client/client.py
index 95fd25b71..d77aff6c0 100644
--- a/python/lib/sift_client/client.py
+++ b/python/lib/sift_client/client.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from sift_client._internal.urls import frontend_origin_for_api
 from sift_client.resources import (
     AssetsAPI,
     AssetsAPIAsync,
@@ -124,6 +125,7 @@ def __init__(
         grpc_url: str | None = None,
         rest_url: str | None = None,
         connection_config: SiftConnectionConfig | None = None,
+        app_url: str | None = None,
     ):
         """Initialize the SiftClient with specific connection parameters or a connection_config.
 
@@ -132,6 +134,10 @@ def __init__(
             grpc_url: The Sift gRPC API URL.
             rest_url: The Sift REST API URL.
             connection_config: A SiftConnectionConfig object to configure the connection behavior of the SiftClient.
+            app_url: The Sift web-app origin (e.g. ``https://app.siftstack.com``).
+                Set this for on-prem or custom deployments whose API host can't be
+                mapped to a frontend automatically; see the ``app_url`` property.
+                A value here takes precedence over ``connection_config.app_url``.
         """
         if not (api_key and grpc_url and rest_url) and not connection_config:
             raise ValueError(
@@ -152,6 +158,17 @@ def __init__(
         WithGrpcClient.__init__(self, grpc_client=grpc_client)
         WithRestClient.__init__(self, rest_client=rest_client)
 
+        # Explicit web-app origin override; falls back to the connection config's
+        # value, then to host-based derivation in the ``app_url`` property.
+        self._app_url: str | None = app_url or (
+            connection_config.app_url if connection_config else None
+        )
+
+        # When set, test-results writes return synthesized responses without
+        # contacting Sift. Read by `TestResultsAPIAsync._simulate`. Used by the
+        # pytest plugin's ``--sift-disabled`` mode.
+        self._simulate: bool = False
+
         self.ping = PingAPI(self)
         self.assets = AssetsAPI(self)
         self.calculated_channels = CalculatedChannelsAPI(self)
@@ -193,3 +210,18 @@ def grpc_client(self) -> GrpcClient:
     def rest_client(self) -> RestClient:
         """The REST client used by the SiftClient for making REST API calls."""
         return self._rest_client
+
+    @property
+    def app_url(self) -> str | None:
+        """The Sift web-app origin for this client, or None if it can't be determined.
+
+        Uses the explicit override when one was given at construction (the
+        ``app_url`` argument, else ``connection_config.app_url``); otherwise
+        derives the origin from the REST host for known Sift deployments (e.g.
+        ``https://api.siftstack.com`` -> ``https://app.siftstack.com``). Returns
+        None for unrecognized hosts with no override.
+        """
+        # TODO: Add a ``WithAppPage`` mixin on BaseType so resources (TestReport,
+        # Run, ...) can expose their own web-app link from ``_client.app_url`` plus
+        # a per-type path, instead of callers assembling paths by hand.
+        return frontend_origin_for_api(self.rest_client.base_url, override=self._app_url)
diff --git a/python/lib/sift_client/pytest_plugin.py b/python/lib/sift_client/pytest_plugin.py
new file mode 100644
index 000000000..7e4c3c120
--- /dev/null
+++ b/python/lib/sift_client/pytest_plugin.py
@@ -0,0 +1,475 @@
+"""Sift pytest plugin: records each test as a step in a Sift test report.
+
+Load it from a project's ``conftest.py``::
+
+    pytest_plugins = ["sift_client.pytest_plugin"]
+
+This module holds only the plugin's public surface: the catchable warnings,
+the session-state globals a conftest may read, the fixtures a project can
+request or override, and pytest's hook entry points. The implementation
+(settings registry, step stacks, report construction, terminal formatting)
+lives under ``sift_client._internal.pytest_plugin``.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any, Generator
+
+import pytest
+
+from sift_client import SiftClient, SiftConnectionConfig
+from sift_client._internal.pytest_plugin.modes import (
+    gate_enabled,
+    is_disabled,
+    is_offline,
+    mode_label,
+    sdk_version,
+)
+from sift_client._internal.pytest_plugin.options import (
+    API_KEY_OPTION,
+    APP_URL_OPTION,
+    GRPC_URI_OPTION,
+    OPEN_OPTION,
+    REST_URI_OPTION,
+    register_options,
+    warn_on_unknown_env_vars,
+    warn_on_unknown_toml_keys,
+)
+from sift_client._internal.pytest_plugin.report import (
+    OFFLINE_DEFAULTS,
+    build_disabled_client,
+    finalize_after_teardown,
+    report_context_impl,
+    resolve_report_link,
+    step_impl,
+)
+from sift_client._internal.pytest_plugin.steps import (
+    build_hierarchy_chain,
+    build_parametrize_path,
+    finalize_parents,
+    get_or_create_parent_chain,
+    hierarchy_key,
+    parametrize_path_key,
+    release_finished_leaf,
+    resolve_parent_chain_in_context,
+    tally_expected_parents,
+)
+from sift_client._internal.pytest_plugin.terminal import (
+    maybe_open_report,
+    write_disabled_summary,
+    write_report_summary,
+)
+from sift_client.errors import SiftWarning
+from sift_client.sift_types.test_report import TestStatus
+from sift_client.util.test_results import ReportContext
+from sift_client.util.test_results.context_manager import NewStep
+
+__all__ = [
+    "REPORT_CONTEXT",
+    "SIFT_REPORT_ID_STASH_KEY",
+    "SIFT_REPORT_URL_STASH_KEY",
+    "NewStep",
+    "ReportContext",
+    "SiftPytestPluginWarning",
+    "SiftPytestStepDrainWarning",
+    "client_has_connection",
+    "report_context",
+    "sift_client",
+    "step",
+]
+
+
+# ---------------------------------------------------------------------------
+# Public warnings.
+# ---------------------------------------------------------------------------
+
+
+class SiftPytestPluginWarning(SiftWarning):
+    """Base warning for issues raised by the Sift pytest plugin."""
+
+
+class SiftPytestStepDrainWarning(SiftPytestPluginWarning):
+    """A parent step's ``__exit__`` raised while the plugin was closing it.
+
+    Surfaced when a parent step is closed (early as its subtree finishes, or at
+    session end) so the close can continue and pytest test outcomes stay
+    unaffected; the underlying exception is included in the message for debugging.
+    """
+
+
+# ---------------------------------------------------------------------------
+# Public session state and stash keys.
+# ---------------------------------------------------------------------------
+
+REPORT_CONTEXT: Any = None
+
+# Set at session end with the resolved (real) report id/URL when online and
+# uploaded. Read from a project's conftest in a later hook (e.g.
+# ``pytest_unconfigure``) to post the link, write a file, etc.
+SIFT_REPORT_ID_STASH_KEY = pytest.StashKey[str]()
+SIFT_REPORT_URL_STASH_KEY = pytest.StashKey[str]()
+
+
+# ---------------------------------------------------------------------------
+# Fixtures.
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def sift_client(pytestconfig: pytest.Config) -> SiftClient:
+    """Default ``SiftClient`` resolved from environment variables and ini keys.
+
+    Each credential is read from its environment variable first. The URIs
+    (``SIFT_GRPC_URI``, ``SIFT_REST_URI``) additionally fall back to the
+    ``sift_grpc_uri`` / ``sift_rest_uri`` ini keys, since they are stable
+    per-org values that are safe to commit. ``SIFT_API_KEY`` is intentionally
+    env-only; use ``pytest-dotenv`` (already a project dependency) to load
+    it from a ``.env`` file kept out of version control.
+
+    Projects that need custom construction (TLS toggles, custom timeouts,
+    etc.) can override this fixture by defining their own ``sift_client``
+    in their ``conftest.py``; pytest fixture resolution prefers the local
+    definition.
+
+    In ``--sift-offline`` mode the missing-credential check is relaxed:
+    real env vars and ini values still win when set (so the client is
+    constructible against a real backend even though no calls are made), but
+    anything still missing is filled with a placeholder. In ``--sift-disabled``
+    mode the credential resolution is skipped entirely and placeholders are
+    always used.
+    """
+    if is_disabled(pytestconfig):
+        return build_disabled_client()
+    resolved = {
+        "SIFT_API_KEY": API_KEY_OPTION.resolve(pytestconfig),
+        "SIFT_GRPC_URI": GRPC_URI_OPTION.resolve(pytestconfig),
+        "SIFT_REST_URI": REST_URI_OPTION.resolve(pytestconfig),
+    }
+    missing = [env for env, value in resolved.items() if not value]
+    if missing and not is_offline(pytestconfig):
+        raise pytest.UsageError(
+            "Sift credentials missing: "
+            + ", ".join(missing)
+            + ". Set the environment variable(s) (pytest-dotenv loads them "
+            "from a `.env` file automatically), or set the URIs under "
+            "`sift_grpc_uri` / `sift_rest_uri` in `[tool.pytest.ini_options]` "
+            "in pyproject.toml, or override the sift_client fixture in your "
+            "conftest.py, or pass --sift-offline / --sift-disabled to run "
+            "without contacting Sift."
+        )
+    for env in missing:
+        resolved[env] = OFFLINE_DEFAULTS[env]
+    # Web-app origin for the report link: the SIFT_APP_URL env var wins, then the
+    # sift_app_url ini key, else host-based derivation in SiftClient.app_url.
+    app_url = APP_URL_OPTION.resolve(pytestconfig)
+    return SiftClient(
+        connection_config=SiftConnectionConfig(
+            api_key=resolved["SIFT_API_KEY"] or "",
+            grpc_url=resolved["SIFT_GRPC_URI"] or "",
+            rest_url=resolved["SIFT_REST_URI"] or "",
+            app_url=app_url or None,
+        )
+    )
+
+
+@pytest.fixture(scope="session")
+def client_has_connection(pytestconfig: pytest.Config, request: pytest.FixtureRequest) -> bool:
+    """Verify the ``SiftClient`` can reach Sift via ``/ping``.
+
+    Consulted by ``report_context`` in online mode when the context is first
+    created. Any exception raised by the ping propagates; ``report_context``
+    turns a ping failure into a ``pytest.exit`` that aborts the session. Override
+    this fixture in your conftest to use a different reachability signal (e.g. a
+    cached auth token) where pinging is the wrong check. Returns ``False`` in
+    ``--sift-disabled`` mode without constructing a client.
+    """
+    if is_disabled(pytestconfig):
+        return False
+    sift_client = request.getfixturevalue("sift_client")
+    sift_client.ping.ping()
+    return True
+
+
+def _set_report_context(
+    contexts: Generator[ReportContext, None, None],
+) -> Generator[ReportContext, None, None]:
+    """Publish each yielded ReportContext to the module-level ``REPORT_CONTEXT``.
+
+    ``report_context_impl`` stays pure: it builds and yields the context.
+    Ownership of the reassignable global lives here so the terminal-summary and
+    makereport hooks (which read ``REPORT_CONTEXT``) see it. The global is set
+    after the context opens and before tests run, then the impl's ``finally``
+    still drains the step stacks before the context exits.
+    """
+    global REPORT_CONTEXT
+    for context in contexts:
+        REPORT_CONTEXT = context
+        yield context
+
+
+@pytest.fixture(scope="session")
+def report_context(
+    request: pytest.FixtureRequest, pytestconfig: pytest.Config
+) -> Generator[ReportContext, None, None]:
+    """Lazy session-scoped Sift ReportContext.
+
+    The fixture is not autouse; it's instantiated on the first call
+    to ``request.getfixturevalue("report_context")``, which today happens
+    inside the gated ``step`` and ``_sift_parents`` fixtures. If every test in
+    the session is excluded via the marker gate, this fixture is never resolved
+    and no ReportContext (or teardown subprocess) is created.
+
+    What gets yielded depends on the mode:
+
+    * ``--sift-disabled``: a real ``ReportContext`` against a placeholder
+      ``SiftClient`` with ``_simulate=True``. Every test-results write
+      returns a synthesized response without contacting Sift; no log file
+      is written; the replay subprocess never spawns. Test code that calls
+      ``step.measure(...)`` keeps working because bounds are evaluated as
+      usual and routed through the simulate path.
+    * ``--sift-offline``: a real ReportContext, but the session-start ping
+      is skipped, all create/update calls go to the JSONL log file, and
+      the import-test-result-log replay subprocess is not spawned at
+      session end.
+    * default (online): verify connectivity via ``client_has_connection``
+      before constructing the context. A failed ping aborts the session
+      with ``pytest.exit`` and points at ``--sift-offline`` and
+      ``--sift-disabled`` as escape hatches.
+
+    The log-file destination is controlled by
+    ``--sift-log-file``; defaults to a temp file when unset.
+    """
+    if is_disabled(pytestconfig):
+        yield from _set_report_context(
+            report_context_impl(build_disabled_client(), request, pytestconfig=pytestconfig)
+        )
+        return
+    sift_client = request.getfixturevalue("sift_client")
+    if not is_offline(pytestconfig):
+        try:
+            request.getfixturevalue("client_has_connection")
+        except pytest.UsageError:
+            raise
+        except Exception as exc:
+            grpc_config = getattr(getattr(sift_client, "grpc_client", None), "_config", None)
+            grpc_url = getattr(grpc_config, "uri", "<unknown>")
+            pytest.exit(
+                f"Sift ping failed against {grpc_url}: {exc}. "
+                "Pass --sift-offline to run without contacting Sift, or "
+                "--sift-disabled to skip Sift entirely.",
+                returncode=pytest.ExitCode.USAGE_ERROR,  # == 4
+            )
+    yield from _set_report_context(
+        report_context_impl(sift_client, request, pytestconfig=pytestconfig)
+    )
+
+
+@pytest.fixture(autouse=True)
+def step(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+    _sift_parents: None,
+) -> Generator[NewStep | None, None, None]:
+    """Create an outer step for the function when the Sift gate is on.
+
+    Resolves the gate via `gate_enabled`: the `sift_exclude` marker forces off,
+    `sift_include` forces on, otherwise the `sift_autouse` ini default applies.
+    When on, requests the session `report_context` lazily; the first gated test
+    in the session triggers its creation, subsequent gated tests reuse it. In
+    ``--sift-disabled`` mode the report context is backed by a placeholder
+    ``SiftClient`` whose ``_simulate`` flag is set, so every write returns a
+    synthesized response without contacting Sift.
+    """
+    if not gate_enabled(request.node, pytestconfig):
+        yield None
+        return
+    rc = request.getfixturevalue("report_context")
+    yield from step_impl(rc, request)
+
+
+@pytest.fixture(autouse=True)
+def _sift_parents(
+    request: pytest.FixtureRequest,
+    pytestconfig: pytest.Config,
+) -> None:
+    """Get or create the report-tree parent chain for the current test item.
+
+    Calls ``get_or_create_parent_chain`` for the item's hierarchy (packages /
+    modules / classes) and parametrize parents, then stores the innermost one
+    on the node as ``_sift_parent`` so the ``step`` fixture can nest the leaf
+    under it. Parents are keyed by identity and shared by sibling items in any
+    order, so test items never need reordering.
+
+    Does nothing for gated-off items, so an excluded item never forces an eager
+    ``report_context`` (it stays lazily created by the first gated test).
+    """
+    node = request.node
+    if gate_enabled(node, pytestconfig):
+        node._sift_parent = get_or_create_parent_chain(node, pytestconfig, request)
+
+
+# ---------------------------------------------------------------------------
+# Hooks (in lifecycle fire order).
+# ---------------------------------------------------------------------------
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Pytest hook: add each ``PLUGIN_OPTIONS`` CLI flag and ini key via ``register_options``."""
+    register_options(parser)
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register the two Sift gate markers and warn about unknown ``SIFT_*`` settings."""
+    # One "markers" ini line per gate marker, written as ``name: description``.
+    # Both literals are kept whole so they stay greppable.
+    marker_lines = (
+        "sift_include: force the Sift autouse fixtures to activate for this test "
+        "regardless of the `sift_autouse` ini default.",
+        "sift_exclude: force the Sift autouse fixtures to skip this test "
+        "regardless of the `sift_autouse` ini default.",
+    )
+    for marker_line in marker_lines:
+        config.addinivalue_line("markers", marker_line)
+    # Check env vars and [tool.sift...] keys at session start: a typo would
+    # otherwise be a silent no-op (an env var nothing reads, a table key the
+    # loader ignores). The registry is the source of truth for which names
+    # are known.
+    warn_on_unknown_env_vars()
+    warn_on_unknown_toml_keys(config)
+
+
+def pytest_itemcollected(item: pytest.Item) -> None:
+    """Cache each test item's hierarchy chain and parametrize path at collection.
+
+    Writes ``build_hierarchy_chain(item, item.config)`` to ``hierarchy_key`` and
+    ``build_parametrize_path(item)`` to ``parametrize_path_key`` in ``item.stash``.
+    This is a per-item hook, not ``pytest_collection_modifyitems``: the plugin
+    never touches the ``items`` list or its order. The report tree is built from
+    an identity-keyed registry (see ``get_or_create_parent_chain``), so item order
+    is irrelevant to nesting; ``pytest-randomly``, ``pytest-ordering``, and
+    pytest's own fixture-scope reordering are all preserved untouched.
+
+    The stash is a cache the autouse fixtures read back; both keys have an
+    on-demand recompute fallback, so an item a later hook injects without going
+    through this hook still resolves correctly.
+    """
+    item.stash[hierarchy_key] = build_hierarchy_chain(item, item.config)
+    item.stash[parametrize_path_key] = build_parametrize_path(item)
+
+
+def pytest_collection_finish(session: pytest.Session) -> None:
+    """Tally each parent's descendant leaves so parents can close mid-session.
+
+    Passes ``session`` to ``tally_expected_parents``; runs after deselection so the
+    counts reflect only the selected, gated-in items. See ``release_finished_leaf``.
+    """
+    tally_expected_parents(session)
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
+    """Capture per-phase reports and finalize step status after teardown.
+
+    Stashes ``rep_<when>`` (the ``CallInfo``; NOTE(review): the pytest-docs
+    recipe stores the ``TestReport`` under that name, so confirm consumers) and
+    ``_sift_phase_<when>`` (a ``SimpleNamespace(call, report)`` used by
+    ``resolve_initial_status``). The collection-time skip path runs only for
+    gated-in items whose ``_sift_step`` is unset, so it never duplicates a step.
+    """
+    outcome = yield
+    report = outcome.get_result()
+    setattr(item, "rep_" + report.when, call)
+    setattr(item, "_sift_phase_" + report.when, SimpleNamespace(call=call, report=report))
+
+    # Collection-time skip (``@pytest.mark.skip`` / ``skipif``): the autouse
+    # ``step`` fixture never runs, so this hook must record the step itself. An
+    # unset ``_sift_step`` means none exists yet; the gate keeps excluded items out.
+    if (
+        REPORT_CONTEXT
+        and report.when == "setup"
+        and report.outcome == "skipped"
+        and getattr(item, "_sift_step", None) is None
+        and gate_enabled(item, item.config)
+    ):
+        # Nest the inline step under the same registry parents a running sibling
+        # would use: ``_sift_parents`` never ran for a marker-skipped item and the
+        # report-tree parents live off the step stack, so without resolving the
+        # parent here the step would land at the report root.
+        parent_ns = resolve_parent_chain_in_context(item, item.config, REPORT_CONTEXT)
+        parent_step = parent_ns.current_step if parent_ns is not None else None
+        with REPORT_CONTEXT.new_step(name=item.name, parent=parent_step) as inline_step:
+            inline_step.current_step.update({"status": TestStatus.SKIPPED})
+
+    if report.when == "teardown":
+        finalize_after_teardown(item, report)
+
+
+def pytest_runtest_logfinish(nodeid: str, location: tuple[str, int | None, str]) -> None:
+    """Close report-tree parents whose subtree finished with this item.
+
+    Fires once per item (pass / fail / skip / error) and forwards ``nodeid`` to
+    ``release_finished_leaf``, which decrements the item's parents' remaining-leaf
+    counts and closes any that reach zero, so containers resolve progressively
+    rather than all at session end. ``location`` is unused.
+    """
+    release_finished_leaf(nodeid)
+
+
+def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
+    """Close any report-tree parents still open at session end (innermost first).
+
+    Normally a no-op: ``report_context_impl`` finalizes the parents inside the
+    ``ReportContext`` block so their updates reach the log before the import
+    worker drains, and most parents already closed early as their subtrees
+    finished. ``finalize_parents`` is the idempotent backstop; both args are unused.
+    """
+    finalize_parents()
+
+
+def pytest_report_header(config: pytest.Config) -> str | None:
+    """Build the session-start header line: SDK version plus the active mode.
+
+    Returns ``None`` (no header) under ``-q`` (negative verbosity), matching how
+    pytest hides its own platform/plugin header.
+    """
+    if config.get_verbosity() >= 0:
+        return f"Sift: sift-stack-py {sdk_version()} — {mode_label(config)} mode"
+    return None
+
+
+def pytest_terminal_summary(terminalreporter: Any, exitstatus: int, config: pytest.Config) -> None:
+    """Finish the session with a Sift summary whose shape depends on the mode.
+
+    ``-q`` only hides the printed panel. The programmatic side effects (stashing
+    the report id / URL on ``config.stash`` and ``--sift-open-report``) still run so
+    other plugins and CI steps can consume the result. ``write_report_summary``
+    renders the panel itself; this hook owns the side effects.
+    """
+    show_panel = config.get_verbosity() >= 0
+
+    if is_disabled(config):
+        if show_panel:
+            write_disabled_summary(terminalreporter)
+        return
+
+    context = REPORT_CONTEXT
+    if context is None:
+        # No report context exists (no gated test ran), so there is nothing to show.
+        return
+
+    # Resolve the link before looking at verbosity: stashing and
+    # --sift-open-report must happen even under -q.
+    offline = is_offline(config)
+    report_id, report_url = resolve_report_link(context, offline)
+    if report_id:
+        config.stash[SIFT_REPORT_ID_STASH_KEY] = report_id
+    if report_url is not None:
+        config.stash[SIFT_REPORT_URL_STASH_KEY] = report_url
+        if OPEN_OPTION.resolve(config):
+            maybe_open_report(report_url)
+
+    # Only the printed panel depends on verbosity; everything above already ran
+    # for programmatic consumers.
+    if show_panel:
+        write_report_summary(terminalreporter, context, config, report_id, report_url, offline)
diff --git a/python/lib/sift_client/resources/test_results.py b/python/lib/sift_client/resources/test_results.py
index 22e984b5e..10ef70920 100644
--- a/python/lib/sift_client/resources/test_results.py
+++ b/python/lib/sift_client/resources/test_results.py
@@ -96,6 +96,7 @@ async def create(
         created_report = await self._low_level_client.create_test_report(
             test_report=test_report,
             log_file=log_file,
+            simulate=self.client._simulate,
         )
         return self._finalize(created_report, log_file)
 
@@ -271,7 +272,7 @@ async def update(
         update.resource_id = test_report_id
         existing = test_report if isinstance(test_report, TestReport) else None
         updated_test_report = await self._low_level_client.update_test_report(
-            update, log_file=log_file, existing=existing
+            update, log_file=log_file, existing=existing, simulate=self.client._simulate
         )
         return self._finalize(updated_test_report, log_file)
 
@@ -319,7 +320,7 @@ async def create_step(
         if isinstance(test_step, dict):
             test_step = TestStepCreate.model_validate(test_step)
         test_step_result = await self._low_level_client.create_test_step(
-            test_step, log_file=log_file
+            test_step, log_file=log_file, simulate=self.client._simulate
         )
         return self._finalize(test_step_result, log_file)
 
@@ -450,7 +451,7 @@ async def update_step(
         update.resource_id = test_step_id
         existing = test_step if isinstance(test_step, TestStep) else None
         updated_test_step = await self._low_level_client.update_test_step(
-            update, log_file=log_file, existing=existing
+            update, log_file=log_file, existing=existing, simulate=self.client._simulate
         )
         return self._finalize(updated_test_step, log_file)
 
@@ -484,10 +485,10 @@ async def create_measurement(
         if isinstance(test_measurement, dict):
             test_measurement = TestMeasurementCreate.model_validate(test_measurement)
         test_measurement_result = await self._low_level_client.create_test_measurement(
-            test_measurement, log_file=log_file
+            test_measurement, log_file=log_file, simulate=self.client._simulate
         )
         measurement = self._finalize(test_measurement_result, log_file)
-        if update_step and log_file is None:
+        if update_step and log_file is None and not self.client._simulate:
             step = await self.get_step(test_step=test_measurement_result.test_step_id)
             if step.status == TestStatus.PASSED and not measurement.passed:
                 await self.update_step(test_step=step, update={"status": TestStatus.FAILED})
@@ -508,7 +509,7 @@ async def create_measurements(
             A tuple of (measurements_created_count, measurement_ids).
         """
         return await self._low_level_client.create_test_measurements(
-            test_measurements, log_file=log_file
+            test_measurements, log_file=log_file, simulate=self.client._simulate
         )
 
     async def list_measurements(
@@ -621,10 +622,16 @@ async def update_measurement(
 
         update.resource_id = test_measurement.id_
         updated_test_measurement = await self._low_level_client.update_test_measurement(
-            update, log_file=log_file, existing=test_measurement
+            update, log_file=log_file, existing=test_measurement, simulate=self.client._simulate
         )
         updated_test_measurement = self._finalize(updated_test_measurement, log_file)
-        if update_step and log_file is None and update.passed is not None and not update.passed:
+        if (
+            update_step
+            and log_file is None
+            and not self.client._simulate
+            and update.passed is not None
+            and not update.passed
+        ):
             step = await self.get_step(test_step=updated_test_measurement.test_step_id)
             if step.status == TestStatus.PASSED:
                 await self.update_step(test_step=step, update={"status": TestStatus.FAILED})
@@ -664,7 +671,8 @@ async def import_log_file(
             A ReplayResult containing the created report, steps, and measurements.
         """
         result = await self._low_level_client.import_log_file(log_file, incremental=incremental)
-        result.report = self._apply_client_to_instance(result.report)
+        if result.report is not None:
+            result.report = self._apply_client_to_instance(result.report)
         result.steps = self._apply_client_to_instances(result.steps)
         result.measurements = self._apply_client_to_instances(result.measurements)
         return result
diff --git a/python/lib/sift_client/scripts/import_test_result_log.py b/python/lib/sift_client/scripts/import_test_result_log.py
index 7e14e4d59..3f66af1da 100644
--- a/python/lib/sift_client/scripts/import_test_result_log.py
+++ b/python/lib/sift_client/scripts/import_test_result_log.py
@@ -20,7 +20,8 @@
 
 
 def _print_result(result: ReplayResult) -> None:
-    print(f"Report: {result.report.name} (id={result.report.id_})")
+    if result.report is not None:
+        print(f"Report: {result.report.name} (id={result.report.id_})")
     print(f"Steps:  {len(result.steps)}")
     for step in result.steps:
         print(f"  - {step.step_path} [{step.status}]")
diff --git a/python/lib/sift_client/sift_types/_mixins/metadata.py b/python/lib/sift_client/sift_types/_mixins/metadata.py
new file mode 100644
index 000000000..b53fa5dce
--- /dev/null
+++ b/python/lib/sift_client/sift_types/_mixins/metadata.py
@@ -0,0 +1,19 @@
+"""Placeholder for a future ``MetadataMixin`` (not yet implemented).
+
+TODO(metadata-mixin): metadata updates REPLACE the whole map.
+``entity.update({"metadata": {...}})`` builds a field mask over ``metadata``
+(see ``ModelUpdate.to_proto_with_mask`` in ``sift_types/_base.py``) and replaces
+it server-side — callers must spread the current ``.metadata`` first or silently
+drop existing keys (config defaults, git fields, ``pytest_command``).
+
+Planned shape: a ``MetadataMixin`` exposing a read-merge-write helper such as
+``add_metadata(**kv)`` / ``merge_metadata(dict)``, implemented as
+``self.update({"metadata": {**self.metadata, **kv}})``. Mix into every read
+entity that carries a ``metadata`` field — ``Asset``, ``Run``, ``Report``,
+``TestReport``, ``TestStep``, ``TestMeasurement`` — alongside
+``FileAttachmentsMixin`` and ``SimulatedMixin``. It stays a mixin (not a
+``BaseType`` method) because it relies on the ``metadata`` field, which not
+every ``BaseType`` subclass has (e.g. ``CalculatedChannel`` exposes metadata
+only on its Create/Update models, so it is out of scope). Until it exists,
+merge at the call site.
+"""
diff --git a/python/lib/sift_client/sift_types/_mixins/simulated.py b/python/lib/sift_client/sift_types/_mixins/simulated.py
new file mode 100644
index 000000000..bdc2c572a
--- /dev/null
+++ b/python/lib/sift_client/sift_types/_mixins/simulated.py
@@ -0,0 +1,32 @@
+"""Mixin that exposes ``is_simulated`` on test-results entity types."""
+
+from __future__ import annotations
+
+
+class SimulatedMixin:
+    """Mixin for sift_types whose response can be produced by the simulate path.
+
+    The low-level wrapper stamps ``_simulated=True`` on entities it returns from
+    a simulated branch (see ``TestResultsLowLevelClient._mark_simulated``). This
+    mixin exposes that flag as a read-only ``is_simulated`` property so
+    consumers and tests can detect when an instance was synthesized rather than
+    round-tripped through Sift.
+
+    The mixin only annotates ``_simulated`` and gives it no value, so every
+    inheriting class must declare ``_simulated: bool = False`` so pydantic tracks it.
+    """
+
+    _simulated: bool
+
+    @property
+    def is_simulated(self) -> bool:
+        """Return the ``_simulated`` flag: True for a synthesized instance.
+
+        Per the class contract, the low-level wrapper sets the flag when a call
+        short-circuited to a synthesized response (``SiftClient._simulate`` mode,
+        or a per-call ``log_file`` / ``simulate=True``), so offline mode reports
+        True as well. It stays False for entities returned from a normal online
+        call or constructed manually outside the SDK. NOTE(review): raises
+        ``AttributeError`` if the inheriting class never declared the field.
+        """
+        return self._simulated
diff --git a/python/lib/sift_client/sift_types/asset.py b/python/lib/sift_client/sift_types/asset.py
index 78217934f..ea0895929 100644
--- a/python/lib/sift_client/sift_types/asset.py
+++ b/python/lib/sift_client/sift_types/asset.py
@@ -27,6 +27,8 @@ class Asset(BaseType[AssetProto, "Asset"], FileAttachmentsMixin):
     modified_date: datetime
     modified_by_user_id: str
     tags: list[str | Tag]
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     is_archived: bool
 
diff --git a/python/lib/sift_client/sift_types/report.py b/python/lib/sift_client/sift_types/report.py
index 42f349f42..34f64e2f1 100644
--- a/python/lib/sift_client/sift_types/report.py
+++ b/python/lib/sift_client/sift_types/report.py
@@ -108,6 +108,8 @@ class Report(BaseType[ReportProto, "Report"]):
     summaries: list[ReportRuleSummary]
     tags: list[str]
     rerun_from_report_id: str | None = None
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     job_id: str
     archived_date: datetime | None = None
diff --git a/python/lib/sift_client/sift_types/run.py b/python/lib/sift_client/sift_types/run.py
index ec6690896..e91225342 100644
--- a/python/lib/sift_client/sift_types/run.py
+++ b/python/lib/sift_client/sift_types/run.py
@@ -40,6 +40,8 @@ class Run(BaseType[RunProto, "Run"], FileAttachmentsMixin):
     created_by_user_id: str
     modified_by_user_id: str
     organization_id: str
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     tags: list[str]
     asset_ids: list[str]
diff --git a/python/lib/sift_client/sift_types/test_report.py b/python/lib/sift_client/sift_types/test_report.py
index ecc24f52f..b8b1f2236 100644
--- a/python/lib/sift_client/sift_types/test_report.py
+++ b/python/lib/sift_client/sift_types/test_report.py
@@ -36,6 +36,7 @@
     ModelUpdate,
 )
 from sift_client.sift_types._mixins.file_attachments import FileAttachmentsMixin
+from sift_client.sift_types._mixins.simulated import SimulatedMixin
 from sift_client.sift_types.channel import Channel
 from sift_client.util.metadata import metadata_dict_to_proto, metadata_proto_to_dict
 
@@ -153,7 +154,7 @@ def to_proto(self) -> TestStepProto:
         return proto
 
 
-class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin):
+class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin, SimulatedMixin):
     """TestStep model representing a step in a test."""
 
     test_report_id: str
@@ -166,9 +167,13 @@ class TestStep(BaseType[TestStepProto, "TestStep"], FileAttachmentsMixin):
     start_time: datetime
     end_time: datetime
     error_info: ErrorInfo | None = None
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool] | None = None
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(cls, proto: TestStepProto, sift_client: SiftClient | None = None) -> TestStep:
@@ -383,7 +388,7 @@ def to_proto(self) -> TestMeasurementProto:
         return proto
 
 
-class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"]):
+class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"], SimulatedMixin):
     """TestMeasurement model representing a measurement in a test."""
 
     measurement_type: TestMeasurementType
@@ -399,11 +404,47 @@ class TestMeasurement(BaseType[TestMeasurementProto, "TestMeasurement"]):
     passed: bool
     timestamp: datetime
     description: str | None = None
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool] | None = None
     channel_names: list[str] | None = None
 
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
+
+    def __str__(self) -> str:
+        """Render the measurement as ``[STATUS] name = value [unit] (bounds)``.
+
+        Meant for failure messages, logs, and the REPL. Unset parts (unit,
+        bounds) are left out, ``?`` stands in when no value field is populated,
+        and the status prefix follows ``self.passed``.
+        """
+        # Value: the first populated field wins, in numeric / string / boolean order.
+        if self.numeric_value is not None:
+            unit_suffix = f" {self.unit}" if self.unit else ""
+            rendered = f"{self.numeric_value}{unit_suffix}"
+        elif self.string_value is not None:
+            rendered = repr(self.string_value)
+        elif self.boolean_value is not None:
+            rendered = str(self.boolean_value).lower()
+        else:
+            rendered = "?"
+
+        # Bounds: numeric limits take precedence over an expected string.
+        limits = self.numeric_bounds
+        suffix = ""
+        if limits is not None:
+            labelled = (("min", limits.min), ("max", limits.max))
+            shown = [f"{label} {bound}" for label, bound in labelled if bound is not None]
+            if shown:
+                suffix = f" ({', '.join(shown)})"
+        elif self.string_expected_value:
+            suffix = f" (expected {self.string_expected_value!r})"
+
+        verdict = "PASSED" if self.passed else "FAILED"
+        return f"[{verdict}] {self.name} = {rendered}{suffix}"
 
     @classmethod
     def _from_proto(
@@ -599,7 +640,7 @@ def _to_proto(self) -> ErrorInfoProto:
         )
 
 
-class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
+class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin, SimulatedMixin):
     """TestReport model representing a test report."""
 
     status: TestStatus
@@ -608,6 +649,8 @@ class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
     test_case: str
     start_time: datetime
     end_time: datetime
+    # NOTE: update() replaces this map wholesale. See TODO(metadata-mixin) in
+    # sift_types/_mixins/metadata.py before adding keys at runtime.
     metadata: dict[str, str | float | bool]
     serial_number: str | None = None
     part_number: str | None = None
@@ -617,6 +660,8 @@ class TestReport(BaseType[TestReportProto, "TestReport"], FileAttachmentsMixin):
     is_archived: bool
     # Set by the resource layer when this instance was produced from a logging-mode call
     _log_file: str | Path | None = None
+    # Set by the low-level wrapper when this instance came from the simulate path
+    _simulated: bool = False
 
     @classmethod
     def _from_proto(
diff --git a/python/lib/sift_client/transport/base_connection.py b/python/lib/sift_client/transport/base_connection.py
index 02f0e096e..6586412fe 100644
--- a/python/lib/sift_client/transport/base_connection.py
+++ b/python/lib/sift_client/transport/base_connection.py
@@ -24,6 +24,7 @@ def __init__(
         api_key: str,
         use_ssl: bool = True,
         cert_via_openssl: bool = False,
+        app_url: str | None = None,
     ):
         """Initialize the connection configuration.
 
@@ -33,12 +34,17 @@ def __init__(
             api_key: The API key for authentication.
             use_ssl: Whether to use SSL/TLS for secure connections.
             cert_via_openssl: Whether to use OpenSSL for certificate validation.
+            app_url: The Sift web-app origin (e.g. ``https://app.siftstack.com``).
+                Set this for on-prem or custom deployments whose API host can't be
+                mapped to a frontend automatically. When unset, the web-app URL is
+                derived from ``rest_url`` for known hosts.
         """
         self.api_key = api_key
         self.grpc_url = grpc_url
         self.rest_url = rest_url
         self.use_ssl = use_ssl
         self.cert_via_openssl = cert_via_openssl
+        self.app_url = app_url
 
     def get_grpc_config(self):
         """Create and return a GrpcConfig with the current settings.
diff --git a/python/lib/sift_client/util/test_results/__init__.py b/python/lib/sift_client/util/test_results/__init__.py
index e7a82866c..a3ac081bc 100644
--- a/python/lib/sift_client/util/test_results/__init__.py
+++ b/python/lib/sift_client/util/test_results/__init__.py
@@ -49,78 +49,105 @@ def main(self):
     cleanup()
 ```
 
-## Pytest Fixtures
+## Pytest Plugin
 
-The report context and steps can also be accessed in pytest by importing the `report_context` and `step` fixtures.
+The pytest plugin lives at `sift_client.pytest_plugin`. Opt in
+from your `conftest.py`:
 
-### How to use:
-- These fixtures are set to autouse and will automatically create a report and steps for each test function.
-  - If you want each module(file) to be marked as a step w/ each test as a substep, import the `module_substep` fixture as well.
-- The `report_context` fixture requires a fixture `sift_client` returning an `SiftClient` instance to be passed in.
-
-Note: FedRAMP users: report_context will log test results to a temp file to avoid API calls during test execution. If this is a shared environment, you can disable logging by passing ``--sift-test-results-log-file=false``.
-
-#### Configuration
-
-Import the `pytest_addoption` function to add configuration options for Test Results to the commandline or add the options to your pyproject.toml file (https://docs.pytest.org/en/stable/reference/customize.html#configuration). If ommitted, will use the default values described below.
-
-- Git metadata: Include git metadata (repo, branch, commit) in the test results. Default is True. You can disable it by passing `--no-sift-test-results-git-metadata`.
-- Log file: Write test results to a file. This happens automatically but you can configure specify a specific log file by passing `--sift-test-results-log-file=<path>` or disable logging by passing `--sift-test-results-log-file=false`.
-- Check connection: Pass `--sift-test-results-check-connection` (off by default) to make the `report_context`, `step`, and `module_substep` fixtures no-op when the Sift client has no connection to the server. Requires a `client_has_connection` fixture to be available.
+```python
+# conftest.py
+pytest_plugins = ["sift_client.pytest_plugin"]
+```
 
-###### Example at top of your test file or in your conftest.py file:
+By default, every test in the session produces a Sift report: one
+`TestReport` per session, one step per test function (`step`), and one
+parent step per Python package (directory with `__init__.py`), test file,
+and test class above it. Individual layers can be flattened via the
+`sift_package_step`, `sift_module_step`, `sift_class_step`, and
+`sift_parametrize_nesting` ini
+flags. The plugin also registers a default `sift_client` fixture that reads
+`SIFT_API_KEY`, `SIFT_GRPC_URI`, and `SIFT_REST_URI` from the environment.
+Override it by defining your own `sift_client` fixture in your conftest.
+
+Note for FedRAMP users: results are buffered to a temp file and uploaded by a
+subprocess at session end (no API calls during the run). Disable the buffer
+entirely with `--sift-log-file=false` for inline uploads.
+
+### Controlling which tests produce reports
+
+The autouse fixtures fire for every test by default. To narrow that:
+
+- Set `sift_autouse = false` in `pyproject.toml` to flip the
+  project default off, then opt tests back in below.
+- `@pytest.mark.sift_include` forces reporting on for a test, class, or
+  module. `@pytest.mark.sift_exclude` forces it off. Closest marker wins.
+  `sift_exclude` beats `sift_include` when both apply.
+- `pytestmark` at the class or module level inherits to every test in scope.
+- For a whole directory, apply the marker in bulk from that directory's
+  `conftest.py`:
 
 ```python
-import pytest
+# tests/integration/conftest.py
+from pathlib import Path
 
-@pytest.fixture(scope="session")
-def sift_client() -> SiftClient:
-    grpc_url = os.getenv("SIFT_GRPC_URI", "localhost:50051")
-    rest_url = os.getenv("SIFT_REST_URI", "localhost:8080")
-    api_key = os.getenv("SIFT_API_KEY", "")
+import pytest
 
-    client = SiftClient(api_key=api_key, grpc_url=grpc_url, rest_url=rest_url)
+_HERE = Path(__file__).parent
 
-    return client
 
-from sift_client.util.test_results import *
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        try:
+            item.path.relative_to(_HERE)
+        except ValueError:
+            continue
+        item.add_marker(pytest.mark.sift_include)
 ```
 
-###### Then in your test file:
+#### Configuration
 
-```python
-# Because step was already imported and set autouse=True, this test will automatically get a step created for it.
-def test_no_includes():
-    assert condition, "Example failure"
-
-# Passing the fixtures to the test function allows you to take measurements or create substeps.
-def test_example(report_context, step):
-    # This will add a measurement to the current step for this function
-    step.measure(name="Example Measurement", value=test_string_value, bounds="expected_string_value")
-
-    with report_context.new_step(name="Example Step") as substep:
-        example_measurement = tlm.read(channel_name)
-        substep.measure(name="Substep Measurement", value=example_measurement, bounds=(min=74.9, max=75.1))
+CLI options registered by the plugin:
+
+- `--sift-offline`: Run without contacting Sift. All create/update calls are
+  written to the JSONL log file for later replay via `import-test-result-log`.
+  No session-start ping is attempted.
+- `--sift-disabled`: Skip Sift entirely. Nothing contacts the API and no
+  log file is written. `step.measure(...)` still evaluates bounds and
+  returns a real pass/fail boolean. Returned entities expose
+  ``is_simulated == True``. Also honored via the `SIFT_DISABLED` env
+  var. Supersedes every other flag.
+- `--sift-log-file`: Path to write the JSONL log file. `true`
+  (default) auto-creates a temp file. `false` or `none` disables logging.
+  Any other value is treated as a file path.
+- `--no-sift-git-metadata`: Exclude git metadata (repo, branch,
+  commit) from the test report. Included by default.
+
+Each option has a matching ini key for per-project configuration under
+``[tool.pytest.ini_options]`` in ``pyproject.toml`` (or ``[pytest]`` in
+``pytest.ini``). CLI flags override ini values. The
+``sift_autouse`` ini key (bool, default ``true``) sets the
+project-wide default for the gate described above. The default
+``sift_client`` fixture reads ``sift_grpc_uri`` and ``sift_rest_uri`` as
+fallbacks when the corresponding env vars are unset (env vars win when
+both are set). ``SIFT_API_KEY`` is env-only. Load it from a ``.env`` file
+via the ``pytest-dotenv`` plugin or inject it via your CI secret manager.
+
+```toml
+[tool.pytest.ini_options]
+sift_autouse = false
+sift_offline = true
+sift_git_metadata = false
+sift_grpc_uri = "your-org.sift.example:443"
+sift_rest_uri = "https://your-org.sift.example"
 ```
+
+To disable the plugin for a single run:
+`pytest -p no:sift_client.pytest_plugin`.
 """
 
 from .context_manager import NewStep, ReportContext
-from .pytest_util import (
-    client_has_connection,
-    module_substep,
-    pytest_addoption,
-    pytest_runtest_makereport,
-    report_context,
-    step,
-)
 
 __all__ = [
     "NewStep",
     "ReportContext",
-    "client_has_connection",
-    "module_substep",
-    "pytest_addoption",
-    "pytest_runtest_makereport",
-    "report_context",
-    "step",
 ]
diff --git a/python/lib/sift_client/util/test_results/bounds.py b/python/lib/sift_client/util/test_results/bounds.py
index ef5c67ce5..b734cc126 100644
--- a/python/lib/sift_client/util/test_results/bounds.py
+++ b/python/lib/sift_client/util/test_results/bounds.py
@@ -1,5 +1,10 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
 from sift_client.sift_types.test_report import (
     NumericBounds,
     TestMeasurement,
@@ -8,6 +13,55 @@
     TestMeasurementUpdate,
 )
 
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+def to_numpy_array(
+    values: list[float | int] | NDArray[np.float64] | pd.Series,
+) -> NDArray[np.float64]:
+    """Coerce a list, ndarray, or pandas Series into a numpy array.
+
+    Common entry point for ``NewStep.measure_avg`` / ``measure_all`` inputs:
+    ndarrays pass through untouched; any other type raises ``ValueError``.
+    """
+    if isinstance(values, np.ndarray):
+        return values
+    if isinstance(values, pd.Series):
+        return values.to_numpy()
+    if not isinstance(values, list):
+        raise ValueError(f"Invalid value type: {type(values)}")
+    return np.array(values)
+
+
+def out_of_bounds_mask(
+    arr: NDArray[np.float64],
+    bounds: dict[str, float] | NumericBounds,
+) -> NDArray[np.bool_]:
+    """Build a boolean mask marking the elements of ``arr`` outside ``bounds``.
+
+    ``bounds`` may be a ``{"min": ..., "max": ...}`` dict or ``NumericBounds``;
+    either limit may be omitted. Raises ``ValueError`` if both are unset.
+    """
+    if isinstance(bounds, dict):
+        bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
+    lower, upper = bounds.min, bounds.max
+    if lower is None and upper is None:
+        raise ValueError("No bounds provided")
+    if upper is None:
+        return arr < lower
+    if lower is None:
+        return arr > upper
+    return (arr < lower) | (arr > upper)
+
+
+def all_within_bounds(
+    arr: NDArray[np.float64],
+    bounds: dict[str, float] | NumericBounds,
+) -> bool:
+    """Report whether no element of ``arr`` falls outside ``bounds``."""
+    return not bool(np.any(out_of_bounds_mask(arr, bounds)))
+
 
 def assign_value_to_measurement(
     measurement: TestMeasurement | TestMeasurementCreate | TestMeasurementUpdate,
@@ -32,6 +86,38 @@ def assign_value_to_measurement(
         raise ValueError(f"Invalid value type: {type(value)}")
 
 
+def value_passes_bounds(
+    value: float | str | bool,
+    bounds: dict[str, float] | NumericBounds | str | bool | None,
+) -> bool:
+    """Evaluate a value against bounds without recording a measurement.
+
+    ``None`` bounds always pass. Numeric limits are inclusive and tested as
+    ``limit <= value`` so NaN fails; mismatched value types raise.
+    """
+    if bounds is None:
+        return True
+    if isinstance(bounds, dict):
+        bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
+    if isinstance(bounds, bool):
+        if isinstance(value, str):
+            return value.lower() == str(bounds).lower()
+        return bool(value) == bounds
+    if isinstance(bounds, str):
+        if not isinstance(value, (str, bool)):
+            raise ValueError("Value must be a string if bounds provided is a string")
+        if isinstance(value, bool):
+            return str(value).lower() == bounds.lower()
+        return value == bounds
+    try:
+        min_ok = bounds.min is None or bounds.min <= value  # type: ignore[operator]
+        return bool(min_ok and (bounds.max is None or bounds.max >= value))  # type: ignore
+    except TypeError:
+        raise TypeError(
+            f"Value must be a float or int to evaluate numeric bounds but gave {type(value)}"
+        ) from None
+
+
 def evaluate_measurement_bounds(
     measurement: TestMeasurement | TestMeasurementCreate | TestMeasurementUpdate,
     value: float | str | bool,
@@ -53,31 +139,10 @@ def evaluate_measurement_bounds(
 
     if isinstance(bounds, dict):
         bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))
-    if isinstance(bounds, bool):
-        if isinstance(value, str):
-            measurement.passed = str(value).lower() == str(bounds).lower()
-        else:
-            measurement.passed = bool(value) == bounds
-        return bool(measurement.passed)
-    elif isinstance(bounds, str):
-        if not (isinstance(value, str) or isinstance(value, bool)):
-            raise ValueError("Value must be a string if bounds provided is a string")
+    if isinstance(bounds, str) and not isinstance(bounds, bool):
         measurement.string_expected_value = bounds
-        if isinstance(value, bool):
-            measurement.passed = str(value).lower() == str(bounds).lower()
-        else:
-            measurement.passed = value == bounds
     elif isinstance(bounds, NumericBounds):
         measurement.numeric_bounds = bounds
-        measurement.passed = True
-        try:
-            if measurement.numeric_bounds.min is not None:
-                measurement.passed = measurement.passed and measurement.numeric_bounds.min <= value  # type: ignore
-            if measurement.numeric_bounds.max is not None:
-                measurement.passed = measurement.passed and measurement.numeric_bounds.max >= value  # type: ignore
-        except TypeError:
-            raise TypeError(
-                f"Value must be a float or int to evaluate numeric bounds but gave {type(value)}"
-            ) from None
 
+    measurement.passed = value_passes_bounds(value, bounds)
     return bool(measurement.passed)
diff --git a/python/lib/sift_client/util/test_results/context_manager.py b/python/lib/sift_client/util/test_results/context_manager.py
index 354f8564d..84b97dab8 100644
--- a/python/lib/sift_client/util/test_results/context_manager.py
+++ b/python/lib/sift_client/util/test_results/context_manager.py
@@ -7,17 +7,20 @@
 import subprocess
 import tempfile
 import traceback
+import warnings
+from collections import Counter
 from contextlib import AbstractContextManager, contextmanager
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 import numpy as np
-import pandas as pd
 
+from sift_client.errors import SiftWarning
 from sift_client.sift_types.test_report import (
     ErrorInfo,
     NumericBounds,
+    TestMeasurement,
     TestMeasurementCreate,
     TestReport,
     TestReportCreate,
@@ -28,9 +31,12 @@
 )
 from sift_client.util.test_results.bounds import (
     evaluate_measurement_bounds,
+    out_of_bounds_mask,
+    to_numpy_array,
 )
 
 if TYPE_CHECKING:
+    import pandas as pd
     from numpy.typing import NDArray
 
     from sift_client.client import SiftClient
@@ -38,17 +44,53 @@
 
 logger = logging.getLogger(__name__)
 
+# Sentinel for ``create_step``/``new_step``'s ``parent`` argument. Distinguishes
+# "parent omitted -> use the top of the step stack" (the default, linear
+# behavior) from an explicit ``parent=None`` (create at the report root). The
+# pytest plugin passes an explicit parent to build its report tree out of
+# execution order; everyday ``new_step``/``substep`` callers omit it.
+_USE_STACK_TOP = object()
+
+
+def format_truncated_traceback(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+    tb: object | None,
+) -> ErrorInfo:
+    """Format an ErrorInfo from a traceback: first formatted entry plus the last 10 entries."""
+    stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore[arg-type]
+    stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
+    return ErrorInfo(error_code=1, error_message="".join(stack))
+
+
+def format_assertion_message(
+    exc: type[BaseException] | None,
+    exc_value: BaseException | None,
+) -> ErrorInfo:
+    """Build an ErrorInfo holding only the exception line(s), without stack frames.
+
+    Pytest's rewritten ``assert`` explanation is carried by the exception
+    itself, so frames would only add noise. Mirrors what pytest exposes as
+    ``excinfo.exconly()``.
+    """
+    message = "".join(traceback.format_exception_only(exc, exc_value))  # type: ignore[arg-type]
+    return ErrorInfo(error_code=1, error_message=message)
+
 
 def log_replay_instructions(log_file: str | Path | None) -> None:
-    """Log instructions for manually replaying a test result log file.
+    """Surface replay instructions when an import/replay attempt fails.
 
-    Used when an import/replay attempt fails so the user can retry against the same file.
+    Emitted as a ``SiftWarning`` (not a logger.error) so pytest and other
+    runners surface it in their warning summary; logger.error is suppressed
+    by default in most CLI tools.
     """
     if log_file is None:
         return
-    logger.error(
-        f"Error replaying log file: {log_file}.\n"
-        f"  Can replay with `replay-test-result-log {log_file}`."
+    warnings.warn(
+        f"Sift log file was not fully replayed: {log_file}. "
+        f"Re-run with `import-test-result-log --incremental {log_file}` to complete the upload.",
+        SiftWarning,
+        stacklevel=2,
     )
 
 
@@ -104,10 +146,38 @@ class ReportContext(AbstractContextManager):
     log_file: Path | None
     step_is_open: bool
     step_stack: list[TestStep]
-    step_number_at_depth: dict[int, int]
+    # Per-parent child counter keyed by the parent's ``step_path`` (``""`` is the
+    # root bucket). Drives parent-relative path numbering so two parents at the
+    # same depth never collide and a step's path is stable regardless of the
+    # order siblings are created in.
+    child_counts: dict[str, int]
     open_step_results: dict[str, bool]
+    # Latest child ``end_time`` seen for each parent, keyed by the parent's
+    # ``step_path``. A parent that stays open across the whole run (e.g. a
+    # hierarchy/parametrize parent the pytest plugin holds in its registry) is
+    # closed with this time, so its duration spans first-child-start to
+    # last-descendant-finish rather than wall-clock at session end.
+    parent_end_times: dict[str, datetime]
     any_failures: bool
+    # Every step created in this report (including hierarchy/parametrize
+    # parents), retained after close so end-of-run summaries can tally final
+    # statuses. ``update`` mutates step instances in place, so these references
+    # reflect late status changes (e.g. a teardown-phase failure).
+    created_steps: list[TestStep]
+    # Every measurement recorded in this report, retained for end-of-run
+    # summaries. Appended in ``NewStep.measure``. A measurement's ``passed`` is
+    # fixed at creation, so the retained references stay accurate.
+    created_measurements: list[TestMeasurement]
+    # Set True in ``__exit__`` when the background replay worker timed out or
+    # exited non-zero, so callers (e.g. the pytest plugin footer) can flag that
+    # the uploaded report may be missing entries.
+    replay_incomplete: bool = False
     _import_proc: subprocess.Popen | None = None
+    # Seconds to wait for the import worker subprocess to finish uploading
+    # the JSONL backlog at session end before killing it. Tests substitute
+    # a smaller value (via ``_make_context`` patching) so they don't wait
+    # the full window for the timeout branch to trigger.
+    _import_proc_timeout: float = 30.0
 
     def __init__(
         self,
@@ -116,8 +186,12 @@ def __init__(
         test_system_name: str | None = None,
         system_operator: str | None = None,
         test_case: str | None = None,
+        serial_number: str | None = None,
+        part_number: str | None = None,
         log_file: str | Path | bool | None = None,
         include_git_metadata: bool = False,
+        replay_log_file: bool = True,
+        metadata: dict[str, str | float | bool] | None = None,
     ):
         """Initialize a new report context.
 
@@ -127,16 +201,33 @@ def __init__(
             test_system_name: The name of the test system. Will default to the hostname if not provided.
             system_operator: The operator of the test system. Will default to the current user if not provided.
             test_case: The name of the test case. Will default to the basename of the file containing the test if not provided.
+            serial_number: Optional serial_number stored on the report. Unset when None.
+            part_number: Optional part_number stored on the report. Unset when None.
             log_file: If True, create a temp log file. If a path, use that path.
-                All create/update operations will be logged to this file.
+                If False/None, no log file is written and create/update calls go
+                directly to the API.
             include_git_metadata: If True, include git metadata in the report.
+            metadata: Structured key/value metadata to attach to the report. Merged
+                on top of git metadata when ``include_git_metadata`` is True, so
+                explicit keys win on collision.
+            replay_log_file: When True (the default) and ``log_file`` is set,
+                spawn ``import-test-result-log --incremental`` to push log
+                entries to Sift in the background during the session. When
+                False, the log file is just a record and no worker is spawned.
+                Replay happens later via ``import-test-result-log <path>``.
+                Has no effect when ``log_file`` is None.
         """
         self.client = client
+        self.replay_log_file = replay_log_file
         self.step_is_open = False
         self.step_stack = []
-        self.step_number_at_depth = {}
+        self.child_counts = {}
         self.open_step_results = {}
+        self.parent_end_times = {}
         self.any_failures = False
+        self.created_steps = []
+        self.created_measurements = []
+        self.replay_incomplete = False
 
         if log_file is True:
             tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False)
@@ -151,6 +242,10 @@ def __init__(
         test_case = test_case if test_case else os.path.basename(__file__)
         test_system_name = test_system_name if test_system_name else socket.gethostname()
         system_operator = system_operator if system_operator else getpass.getuser()
+        combined_metadata = {
+            **(_git_metadata() or {} if include_git_metadata else {}),
+            **(metadata or {}),
+        }
         create = TestReportCreate(
             name=name,
             test_system_name=test_system_name,
@@ -159,32 +254,47 @@ def __init__(
             end_time=datetime.now(timezone.utc),
             status=TestStatus.IN_PROGRESS,
             system_operator=system_operator,
-            metadata=_git_metadata() if include_git_metadata else None,  # type: ignore
+            serial_number=serial_number,
+            part_number=part_number,
+            metadata=combined_metadata or None,  # type: ignore
         )
         self.report = client.test_results.create(create, log_file=self.log_file)
 
+    def _build_replay_command(self) -> list[str]:
+        """Assemble the argv used to launch the ``import-test-result-log`` worker.
+
+        Kept as its own method so tests can swap in a command with a chosen
+        exit code / stderr and drive the ``__exit__`` branches without the
+        real replay binary.
+        """
+        grpc_config = self.client.grpc_client._config
+        rest_config = self.client.rest_client._config
+        return [
+            "import-test-result-log",
+            "--incremental",
+            str(self.log_file),
+            *("--grpc-url", grpc_config.uri),
+            *("--rest-url", rest_config.base_url),
+            # NOTE(review): the API key travels on argv; confirm that exposure is acceptable.
+            *("--api-key", grpc_config.api_key),
+        ]
+
     def _open_import_proc(self):
-        """Open a subprocess to import the log file."""
+        """Open a subprocess to import the log file.
+
+        ``stderr`` is captured so a worker crash mid-session can surface its
+        error at session end via ``__exit__`` rather than failing silently.
+        """
         with _quiet_fork_stderr():
             self._import_proc = subprocess.Popen(
-                [
-                    "import-test-result-log",
-                    "--incremental",
-                    str(self.log_file),
-                    "--grpc-url",
-                    self.client.grpc_client._config.uri,
-                    "--rest-url",
-                    self.client.rest_client._config.base_url,
-                    "--api-key",
-                    self.client.grpc_client._config.api_key,
-                ],
+                self._build_replay_command(),
                 stdin=subprocess.PIPE,
                 stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
             )
 
     def __enter__(self):
-        if self.log_file:
+        if self.log_file and self.replay_log_file:
             self._open_import_proc()
         return self
 
@@ -199,46 +309,130 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.report.update(update)
 
         if self._import_proc is not None:
+            # Three outcomes for the replay worker at session end. None of
+            # them fail the session — tests already ran and their outcome
+            # is independent of delivery. The local log file is the source
+            # of recovery for both failure modes via
+            # `import-test-result-log <path>`:
+            #   1. Exits cleanly (returncode 0). Silent.
+            #   2. Still running after the grace window (TimeoutExpired).
+            #      Healthy worker with a large backlog; kill and surface
+            #      replay instructions. 30 seconds is enough for a normal
+            #      test suite to drain; pathological backlogs should opt
+            #      into inline mode (`--sift-log-file=false`) instead.
+            #   3. Exited with non-zero. Connection failures and API call
+            #      errors land here — the worker's replay loop has no retry,
+            #      so the first failed RPC crashes the subprocess. Surface
+            #      the captured stderr with replay instructions.
             try:
-                self._import_proc.communicate(timeout=1)
+                _, stderr_bytes = self._import_proc.communicate(timeout=self._import_proc_timeout)
             except subprocess.TimeoutExpired:
-                logger.error("Import process did not exit in 10s, killing it")
                 self._import_proc.kill()
                 self._import_proc.wait()
+                self.replay_incomplete = True
+                warnings.warn(
+                    f"Sift import worker did not exit in "
+                    f"{self._import_proc_timeout}s; killing it. "
+                    "Local log file is preserved for manual replay.",
+                    SiftWarning,
+                    stacklevel=2,
+                )
+                log_replay_instructions(self.log_file)
+                return True  # Ensures the session is marked as passed in pytest
+            if self._import_proc.returncode != 0:
+                self.replay_incomplete = True
+                stderr_text = (
+                    stderr_bytes.decode("utf-8", errors="replace").strip() if stderr_bytes else ""
+                )
+                warnings.warn(
+                    f"Sift import worker exited with code "
+                    f"{self._import_proc.returncode}. stderr: {stderr_text or '<empty>'}",
+                    SiftWarning,
+                    stacklevel=2,
+                )
                 log_replay_instructions(self.log_file)
-                raise
 
         return True
 
+    @property
+    def is_simulated(self) -> bool:
+        """Whether this context's report is a simulated one.
+
+        Read-only passthrough of ``self.report.is_simulated``; the meaning of
+        "simulated" is defined on ``TestReport.is_simulated``.
+        """
+        return self.report.is_simulated
+
+    @property
+    def step_status_counts(self) -> Counter[TestStatus]:
+        """Count created steps by current status (parent steps included).
+
+        Statuses are read at call time, so late in-place updates are reflected.
+        """
+        statuses = [created.status for created in self.created_steps]
+        return Counter(statuses)
+
+    @property
+    def measurement_counts(self) -> Counter[bool]:
+        """Count recorded measurements by ``passed`` value, for end-of-run summaries."""
+        outcomes: Counter[bool] = Counter()
+        for measurement in self.created_measurements:
+            outcomes[measurement.passed] += 1
+        return outcomes
+
     def new_step(
         self,
         name: str,
         description: str | None = None,
         assertion_as_fail_not_error: bool = True,
         metadata: dict[str, str | float | bool] | None = None,
+        *,
+        parent: TestStep | None | object = _USE_STACK_TOP,
+        push: bool = True,
     ) -> NewStep:
-        """Alias to return a new step context manager from this report context. Use create_step for actually creating a TestStep in the current context."""
+        """Alias to return a new step context manager from this report context. Use create_step for actually creating a TestStep in the current context.
+
+        ``parent`` and ``push`` default to the linear, stack-based behavior used
+        by everyday callers. The pytest plugin passes an explicit ``parent`` with
+        ``push=False`` to open report-tree parents that persist outside the stack;
+        see :meth:`create_step`.
+        """
         return NewStep(
             self,
             name=name,
             description=description,
             assertion_as_fail_not_error=assertion_as_fail_not_error,
             metadata=metadata,
+            parent=parent,
+            push=push,
         )
 
-    def get_next_step_path(self) -> str:
-        """Get the next step path for the current depth."""
-        top_step = self.step_stack[-1] if self.step_stack else None
-        step_path = top_step.step_path if top_step else ""
-        next_step_number = self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        prefix = f"{step_path}." if step_path else ""
-        return f"{prefix}{next_step_number}"
+    def _resolve_parent(self, parent: TestStep | None | object) -> TestStep | None:
+        """Map ``parent`` to a real step: the sentinel means stack top, None means root."""
+        if parent is not _USE_STACK_TOP:
+            return parent  # type: ignore[return-value]
+        return self.step_stack[-1] if self.step_stack else None
+
+    def get_next_step_path(self, parent: TestStep | None | object = _USE_STACK_TOP) -> str:
+        """Compute, without mutating state, the path the next child of ``parent`` gets.
+
+        Paths are parent-relative: ``<parent path>.<n>`` under a step, bare ``<n>``
+        at the root, where ``n`` is one past the parent's current child count.
+        Omitting ``parent`` targets the current top of the step stack.
+        """
+        owner = self._resolve_parent(parent)
+        prefix = owner.step_path if owner else ""
+        ordinal = self.child_counts.get(prefix, 0) + 1
+        return ".".join(filter(None, (prefix, str(ordinal))))
 
     def create_step(
         self,
         name: str,
         description: str | None = None,
         metadata: dict[str, str | float | bool] | None = None,
+        *,
+        parent: TestStep | None | object = _USE_STACK_TOP,
+        push: bool = True,
     ) -> TestStep:
         """Create a new step in the report context.
 
@@ -248,12 +442,23 @@ def create_step(
             metadata: [Optional] Structured key/value metadata to attach to the step. For
                 metadata shared across every step in a report, prefer the `metadata` attribute
                 of the enclosing `TestReport`.
+            parent: The parent step to nest under. ``_USE_STACK_TOP`` (the
+                default) parents to the current top of the step stack — the
+                linear behavior. An explicit ``TestStep`` parents under that step
+                regardless of stack state; explicit ``None`` creates a root step.
+            push: Whether to push the new step onto the step stack. True (the
+                default) for leaf/in-test steps so their substeps nest under
+                them. The pytest plugin passes False for hierarchy/parametrize
+                parents, which live in its own registry and would otherwise
+                trap unrelated steps beneath them.
 
         Returns:
             The created step.
         """
-        step_path = self.get_next_step_path()
-        parent_step = self.step_stack[-1] if self.step_stack else None
+        parent_step = self._resolve_parent(parent)
+        parent_path = parent_step.step_path if parent_step else ""
+        next_number = self.child_counts.get(parent_path, 0) + 1
+        step_path = f"{parent_path}.{next_number}" if parent_path else str(next_number)
 
         step = self.client.test_results.create_step(
             TestStepCreate(
@@ -272,11 +477,12 @@ def create_step(
         )
 
         # Update the step tracking structures.
-        self.step_number_at_depth[len(self.step_stack)] = (
-            self.step_number_at_depth.get(len(self.step_stack), 0) + 1
-        )
-        self.step_stack.append(step)
+        self.child_counts[parent_path] = next_number
+        if push:
+            self.step_stack.append(step)
         self.open_step_results[step.step_path] = True
+        # Retained for end-of-run tallies; never popped (unlike step_stack).
+        self.created_steps.append(step)
 
         return step
 
@@ -287,40 +493,73 @@ def record_step_outcome(self, outcome: bool, step: TestStep):
             self.open_step_results[step.step_path] = False
             self.any_failures = True
 
-    def resolve_and_propagate_step_result(
-        self,
-        step: TestStep,
-        error_info: ErrorInfo | None = None,
-    ) -> bool:
-        """Resolve the result of a step and propagate the result to the parent step if it failed."""
-        result = self.open_step_results.get(step.step_path, True)
-        if error_info:
-            result = False
-        if step.status != TestStatus.IN_PROGRESS:
-            # The step was manually completed so use that result.
-            # Skipped steps are considered passed.
-            result = step.status in (TestStatus.PASSED, TestStatus.SKIPPED)
-
-        # Update the parent step results if this step failed (true by default so no need to do anything if we didn't fail).
-        if not result:
+    def record_measurement(self, measurement: TestMeasurement) -> None:
+        """Append ``measurement`` to ``created_measurements`` for end-of-run summaries."""
+        self.created_measurements.append(measurement)
+
+    def mark_step_failed_after_close(self, step: TestStep):
+        """Flag the report, and the parent of ``step`` if it has one, as failed.
+
+        For failures discovered once ``step`` itself is already closed (the pytest
+        plugin's teardown-phase reports); root-level steps only set ``any_failures``.
+        """
+        self.any_failures = True
+        parent_path, separator, _ = step.step_path.rpartition(".")
+        if separator:
+            self.open_step_results[parent_path] = False
+
+    def propagate_step_result(self, step: TestStep, status: TestStatus) -> bool:
+        """Propagate this step's final status to the parent step.
+
+        Status is the governor: anything outside ``{PASSED, SKIPPED}`` counts
+        as a failure for the parent. ``error_info`` is intentionally not
+        consulted here; it is free-form diagnostic data that may sit on a
+        step regardless of status.
+        """
+        succeeded = status in (TestStatus.PASSED, TestStatus.SKIPPED)
+        if not succeeded:
             self.any_failures = True
             self.open_step_results[step.step_path] = False
             path_parts = step.step_path.split(".")
             if len(path_parts) > 1:
-                parent_step_path = ".".join(path_parts[:-1])
-                self.open_step_results[parent_step_path] = False
+                self.open_step_results[".".join(path_parts[:-1])] = False
+        return succeeded
 
-        return result
+    def note_close(self, step: TestStep) -> None:
+        """Record a just-closed step's ``end_time`` against its parent.
 
-    def exit_step(self, step: TestStep):
-        """Exit a step and update the report context."""
-        self.step_number_at_depth[len(self.step_stack)] = 0
-        stack_top = self.step_stack.pop()
-        self.open_step_results.pop(step.step_path)
+        Lets a long-lived parent (one closed later, out of band) adopt the finish
+        time of its latest child instead of wall-clock at its own close. Keyed by
+        the parent's ``step_path`` (the child path minus its last segment).
+        """
+        end_time = step.end_time
+        if end_time is None:
+            return
+        path_parts = step.step_path.split(".")
+        if len(path_parts) <= 1:
+            return
+        parent_path = ".".join(path_parts[:-1])
+        previous = self.parent_end_times.get(parent_path)
+        if previous is None or end_time > previous:
+            self.parent_end_times[parent_path] = end_time
 
-        if stack_top.id_ != step.id_:
+    def exit_step(self, step: TestStep):
+        """Exit a step and update the report context.
+
+        Stacked steps (leaves and their in-test substeps) close in strict LIFO
+        order, so a step that isn't the current top of the stack is a real
+        invariant break. Steps created with an explicit parent and ``push=False``
+        (the pytest plugin's hierarchy/parametrize parents) never sit on the
+        stack and may close in any order — clearing ``open_step_results`` is all
+        that's needed; their result was already propagated to their own parent.
+        """
+        self.open_step_results.pop(step.step_path, None)
+        if self.step_stack and self.step_stack[-1].id_ == step.id_:
+            self.step_stack.pop()
+            return
+        if any(s.id_ == step.id_ for s in self.step_stack):
             raise ValueError(
-                "The popped step was not the top of the stack. This should never happen."
+                "exit_step called out of LIFO order for a stacked step. This should never happen."
             )
 
 
@@ -331,6 +570,13 @@ class NewStep(AbstractContextManager):
     client: SiftClient
     assertion_as_fail_not_error: bool = True
     current_step: TestStep | None = None
+    # Set by the pytest plugin's ``_resolve_initial_status`` to signal that
+    # status was already resolved upstream and ``__exit__`` should skip
+    # re-classifying. Read via ``getattr`` so unset is treated as False.
+    _sift_managed_externally: bool = False
+    # Set by the pytest plugin when finalizing a long-lived parent so ``__exit__``
+    # stamps its last-descendant finish time instead of wall-clock at close.
+    _sift_end_time_override: datetime | None = None
 
     def __init__(
         self,
@@ -339,6 +585,9 @@ def __init__(
         description: str | None = None,
         assertion_as_fail_not_error: bool = True,
         metadata: dict[str, str | float | bool] | None = None,
+        *,
+        parent: TestStep | None | object = _USE_STACK_TOP,
+        push: bool = True,
     ):
         """Initialize a new step context.
 
@@ -348,11 +597,23 @@ def __init__(
             description: The description of the step.
             assertion_as_fail_not_error: Mark steps with assertion errors as failed instead of error+traceback (some users want assertions to work as simple failures especially when using pytest).
             metadata: [Optional] Structured key/value metadata to attach to the step.
+            parent: Parent step to nest under; see :meth:`ReportContext.create_step`.
+            push: Whether the step joins the step stack; see :meth:`ReportContext.create_step`.
         """
         self.report_context = report_context
         self.client = report_context.client
-        self.current_step = self.report_context.create_step(name, description, metadata=metadata)
+        self.current_step = self.report_context.create_step(
+            name, description, metadata=metadata, parent=parent, push=push
+        )
         self.assertion_as_fail_not_error = assertion_as_fail_not_error
+        # Per-step measurement-failure count for ``measurements_passed``.
+        # Tracks only direct ``measure*`` calls on this NewStep instance;
+        # substep / ``report_outcome`` failures are intentionally not folded
+        # in here. ``pytest_fail_if_step_failed`` covers the broader case.
+        self._failed_measurement_count = 0
+        # Out-of-bounds measurements recorded on this step, retained so
+        # ``pytest_fail_if_step_failed`` can name them in the failure message.
+        self._failed_measurements: list[TestMeasurement] = []
 
     def __enter__(self):
         """Enter the context manager to create a new step.
@@ -361,11 +622,62 @@ def __enter__(self):
         """
         return self
 
+    @property
+    def measurements_passed(self) -> bool:
+        """True if every measurement recorded directly on this step has passed.
+
+        Counts only ``step.measure``, ``step.measure_avg``, and
+        ``step.measure_all`` calls on this ``NewStep`` instance; substep and
+        ``report_outcome`` failures are not folded in. For the end-of-test
+        failure that mirrors the report, use ``pytest_fail_if_step_failed()``,
+        which also covers failed substeps.
+        """
+        return self._failed_measurement_count == 0
+
+    def pytest_fail_if_step_failed(self, message: str = "step failed") -> None:
+        """Fail the running pytest test if this step or any descendant failed.
+
+        Covers every signal that resolves the step to FAILED in the report:
+        out-of-bounds measurements recorded directly on the step, failed
+        substeps, and ``report_outcome`` failures. Call it once at the end of a
+        test so the pytest verdict matches the report instead of passing green
+        while the report shows a failure.
+
+        It fails via ``pytest.fail(pytrace=False)`` so the step resolves to
+        FAILED without an assertion traceback in ``error_info``. No-op when the
+        step and all of its descendants passed. Call after the work is done so
+        every measurement and substep is recorded before the failure fires.
+
+        The failure message lists each out-of-bounds measurement and failed
+        substep under a header of ``message`` plus the detail count, if any.
+        """
+        step = self.current_step
+        # ``open_step_results[step_path]`` is the same signal ``__exit__`` reads
+        # to resolve status: it is flipped False by a direct measurement failure
+        # (record_step_outcome) and by any failed child as it propagates upward
+        # (propagate_step_result). Default True covers a missing entry (never opened or already closed).
+        if step is None or self.report_context.open_step_results.get(step.step_path, True):
+            return
+        import pytest
+
+        prefix = f"{step.step_path}."
+        failed_substeps = [
+            s
+            for s in self.report_context.created_steps
+            if s.step_path.startswith(prefix)
+            and s.status not in (TestStatus.PASSED, TestStatus.SKIPPED, TestStatus.IN_PROGRESS)
+        ]
+        details = [f"  - measurement {m}" for m in self._failed_measurements]
+        details += [f"  - substep {s.step_path!r}: {s.status.name}" for s in failed_substeps]
+        header = f"{message} ({len(details)}):" if details else message
+        pytest.fail("\n".join([header, *details]), pytrace=False)
+
     def update_step_from_result(
         self,
         exc: type[Exception] | None,
         exc_value: Exception | None,
         tb: traceback.TracebackException | None,
+        end_time: datetime | None = None,
     ) -> bool:
         """Update the step based on its substeps and if there was an exception while executing the step.
 
@@ -373,48 +685,103 @@ def update_step_from_result(
             exc: The class of Exception that was raised.
             exc_value: The exception value.
             tb: The traceback object.
+            end_time: Explicit end_time to stamp. Defaults to now(); the pytest
+                plugin passes the last-child finish time when closing a long-lived
+                parent so its duration reflects its subtree rather than its own
+                late close.
 
         returns: The false if step failed or errored, true otherwise.
         """
+        current_step = self.current_step
+        if current_step is None:
+            # The step was never opened; nothing to resolve. Treat as a pass
+            # so callers that branch on the return value don't see a spurious
+            # failure.
+            return True
+
         error_info = None
-        assert self.current_step is not None
+        aborted = False
+        errored = False
         if exc:
             if isinstance(exc_value, AssertionError) and not self.assertion_as_fail_not_error:
-                # If we're not showing assertion errors (i.e. pytest), mark step as failed but don't set error info.
-                self.report_context.record_step_outcome(False, self.current_step)
+                # pytest-style: an assertion is a plain failure, not an error. Record the
+                # failure and attach the concise assertion message (no traceback) so the
+                # UI can show what was asserted.
+                self.report_context.record_step_outcome(False, current_step)
+                error_info = format_assertion_message(exc, exc_value)
+            elif isinstance(exc_value, (KeyboardInterrupt, SystemExit)):
+                # Hard exit propagating through the substep stack: record as
+                # ABORTED so every in-progress step on the way out reflects
+                # the abort rather than coercing to ERROR.
+                aborted = True
+                error_info = format_truncated_traceback(exc, exc_value, tb)
             else:
-                stack = traceback.format_exception(exc, exc_value, tb)  # type: ignore
-                stack = [stack[0], *stack[-10:]] if len(stack) > 10 else stack
-                trace = "".join(stack)
-                error_info = ErrorInfo(
-                    error_code=1,
-                    error_message=trace,
-                )
-
-        # Resolve the status of this step (i.e. fail if children failed) and propagate the result to the parent step.
-        result = self.report_context.resolve_and_propagate_step_result(
-            self.current_step, error_info
-        )
-
-        # Mark the step as completed
-        status = self.current_step.status
+                errored = True
+                error_info = format_truncated_traceback(exc, exc_value, tb)
+
+        # Status is the governor: anything other than IN_PROGRESS was set
+        # deliberately (manual override, plugin pre-resolution, etc.) and must
+        # not be silently overwritten by side-channel signals. When the step is
+        # still IN_PROGRESS, resolve from independent state: aborts first, then
+        # a child-failed signal (parents inherit FAILED, not the originating
+        # ERROR), then the step's own captured exception, then the children-pass
+        # default. error_info is diagnostic and never drives status.
+        status = current_step.status
         if status == TestStatus.IN_PROGRESS:
-            # Update the status only if the step was in progress i.e. not updated elsewhere.
-            status = TestStatus.PASSED if result else TestStatus.FAILED
-        if error_info:
-            status = TestStatus.ERROR
-        self.current_step.update(
+            children_passed = self.report_context.open_step_results.get(
+                current_step.step_path, True
+            )
+            if aborted:
+                status = TestStatus.ABORTED
+            elif not children_passed:
+                status = TestStatus.FAILED
+            elif errored:
+                status = TestStatus.ERROR
+            else:
+                status = TestStatus.PASSED
+
+        # Propagate based on the resolved status; error_info rides along as
+        # pure diagnostics and does not affect propagation.
+        result = self.report_context.propagate_step_result(current_step, status)
+        current_step.update(
             {
                 "status": status,
-                "end_time": datetime.now(timezone.utc),
+                "end_time": end_time if end_time is not None else datetime.now(timezone.utc),
                 "error_info": error_info,
             },
         )
+        self.report_context.note_close(current_step)
 
         return result
 
     def __exit__(self, exc, exc_value, tb):
-        result = self.update_step_from_result(exc, exc_value, tb)
+        if getattr(self, "_sift_managed_externally", False):
+            # The pytest fixture already resolved status from phase reports.
+            # Propagate based on that resolved status, emit one update_step
+            # with the resolved values, and pop from the stack without
+            # re-classifying.
+            current_step = self.current_step
+            if current_step is None:
+                # The step was never opened; nothing to propagate.
+                return True
+            override = getattr(self, "_sift_end_time_override", None)
+            result = self.report_context.propagate_step_result(current_step, current_step.status)
+            current_step.update(
+                {
+                    "status": current_step.status,
+                    "end_time": override if override is not None else datetime.now(timezone.utc),
+                    "error_info": current_step.error_info,
+                },
+            )
+            self.report_context.note_close(current_step)
+            self.report_context.exit_step(current_step)
+            if hasattr(self, "force_result"):
+                result = self.force_result
+            return result
+
+        result = self.update_step_from_result(
+            exc, exc_value, tb, end_time=getattr(self, "_sift_end_time_override", None)
+        )
 
         # Now that the step is updated. Let the report context handle removing it from the stack and updating the report context.
         self.report_context.exit_step(self.current_step)
@@ -473,6 +840,10 @@ def measure(
             create, log_file=self.report_context.log_file
         )
         self.report_context.record_step_outcome(measurement.passed, self.current_step)
+        self.report_context.record_measurement(measurement)
+        if not measurement.passed:
+            self._failed_measurement_count += 1
+            self._failed_measurements.append(measurement)
 
         return measurement.passed
 
@@ -505,15 +876,7 @@ def measure_avg(
         returns: The true if the average of the values is within the bounds, false otherwise.
         """
         timestamp = timestamp if timestamp else datetime.now(timezone.utc)
-        np_array = None
-        if isinstance(values, list):
-            np_array = np.array(values)
-        elif isinstance(values, np.ndarray):
-            np_array = values
-        elif isinstance(values, pd.Series):
-            np_array = values.to_numpy()
-        else:
-            raise ValueError(f"Invalid value type: {type(values)}")
+        np_array = to_numpy_array(values)
         avg = float(np.mean(np_array))
         result = self.measure(
             name=name,
@@ -561,31 +924,8 @@ def measure_all(
         returns: The true if all values are within the bounds, false otherwise.
         """
         timestamp = timestamp if timestamp else datetime.now(timezone.utc)
-        np_array = None
-        if isinstance(values, list):
-            np_array = np.array(values)
-        elif isinstance(values, np.ndarray):
-            np_array = values
-        elif isinstance(values, pd.Series):
-            np_array = values.to_numpy()
-        else:
-            raise ValueError(f"Invalid value type: {type(values)}")
-
-        numeric_bounds = bounds
-        if isinstance(numeric_bounds, dict):
-            numeric_bounds = NumericBounds(min=bounds.get("min"), max=bounds.get("max"))  # type: ignore
-
-        # Construct a mask of the values that are outside the bounds.
-        mask = None
-        if numeric_bounds.min is not None:
-            mask = np_array < numeric_bounds.min
-        if numeric_bounds.max is not None:
-            val_above_max = np_array > numeric_bounds.max
-            mask = mask | val_above_max if mask is not None else val_above_max
-        if mask is None:
-            raise ValueError("No bounds provided")
-
-        rows_outside_bounds = np_array[mask]
+        np_array = to_numpy_array(values)
+        rows_outside_bounds = np_array[out_of_bounds_mask(np_array, bounds)]
         for row in rows_outside_bounds:
             self.measure(
                 name=name,
diff --git a/python/lib/sift_client/util/test_results/pytest_util.py b/python/lib/sift_client/util/test_results/pytest_util.py
deleted file mode 100644
index a96a47fb3..000000000
--- a/python/lib/sift_client/util/test_results/pytest_util.py
+++ /dev/null
@@ -1,206 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator
-
-import pytest
-
-from sift_client.sift_types.test_report import TestStatus
-from sift_client.util.test_results import ReportContext
-
-if TYPE_CHECKING:
-    from sift_client.client import SiftClient
-    from sift_client.util.test_results.context_manager import NewStep
-
-REPORT_CONTEXT: ReportContext | None = None
-
-
-def pytest_addoption(parser: pytest.Parser) -> None:
-    """Register Sift-specific command-line options."""
-    parser.addoption(
-        "--sift-test-results-log-file",
-        default=None,
-        help="Path to write the Sift test result log file. "
-        "Use 'true' (default) to auto-create a temp file, "
-        "False, 'false', or 'none' to disable logging, "
-        "or a file path to write to a specific location.",
-    )
-    parser.addoption(
-        "--no-sift-test-results-git-metadata",
-        action="store_false",
-        dest="sift_test_results_git_metadata",
-        default=True,
-        help="Exclude git metadata from the Sift test results. "
-        "Git metadata (repo, branch, commit) is included by default.",
-    )
-    parser.addoption(
-        "--sift-test-results-check-connection",
-        action="store_true",
-        default=False,
-        help="Skip the sift test-result fixtures (report_context, step, module_substep) "
-        "when the Sift client has no connection to the server. Requires a "
-        "`client_has_connection` fixture to be available in the test session.",
-    )
-
-
-def _resolve_log_file(pytestconfig: pytest.Config | None) -> str | Path | bool | None:
-    """Determine log_file value from --sift-test-results-log-file option."""
-    raw = None
-    if pytestconfig is not None:
-        raw = pytestconfig.getoption("--sift-test-results-log-file", default=None)
-    if raw is None:
-        return True
-    lower = str(raw).lower()
-    if lower in ("true", "1"):
-        return True
-    if lower in ("false", "none"):
-        return None
-    return Path(raw)
-
-
-@pytest.hookimpl(tryfirst=True, hookwrapper=True)
-def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo[Any]):
-    """You should import this hook to capture any AssertionErrors that occur during the test. If not included, any assert failures in a test will not automatically fail the step."""
-    outcome = yield
-    report = outcome.get_result()
-    if report.outcome == "skipped":
-        # Skipped steps won't invoke the method/fixtures at all, so we need to manually record a step.
-        if REPORT_CONTEXT:
-            with REPORT_CONTEXT.new_step(name=item.name) as new_step:
-                new_step.current_step.update({"status": TestStatus.SKIPPED})
-    setattr(item, "rep_" + report.when, call)
-
-
-def _report_context_impl(
-    sift_client: SiftClient,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config | None = None,
-) -> Generator[ReportContext | None, None, None]:
-    args = request.config.invocation_params.args
-    test_path = Path(args[0]) if args else None
-    if test_path is not None and test_path.exists():
-        base_name = test_path.name
-        test_case: Path | str = test_path
-    else:
-        base_name = "pytest " + " ".join(args) if args else "pytest"
-        test_case = base_name
-    log_file = _resolve_log_file(pytestconfig)
-    include_git_metadata = (
-        bool(pytestconfig.getoption("sift_test_results_git_metadata", default=True))
-        if pytestconfig
-        else True
-    )
-    with ReportContext(
-        sift_client,
-        name=f"{base_name} {datetime.now(timezone.utc).isoformat()}",
-        test_case=str(test_case),
-        log_file=log_file,
-        include_git_metadata=include_git_metadata,
-    ) as context:
-        # Set a global so we can access this in pytest hooks.
-        global REPORT_CONTEXT
-        REPORT_CONTEXT = context
-        yield context
-
-
-def _check_connection_enabled(pytestconfig: pytest.Config | None) -> bool:
-    """Return True when the caller opted into `--sift-test-results-check-connection`."""
-    if pytestconfig is None:
-        return False
-    return bool(pytestconfig.getoption("sift_test_results_check_connection", default=False))
-
-
-def _has_sift_connection(request: pytest.FixtureRequest) -> bool:
-    """Resolve the `client_has_connection` fixture lazily; only called when the check is enabled."""
-    return bool(request.getfixturevalue("client_has_connection"))
-
-
-@pytest.fixture(scope="session", autouse=True)
-def report_context(
-    sift_client: SiftClient, request: pytest.FixtureRequest, pytestconfig: pytest.Config
-) -> Generator[ReportContext | None, None, None]:
-    """Create a report context for the session.
-
-    The log file destination is controlled by ``--sift-test-results-log-file``.
-    Defaults to a temp file when not set.
-
-    When ``--sift-test-results-check-connection`` is passed, this fixture will no-op
-    (yield None) if the Sift client has no connection to the server. That mode
-    requires a ``client_has_connection`` fixture to be available in the session.
-    """
-    if _check_connection_enabled(pytestconfig) and not _has_sift_connection(request):
-        yield None
-        return
-    yield from _report_context_impl(sift_client, request, pytestconfig=pytestconfig)
-
-
-def _step_impl(
-    report_context: ReportContext, request: pytest.FixtureRequest
-) -> Generator[NewStep | None, None, None]:
-    name = str(request.node.name)
-    existing_docstring = request.node.obj.__doc__ or None
-    with report_context.new_step(
-        name=name, description=existing_docstring, assertion_as_fail_not_error=False
-    ) as new_step:
-        yield new_step
-        if hasattr(request.node, "rep_call") and request.node.rep_call.excinfo:
-            new_step.update_step_from_result(
-                request.node.rep_call.excinfo,
-                request.node.rep_call.excinfo.value,
-                request.node.rep_call.excinfo.tb,
-            )
-
-
-@pytest.fixture(autouse=True)
-def step(
-    report_context: ReportContext | None,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create an outer step for the function.
-
-    No-ops when ``--sift-test-results-check-connection`` is set and the client
-    has no connection (or when the session-scoped ``report_context`` resolved to None).
-    """
-    if report_context is None or (
-        _check_connection_enabled(pytestconfig) and not _has_sift_connection(request)
-    ):
-        yield None
-        return
-    yield from _step_impl(report_context, request)
-
-
-@pytest.fixture(scope="module", autouse=True)
-def module_substep(
-    report_context: ReportContext | None,
-    request: pytest.FixtureRequest,
-    pytestconfig: pytest.Config,
-) -> Generator[NewStep | None, None, None]:
-    """Create a step per module.
-
-    No-ops when ``--sift-test-results-check-connection`` is set and the client
-    has no connection (or when the session-scoped ``report_context`` resolved to None).
-    """
-    if report_context is None or (
-        _check_connection_enabled(pytestconfig) and not _has_sift_connection(request)
-    ):
-        yield None
-        return
-    yield from _step_impl(report_context, request)
-
-
-@pytest.fixture(scope="session")
-def client_has_connection(sift_client):
-    """Check if the SiftClient has a connection to the Sift server.
-
-    Can be used to skip tests that require a connection to the Sift server, and is
-    consulted by the Sift fixtures when ``--sift-test-results-check-connection`` is set.
-    """
-    has_connection = False
-    try:
-        sift_client.ping.ping()
-        has_connection = True
-    except Exception:
-        has_connection = False
-    return has_connection
diff --git a/python/mkdocs.yml b/python/mkdocs.yml
index 90bfd10ed..5a9c73e82 100644
--- a/python/mkdocs.yml
+++ b/python/mkdocs.yml
@@ -51,6 +51,10 @@ extra:
     provider: mike
     alias: true
 
+# Kept out of the nav but still built so the old URL redirects to the guide.
+not_in_nav: |
+  /examples/pytest_plugin.md
+
 nav:
   - Home: index.md
   - Sift Client API
@@ -59,7 +63,15 @@ nav:
       - examples/index.md
       - Basic Usage: examples/basic.ipynb
       - Data Ingestion: examples/ingestion.ipynb
-      - Pytest Plugin: examples/pytest_plugin.md
+      - Pytest Plugin Quickstart: examples/pytest_plugin_quickstart.md
+  - Guides:
+      - guides/index.md
+      - Pytest Plugin:
+          - Overview: guides/pytest_plugin/index.md
+          - Configuration & Defaults: guides/pytest_plugin/configuration.md
+          - Running Modes: guides/pytest_plugin/running_modes.md
+          - Report Structure: guides/pytest_plugin/report_structure.md
+          - Pass/Fail Behavior: guides/pytest_plugin/pass_fail_behavior.md
 #  - Guides:
 #      - Logging
 #      - Error Handling
diff --git a/python/pyproject.toml b/python/pyproject.toml
index ddb4c7d80..61b2b03d2 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sift_stack_py"
-version = "0.16.2"
+version = "0.17.0.dev2"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -45,6 +45,7 @@ dependencies = [
     "googleapis-common-protos>=1.60",
     "protoc-gen-openapiv2>=0.0.1",
     "filelock~=3.13",
+    'tomli~=2.0; python_version < "3.11"',
 ]
 
 [project.urls]
@@ -84,6 +85,7 @@ dev = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     'ruff~=0.12.10',
     'tomlkit~=0.13.3',
@@ -104,6 +106,7 @@ dev-all = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     "rosbags~=0.0 ; python_full_version >= '3.8.2'",
     'ruff~=0.12.10',
@@ -119,6 +122,7 @@ development = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     'ruff~=0.12.10',
     'tomlkit~=0.13.3',
@@ -157,6 +161,7 @@ docs-build = [
     'pytest-benchmark==4.0.0',
     'pytest-dotenv==0.5.2',
     'pytest-mock==3.14.0',
+    'pytest-randomly==3.15.0',
     'pytest==8.2.2',
     "rosbags~=0.0 ; python_full_version >= '3.8.2'",
     'ruff~=0.12.10',
@@ -205,6 +210,9 @@ development = [
     "pytest-benchmark==4.0.0",
     "pytest-mock==3.14.0",
     "pytest-dotenv==0.5.2",
+    # 3.15.0 is the last release supporting Python 3.8; pinned (rather than 4.x,
+    # which needs 3.10+) so randomization is active on the 3.8 CI test job too.
+    "pytest-randomly==3.15.0",
     "ruff~=0.12.10",
     "tomlkit~=0.13.3"
 ]
@@ -409,6 +417,21 @@ select = [
 env_files = [
     ".env"
 ]
+# `pytester` is registered globally because pytest 8+ disallows `pytest_plugins`
+# in non-top-level conftests. Only the plugin test suite uses it; activating it
+# globally is harmless since the fixture is opt-in.
+# The Sift pytest plugin is loaded so the project's own integration tests can
+# use its fixtures. Unit-test runs are flipped to `--sift-disabled` mode by
+# `lib/sift_client/_tests/conftest.py`.
+# `--import-mode=importlib` loads test files by path with unique synthetic
+# module names. The default `prepend` mode would try to import
+# `lib/sift_client/_tests/conftest.py` as `sift_client._tests.conftest`, which
+# fails because `_tests` is excluded from the wheel (see packages.find above).
+addopts = "-p pytester -p sift_client.pytest_plugin --import-mode=importlib"
+# The autouse gate defaults to off so unit tests don't use the Sift
+# fixtures. The integration subtree (lib/sift_client/_tests/util/) opts
+# back in via `pytest.mark.sift_include` applied in its conftest.
+sift_autouse = false
 testpaths = [
     "lib/sift_py",
     "lib/sift_client/_tests",
diff --git a/python/scripts/dev b/python/scripts/dev
index 510d66d95..ce572dba4 100755
--- a/python/scripts/dev
+++ b/python/scripts/dev
@@ -220,4 +220,5 @@ case "$1" in
         ;;
 esac
 
-exit 0
+# Leave the script's exit code as the subcommand's. A trailing `exit 0` here
+# silently masked ruff / mypy / pytest failures from the pre-push hook.
diff --git a/python/uv.lock b/python/uv.lock
index 038a7ce09..dc463b99b 100644
--- a/python/uv.lock
+++ b/python/uv.lock
@@ -3615,6 +3615,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f2/3b/b26f90f74e2986a82df6e7ac7e319b8ea7ccece1caec9f8ab6104dc70603/pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f", size = 9863, upload-time = "2024-03-21T22:14:02.694Z" },
 ]
 
+[[package]]
+name = "pytest-randomly"
+version = "3.15.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "importlib-metadata", version = "8.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
+    { name = "importlib-metadata", version = "8.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/d4/6e924a0b2855736d942703dec88dfc98b4fe0881c8fa849b6b0fbb9182fa/pytest_randomly-3.15.0.tar.gz", hash = "sha256:b908529648667ba5e54723088edd6f82252f540cc340d748d1fa985539687047", size = 21743, upload-time = "2023-08-15T18:04:59.857Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/24/d3/00e575657422055c4ea220b2f80e8cc6026ab7130372b7067444d1b0ac10/pytest_randomly-3.15.0-py3-none-any.whl", hash = "sha256:0516f4344b29f4e9cdae8bce31c4aeebf59d0b9ef05927c33354ff3859eeeca6", size = 8685, upload-time = "2023-08-15T18:04:57.913Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -4315,7 +4329,7 @@ wheels = [
 
 [[package]]
 name = "sift-stack-py"
-version = "0.16.2"
+version = "0.17.0.dev2"
 source = { editable = "." }
 dependencies = [
     { name = "alive-progress", version = "3.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
@@ -4348,6 +4362,7 @@ dependencies = [
     { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" },
     { name = "requests", version = "2.34.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "requests-toolbelt" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
     { name = "types-protobuf", version = "5.29.1.20241207", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
     { name = "types-protobuf", version = "6.32.1.20251210", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" },
     { name = "types-protobuf", version = "7.34.1.20260518", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
@@ -4399,6 +4414,7 @@ dev = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "ruff" },
     { name = "tomlkit" },
 ]
@@ -4426,6 +4442,7 @@ dev-all = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "rosbags", version = "0.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.8.2' and python_full_version < '3.10'" },
     { name = "rosbags", version = "0.11.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "ruff" },
@@ -4443,6 +4460,7 @@ development = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "ruff" },
     { name = "tomlkit" },
 ]
@@ -4488,6 +4506,7 @@ docs-build = [
     { name = "pytest-benchmark" },
     { name = "pytest-dotenv" },
     { name = "pytest-mock" },
+    { name = "pytest-randomly" },
     { name = "rosbags", version = "0.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.8.2' and python_full_version < '3.10'" },
     { name = "rosbags", version = "0.11.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "ruff" },
@@ -4627,6 +4646,10 @@ requires-dist = [
     { name = "pytest-mock", marker = "extra == 'dev-all'", specifier = "==3.14.0" },
     { name = "pytest-mock", marker = "extra == 'development'", specifier = "==3.14.0" },
     { name = "pytest-mock", marker = "extra == 'docs-build'", specifier = "==3.14.0" },
+    { name = "pytest-randomly", marker = "extra == 'dev'", specifier = "==3.15.0" },
+    { name = "pytest-randomly", marker = "extra == 'dev-all'", specifier = "==3.15.0" },
+    { name = "pytest-randomly", marker = "extra == 'development'", specifier = "==3.15.0" },
+    { name = "pytest-randomly", marker = "extra == 'docs-build'", specifier = "==3.15.0" },
     { name = "pyyaml", specifier = "~=6.0" },
     { name = "rapidyaml", specifier = "~=0.11" },
     { name = "requests", specifier = "~=2.25" },
@@ -4645,6 +4668,7 @@ requires-dist = [
     { name = "sift-stream-bindings", marker = "extra == 'docs-build'", specifier = "==0.3.0" },
     { name = "sift-stream-bindings", marker = "extra == 'sift-stream'", specifier = "==0.3.0" },
     { name = "sift-stream-bindings", marker = "extra == 'sift-stream-bindings'", specifier = "==0.3.0" },
+    { name = "tomli", marker = "python_full_version < '3.11'", specifier = "~=2.0" },
     { name = "tomlkit", marker = "extra == 'dev'", specifier = "~=0.13.3" },
     { name = "tomlkit", marker = "extra == 'dev-all'", specifier = "~=0.13.3" },
     { name = "tomlkit", marker = "extra == 'development'", specifier = "~=0.13.3" },