parameterlab · cemde · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -26,8 +26,14 @@ jobs:
           pip install uv
           uv sync --group dev
       - name: Run core tests
-        run: |
-          uv run pytest -m core -v
+        run: uv run coverage run --parallel-mode -m pytest -m core -v
+      - name: Upload coverage data
+        if: matrix.python-version == '3.12'
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-core
+          path: .coverage.*
+          include-hidden-files: true
 
   test-benchmark:
     name: Benchmark Tests
@@ -47,12 +53,17 @@ jobs:
           pip install uv
           uv sync --all-extras --group dev
       - name: Run benchmark tests
-        run: |
-          uv run pytest -m "benchmark and not (slow or live)" -v
+        run: uv run coverage run --parallel-mode -m pytest -m "benchmark and not (slow or live)" -v
+      - name: Upload coverage data
+        if: matrix.python-version == '3.12'
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-benchmark
+          path: .coverage.*
+          include-hidden-files: true
 
-  test-all:
-    name: All Tests (With Optional Deps)
-    needs: [test-core, test-benchmark]
+  test-core-optional:
+    name: Core Tests (With Optional Deps)
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -68,9 +79,42 @@ jobs:
         run: |
           pip install uv
           uv sync --all-extras --group dev
-      - name: Run all tests
+      - name: Run core tests with optional deps
+        run: uv run coverage run --parallel-mode -m pytest -m core -v
+      - name: Upload coverage data
+        if: matrix.python-version == '3.12'
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-core-optional
+          path: .coverage.*
+          include-hidden-files: true
+
+  test-interface:
+    name: Interface Tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install all dependencies
         run: |
-          uv run pytest -v
+          pip install uv
+          uv sync --all-extras --group dev
+      - name: Run interface tests
+        run: uv run coverage run --parallel-mode -m pytest -m interface -v
+      - name: Upload coverage data
+        if: matrix.python-version == '3.12'
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-interface
+          path: .coverage.*
+          include-hidden-files: true
 
   test-slow:
     name: Slow Tests (Data Downloads + Integrity)
@@ -86,9 +130,14 @@ jobs:
         run: |
           pip install uv
           uv sync --all-extras --group dev
-      - name: Run slow tests
-        run: |
-          uv run pytest -m "slow and not credentialed" -v
+      - name: Run slow and live tests
+        run: uv run coverage run --parallel-mode -m pytest -m "(slow or live) and not credentialed" -v
+      - name: Upload coverage data
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-slow
+          path: .coverage.*
+          include-hidden-files: true
 
   # test-credentialed:
   #   name: Credentialed Tests (Live API)
@@ -115,6 +164,7 @@ jobs:
 
   coverage:
     name: Coverage Report
+    needs: [test-core, test-benchmark, test-core-optional, test-interface, test-slow]
     runs-on: ubuntu-latest
     permissions:
       contents: write
@@ -131,11 +181,17 @@ jobs:
       - name: Install dependencies
         run: |
           pip install uv
-          uv sync --all-extras --all-groups
+          uv sync --group dev
+
+      - name: Download coverage data
+        uses: actions/download-artifact@v4
+        with:
+          pattern: coverage-*
+          merge-multiple: true
 
-      - name: Run tests with coverage
+      - name: Combine and report coverage
         run: |
-          uv run coverage run -m pytest
+          uv run coverage combine
           uv run coverage xml
           uv run coverage html
           uv run coverage report

diff --git a/codecov.yml b/codecov.yml
@@ -0,0 +1,10 @@
+comment: false
+
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+    patch:
+      default:
+        informational: true
diff --git a/pyproject.toml b/pyproject.toml
@@ -192,7 +192,7 @@ testpaths = ["tests"]
 [tool.coverage.run]
 relative_files = true
 source = ["maseval"]
-omit = ["*/tests/*", "*/examples/*", "*/__pycache__/*"]
+omit = ["*/tests/*", "*/examples/*", "*/__pycache__/*", "maseval/benchmark/multiagentbench/marble/**"]
 branch = true
 
 [tool.coverage.report]
@@ -204,6 +204,7 @@ exclude_lines = [
     "if __name__ == .__main__.:",
     "if TYPE_CHECKING:",
     "@abstractmethod",
+    "@overload",
 ]
 precision = 2
 

diff --git a/tests/README.md b/tests/README.md
@@ -55,19 +55,30 @@ Defined in `pyproject.toml`:
 
 ## CI Pipeline
 
-Six jobs in `.github/workflows/test.yml`:
-
-| Job               | Python    | What it runs                      | Gate                   |
-| ----------------- | --------- | --------------------------------- | ---------------------- |
-| test-core         | 3.10–3.14 | `-m core`                         | —                      |
-| test-benchmark    | 3.10–3.14 | `-m "benchmark and not (slow or live)"` | —                |
-| test-all          | 3.10–3.14 | `pytest -v` (default filter)      | After core + benchmark |
-| test-slow         | 3.12      | `-m "slow and not credentialed"`  | —                      |
-| test-credentialed | 3.12      | `-m "credentialed and not smoke"` | Maintainer approval    |
-| coverage          | 3.12      | Default suite (fast) with coverage report | —              |
+Jobs in `.github/workflows/test.yml`. Every test job runs under `coverage`, but only the Python 3.12 run uploads its coverage data; the final coverage job merges the uploaded data files into one combined report.
+
+| Job                | Python    | What it runs                                 | Gate                |
+| ------------------ | --------- | -------------------------------------------- | ------------------- |
+| test-core          | 3.10–3.14 | `-m core` (no optional deps)                 | —                   |
+| test-benchmark     | 3.10–3.14 | `-m "benchmark and not (slow or live)"`      | —                   |
+| test-core-optional | 3.10–3.14 | `-m core` (with optional deps)               | —                   |
+| test-interface     | 3.10–3.14 | `-m interface`                               | —                   |
+| test-slow          | 3.12      | `-m "(slow or live) and not credentialed"`   | —                   |
+| test-credentialed  | 3.12      | `-m "credentialed and not smoke"` (disabled) | Maintainer approval |
+| coverage           | 3.12      | Combines coverage from enabled jobs above    | After all test jobs |
 
 Contributors don't need API keys — the default suite and slow tests run without them.
 
+### Detecting orphaned tests
+
+Every test must carry at least one marker that maps to a CI job. To find tests that would be missed:
+
+```bash
+uv run pytest --collect-only -m "not (core or benchmark or interface or slow or live or credentialed or smoke)"
+```
+
+If this command collects any tests, add the appropriate marker (usually `pytestmark = pytest.mark.core` or `pytestmark = pytest.mark.benchmark`) to the test file.
+
 ## Test Organization
 
 ```
@@ -105,6 +116,7 @@ Benchmark tests follow a **two-tier pattern**:
 **Tier 1: Structural tests (offline, `benchmark` marker only)**
 
 Tests that work without downloaded data or network access:
+
 - Import protection: `maseval` runs without benchmark optional dependencies
 - Graceful errors: descriptive error when benchmark code is accessed without deps
 - Interface checks: class methods exist, types correct, invalid inputs rejected
@@ -113,6 +125,7 @@ Tests that work without downloaded data or network access:
 **Tier 2: Real data tests (`benchmark` + `live` markers)**
 
 Tests that download and use actual benchmark data:
+
 - Environment/tool tests: create real environments, execute tools on real databases
 - Data loading pipeline: `load_tasks`, `load_domain_config`, etc.
 - Data integrity validation (also marked `slow`): schema checks, minimum record counts, field structure
@@ -122,6 +135,7 @@ Tests that download and use actual benchmark data:
 Benchmarks use `ensure_data_exists()` to download data to the **package's default data directory** (not temp dirs). This function caches — it skips download if files already exist. A session-scoped pytest fixture (e.g., `ensure_tau2_data`, `ensure_macs_templates`) triggers the download once per test session.
 
 Tests that need real data should:
+
 1. Depend on the download fixture (`ensure_tau2_data`, `ensure_macs_templates`, etc.)
 2. Be marked `@pytest.mark.live`
 3. Use simple constructors — e.g., `Tau2Environment({"domain": "retail"})` — since data is already in the default location
@@ -131,6 +145,7 @@ Tests that don't need data (structural, mock-based) should NOT depend on the dow
 #### How to decide: mock or real data?
 
 This is a judgment call. As a guideline:
+
 - If the test validates **structure, types, or error handling** → Tier 1 (offline)
 - If the test operates on **real database records, files, or network resources** → Tier 2 (`live`)
 - Don't force synthetic fixtures where they add complexity without value. If something needs real data, test it with real data.
@@ -166,4 +181,4 @@ requires_openai = pytest.mark.skipif(
 
 ## Notes
 
-- Credentialed tests require maintainer approval via GitHub Environment. See `EXTENDEDTESTINGSTRATEGYPLAN.md` for details.
+- Credentialed tests require maintainer approval via GitHub Environment.
diff --git a/tests/test_core/test_callbacks/test_progress_bar.py b/tests/test_core/test_callbacks/test_progress_bar.py
@@ -15,6 +15,8 @@
 )
 from maseval.core.task import Task
 
+pytestmark = pytest.mark.core
+
 
 @pytest.fixture
 def mock_benchmark():

diff --git a/tests/test_core/test_exceptions.py b/tests/test_core/test_exceptions.py
@@ -7,6 +7,7 @@
 """
 
 import pytest
+
 from maseval import (
     TaskQueue,
     TaskExecutionStatus,
@@ -19,6 +20,8 @@
     validate_arguments_from_schema,
 )
 
+pytestmark = pytest.mark.core
+
 
 class TestExceptionClassification:
     """Tests for exception classification in benchmark execution."""