diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 10160ae8..97634b90 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,8 +26,14 @@ jobs: pip install uv uv sync --group dev - name: Run core tests - run: | - uv run pytest -m core -v + run: uv run coverage run --parallel-mode -m pytest -m core -v + - name: Upload coverage data + if: matrix.python-version == '3.12' + uses: actions/upload-artifact@v4 + with: + name: coverage-core + path: .coverage.* + include-hidden-files: true test-benchmark: name: Benchmark Tests @@ -47,12 +53,17 @@ jobs: pip install uv uv sync --all-extras --group dev - name: Run benchmark tests - run: | - uv run pytest -m "benchmark and not (slow or live)" -v + run: uv run coverage run --parallel-mode -m pytest -m "benchmark and not (slow or live)" -v + - name: Upload coverage data + if: matrix.python-version == '3.12' + uses: actions/upload-artifact@v4 + with: + name: coverage-benchmark + path: .coverage.* + include-hidden-files: true - test-all: - name: All Tests (With Optional Deps) - needs: [test-core, test-benchmark] + test-core-optional: + name: Core Tests (With Optional Deps) runs-on: ubuntu-latest strategy: matrix: @@ -68,9 +79,42 @@ jobs: run: | pip install uv uv sync --all-extras --group dev - - name: Run all tests + - name: Run core tests with optional deps + run: uv run coverage run --parallel-mode -m pytest -m core -v + - name: Upload coverage data + if: matrix.python-version == '3.12' + uses: actions/upload-artifact@v4 + with: + name: coverage-core-optional + path: .coverage.* + include-hidden-files: true + + test-interface: + name: Interface Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install all dependencies run: | - uv run pytest -v + pip install uv + uv sync --all-extras --group dev + - name: Run interface tests + run: uv run coverage run --parallel-mode -m pytest -m interface -v + - name: Upload coverage data + if: matrix.python-version == '3.12' + uses: actions/upload-artifact@v4 + with: + name: coverage-interface + path: .coverage.* + include-hidden-files: true test-slow: name: Slow Tests (Data Downloads + Integrity) @@ -86,9 +130,14 @@ jobs: run: | pip install uv uv sync --all-extras --group dev - - name: Run slow tests - run: | - uv run pytest -m "slow and not credentialed" -v + - name: Run slow and live tests + run: uv run coverage run --parallel-mode -m pytest -m "(slow or live) and not credentialed" -v + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + name: coverage-slow + path: .coverage.* + include-hidden-files: true # test-credentialed: # name: Credentialed Tests (Live API) @@ -115,6 +164,7 @@ jobs: coverage: name: Coverage Report + needs: [test-core, test-benchmark, test-core-optional, test-interface, test-slow] runs-on: ubuntu-latest permissions: contents: write @@ -131,11 +181,17 @@ jobs: - name: Install dependencies run: | pip install uv - uv sync --all-extras --all-groups + uv sync --group dev + + - name: Download coverage data + uses: actions/download-artifact@v4 + with: + pattern: coverage-* + merge-multiple: true - - name: Run tests with coverage + - name: Combine and report coverage run: | - uv run coverage run -m pytest + uv run coverage combine uv run coverage xml uv run coverage html uv run coverage report diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..40928b93 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,10 @@ +comment: false + +coverage: + status: + project: + default: + informational: true + patch: + default: + informational: true diff --git a/pyproject.toml b/pyproject.toml index 18d6f067..5bbf369d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -192,7 +192,7 @@ testpaths = ["tests"] [tool.coverage.run] relative_files = true source = ["maseval"] -omit = ["*/tests/*", "*/examples/*", "*/__pycache__/*"] +omit = ["*/tests/*", "*/examples/*", "*/__pycache__/*", "maseval/benchmark/multiagentbench/marble/**"] branch = true [tool.coverage.report] @@ -204,6 +204,7 @@ exclude_lines = [ "if __name__ == .__main__.:", "if TYPE_CHECKING:", "@abstractmethod", + "@overload", ] precision = 2 diff --git a/tests/README.md b/tests/README.md index 9d376602..c73fca13 100644 --- a/tests/README.md +++ b/tests/README.md @@ -55,19 +55,30 @@ Defined in `pyproject.toml`: ## CI Pipeline -Six jobs in `.github/workflows/test.yml`: - -| Job | Python | What it runs | Gate | -| ----------------- | --------- | --------------------------------- | ---------------------- | -| test-core | 3.10–3.14 | `-m core` | — | -| test-benchmark | 3.10–3.14 | `-m "benchmark and not (slow or live)"` | — | -| test-all | 3.10–3.14 | `pytest -v` (default filter) | After core + benchmark | -| test-slow | 3.12 | `-m "slow and not credentialed"` | — | -| test-credentialed | 3.12 | `-m "credentialed and not smoke"` | Maintainer approval | -| coverage | 3.12 | Default suite (fast) with coverage report | — | +Jobs in `.github/workflows/test.yml`. Each test job collects coverage data (from Python 3.12 only); the final coverage job merges them into one combined report. + +| Job | Python | What it runs | Gate | +| ------------------ | --------- | -------------------------------------------- | ------------------- | +| test-core | 3.10–3.14 | `-m core` (no optional deps) | — | +| test-benchmark | 3.10–3.14 | `-m "benchmark and not (slow or live)"` | — | +| test-core-optional | 3.10–3.14 | `-m core` (with optional deps) | — | +| test-interface | 3.10–3.14 | `-m interface` | — | +| test-slow | 3.12 | `-m "(slow or live) and not credentialed"` | — | +| test-credentialed | 3.12 | `-m "credentialed and not smoke"` (disabled) | Maintainer approval | +| coverage | 3.12 | Combines coverage from all jobs above | After all test jobs | Contributors don't need API keys — the default suite and slow tests run without them. +### Detecting orphaned tests + +Every test must carry at least one marker that maps to a CI job. To find tests that would be missed: + +```bash +uv run pytest --collect-only -m "not (core or benchmark or interface or slow or live or credentialed or smoke)" +``` + +If this reports any collected tests, add the appropriate marker (usually `pytestmark = pytest.mark.core` or `pytest.mark.benchmark`) to the file. + ## Test Organization ``` @@ -105,6 +116,7 @@ Benchmark tests follow a **two-tier pattern**: **Tier 1: Structural tests (offline, `benchmark` marker only)** Tests that work without downloaded data or network access: + - Import protection: `maseval` runs without benchmark optional dependencies - Graceful errors: descriptive error when benchmark code is accessed without deps - Interface checks: class methods exist, types correct, invalid inputs rejected @@ -113,6 +125,7 @@ Tests that work without downloaded data or network access: **Tier 2: Real data tests (`benchmark` + `live` markers)** Tests that download and use actual benchmark data: + - Environment/tool tests: create real environments, execute tools on real databases - Data loading pipeline: `load_tasks`, `load_domain_config`, etc. - Data integrity validation (also marked `slow`): schema checks, minimum record counts, field structure @@ -122,6 +135,7 @@ Tests that download and use actual benchmark data: Benchmarks use `ensure_data_exists()` to download data to the **package's default data directory** (not temp dirs). This function caches — it skips download if files already exist. A session-scoped pytest fixture (e.g., `ensure_tau2_data`, `ensure_macs_templates`) triggers the download once per test session. Tests that need real data should: + 1. Depend on the download fixture (`ensure_tau2_data`, `ensure_macs_templates`, etc.) 2. Be marked `@pytest.mark.live` 3. Use simple constructors — e.g., `Tau2Environment({"domain": "retail"})` — since data is already in the default location @@ -131,6 +145,7 @@ Tests that don't need data (structural, mock-based) should NOT depend on the dow #### How to decide: mock or real data? This is a judgment call. As a guideline: + - If the test validates **structure, types, or error handling** → Tier 1 (offline) - If the test operates on **real database records, files, or network resources** → Tier 2 (`live`) - Don't force synthetic fixtures where they add complexity without value. If something needs real data, test it with real data. @@ -166,4 +181,4 @@ requires_openai = pytest.mark.skipif( ## Notes -- Credentialed tests require maintainer approval via GitHub Environment. See `EXTENDEDTESTINGSTRATEGYPLAN.md` for details. +- Credentialed tests require maintainer approval via GitHub Environment. diff --git a/tests/test_core/test_callbacks/test_progress_bar.py b/tests/test_core/test_callbacks/test_progress_bar.py index c05e5d40..007fc42a 100644 --- a/tests/test_core/test_callbacks/test_progress_bar.py +++ b/tests/test_core/test_callbacks/test_progress_bar.py @@ -15,6 +15,8 @@ ) from maseval.core.task import Task +pytestmark = pytest.mark.core + @pytest.fixture def mock_benchmark(): diff --git a/tests/test_core/test_exceptions.py b/tests/test_core/test_exceptions.py index 68177468..416ebb7e 100644 --- a/tests/test_core/test_exceptions.py +++ b/tests/test_core/test_exceptions.py @@ -7,6 +7,7 @@ """ import pytest + from maseval import ( TaskQueue, TaskExecutionStatus, @@ -19,6 +20,8 @@ validate_arguments_from_schema, ) +pytestmark = pytest.mark.core + class TestExceptionClassification: """Tests for exception classification in benchmark execution."""