maseval
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 49 additions & 0 deletions b/‎.github/workflows/test.yml‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎AGENTS.md‎
Lines changed: 19 additions & 3 deletions b/‎AGENTS.md‎
Lines changed: 19 additions & 3 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 17 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 5 additions & 2 deletions b/‎CONTRIBUTING.md‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎maseval/benchmark/tau2/domains/base.py‎
Lines changed: 4 additions & 2 deletions b/‎maseval/benchmark/tau2/domains/base.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 6 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎scripts/coverage_by_feature.py‎
Lines changed: 59 additions & 10 deletions b/‎scripts/coverage_by_feature.py‎
Lines changed: 59 additions & 10 deletions
@@ -72,6 +72,55 @@ jobs:
         run: |
           uv run pytest -v
 
+  # Separate job for tests carrying the "slow" marker; it runs on one Python
+  # version (3.12).
+  test-slow:
+    name: Slow Tests (Data Downloads + Integrity)
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --all-extras --group dev
+      # Keep the benchmark data directories between runs. The key is built from
+      # a hash of the two data_loader.py files, so editing either loader yields
+      # a new key and therefore a fresh cache entry.
+      - name: Cache benchmark data
+        uses: actions/cache@v4
+        with:
+          path: |
+            maseval/benchmark/tau2/data/
+            maseval/benchmark/macs/data/
+            maseval/benchmark/macs/prompt_templates/
+          key: benchmark-data-${{ hashFiles('maseval/benchmark/tau2/data_loader.py', 'maseval/benchmark/macs/data_loader.py') }}
+      # Select slow tests explicitly, but leave out any that are also marked
+      # credentialed (no API keys are provided to this job).
+      - name: Run slow tests
+        run: |
+          uv run pytest -m "slow and not credentialed" -v
+
+  # test-credentialed:
+  #   name: Credentialed Tests (Live API)
+  #   runs-on: ubuntu-latest
+  #   environment: credentialed-tests
+
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #     - name: Set up Python 3.12
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: "3.12"
+  #     - name: Install dependencies
+  #       run: |
+  #         pip install uv
+  #         uv sync --all-extras --group dev
+  #     - name: Run credentialed tests
+  #       env:
+  #         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+  #         ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+  #         GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+  #       run: |
+  #         uv run pytest -m "credentialed and not smoke" -v
+
   coverage:
     name: Coverage Report
     runs-on: ubuntu-latest
 
@@ -1,3 +1,6 @@
+# Subdirectories can have their own .gitignores.
+# E.g. check `maseval/benchmark/.../.gitignore`.
+
 # Custom
 .idea/
 .DS_Store
 
@@ -38,13 +38,16 @@ uv run ruff check . --fix
 
 ## Testing Instructions
 
-- Tests use pytest markers: `core`, `interface`, `smolagents`, `langgraph`, `contract`
+- Tests use composable pytest markers — see `tests/README.md` for full details
+- **What it tests**: `core`, `interface`, `contract`, `benchmark`, `smolagents`, `langgraph`, `llamaindex`, `gaia2`, `camel`
+- **What it needs**: `live` (network), `credentialed` (API keys), `slow` (>30s), `smoke` (full pipeline)
+- Default `pytest` excludes `slow`, `credentialed`, and `smoke` via `addopts`
 - All tests must pass before PR merge
 - Add/update tests for code changes
-- Fix type errors and lint issues until suite is green
+- **Benchmark tests** follow a two-tier pattern (offline structural + live real-data). See `tests/README.md` for the recommended pattern when adding or modifying benchmark tests.
 
 ```bash
-# Run all tests
+# Default — fast tests only
 uv run pytest -v
 
 # Core tests only (minimal dependencies)
@@ -53,14 +56,27 @@ uv run pytest -m core -v
 # Specific integration tests
 uv run pytest -m smolagents -v
 uv run pytest -m interface -v
+
+# Data download validation (needs network)
+uv run pytest -m "live and slow" -v
+
+# Live API tests (needs OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY)
+uv run pytest -m credentialed -v
+
+# Fully offline
+uv run pytest -m "not live" -v
 ```
 
 ## Coverage
 
 View coverage by feature area (auto-discovers benchmarks/interfaces):
 
 ```bash
+# Full coverage (default + slow + live, excludes credentialed and smoke)
 uv run python scripts/coverage_by_feature.py
+
+# Fast-only (skip slow and live tests)
+uv run python scripts/coverage_by_feature.py --exclude slow,live
 ```
 
 Manual coverage for specific modules:
 
@@ -47,6 +47,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Added `camel_role_playing_execution_loop()` for benchmarks using CAMEL's RolePlaying semantics (PR: #22)
   - Added `CamelRolePlayingTracer` and `CamelWorkforceTracer` for capturing orchestration-level traces from CAMEL's multi-agent systems (PR: #22)
 
+**Testing**
+
+- Composable pytest markers (`live`, `credentialed`, `slow`, `smoke`) for fine-grained test selection; default runs exclude slow, credentialed, and smoke tests (PR: #29)
+- Marker implication hook: `credentialed` implies `live`, so `-m "not live"` always gives a fully offline run (PR: #29)
+- Skip decorators (`requires_openai`, `requires_anthropic`, `requires_google`) for tests needing API keys (PR: #29)
+- Data integrity tests for Tau2 and MACS benchmarks validating download pipelines, file structures, and database content (PR: #29)
+- HTTP-level API contract tests for model adapters (OpenAI, Anthropic, Google GenAI, LiteLLM) using `respx` mocks — no API keys needed (PR: #29)
+- Live API round-trip tests for all model adapters (`-m credentialed`) (PR: #29)
+- CI job for slow tests (with benchmark data caching); a credentialed-test job (behind GitHub Environment approval) is defined in the workflow but currently commented out (PR: #29)
+- Added `respx` dev dependency for HTTP-level mocking (PR: #29)
+
 ### Changed
 
 **Core**
@@ -72,8 +83,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `LangGraphUser` → `LangGraphLLMUser`
   - `LlamaIndexUser` → `LlamaIndexLLMUser`
 
+**Testing**
+
+- Coverage script (`scripts/coverage_by_feature.py`) now accepts an `--exclude` flag to skip additional markers; `credentialed` and `smoke` are always excluded (PR: #29)
+
 ### Fixed
 
+- Fixed incorrect return type annotations on `DB.load()` and `DB.copy_deep()` in Tau2 benchmark — now use `Self` instead of `"DB"`, so subclass methods return the correct type (PR: #29)
+
 ### Removed
 
 ## [0.3.0] - 2025-01-18
 
@@ -211,11 +211,14 @@ When you open a Pull Request, a series of automated checks will run using **GitH
 The pipeline automatically performs the following tasks:
 
 - **Linting and Formatting**: Verifies that your code adheres to our style guide using `ruff`.
-- **Testing**: Runs the entire test suite across different Python versions and operating systems. This includes tests for both the core package and the optional integrations.
+- **Testing** (tiered):
+  - *Fast tests* (every PR, Python 3.10–3.14): core, benchmark, and all default-suite tests. No API keys needed.
+  - *Slow tests* (every PR, Python 3.12): data download and integrity validation.
+  - *Credentialed tests* (Python 3.12, currently disabled in the workflow): live API tests. When enabled, they require maintainer approval to run — secrets are only exposed after approval.
 - **Type Checking**: Validates type annotations using `ty`.
 - **Documentation**: Ensures documentation builds without errors using `mkdocs`.
 
-**All checks must pass** before your Pull Request can be merged. You can view the progress and logs of these checks directly on your Pull Request page in GitHub.
+**All checks must pass** before your Pull Request can be merged. Contributors don't need API keys — the default and slow test suites run without them. See `tests/README.md` for how markers work and for the recommended benchmark testing pattern (offline structural tests vs. real-data tests).
 
 > **Note:** You don't need to run all these checks locally - CI will catch issues. However, running `uv run ruff format && uv run ruff check` before pushing can save you time.
 
 
@@ -16,6 +16,8 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Generic, Optional, TypeVar, Union
 
+from typing_extensions import Self
+
 from pydantic import BaseModel, ConfigDict
 
 from maseval.benchmark.tau2.utils import get_pydantic_hash, load_file, update_pydantic_model_with_dict
@@ -37,7 +39,7 @@ class DB(BaseModel):
     model_config = ConfigDict(extra="forbid")  # Reject unknown fields
 
     @classmethod
-    def load(cls, path: Union[str, Path]) -> "DB":
+    def load(cls, path: Union[str, Path]) -> Self:
         """Load the database from a structured file (JSON, TOML, YAML).
 
         Args:
@@ -73,7 +75,7 @@ def get_statistics(self) -> Dict[str, Any]:
         """
         return {}
 
-    def copy_deep(self) -> "DB":
+    def copy_deep(self) -> Self:
         """Create a deep copy of the database.
 
         Returns:
 
@@ -82,6 +82,7 @@ dev = [
     "ruff>=0.14.0",
     "ty>=0.0.5",
     "pre-commit>=4.0.0",
+    "respx>=0.22.0",
 ]
 
 # Documentation building - for contributors only
@@ -112,9 +113,13 @@ markers = [
     "llamaindex: Tests that specifically require llama-index-core",
     "gaia2: Tests that specifically require ARE (Agent Research Environments)",
     "camel: Tests that specifically require camel-ai",
+    "live: Tests requiring network access (downloads, external APIs)",
+    "credentialed: Tests requiring API keys (implies live, costs money)",
+    "slow: Tests taking >30 seconds (data downloads, large datasets)",
+    "smoke: Full end-to-end pipeline validation (pre-release only)",
 ]
 minversion = "6.0"
-addopts = "-ra -q"
+addopts = "-ra -q -m 'not (slow or credentialed or smoke)'"
 testpaths = ["tests"]
 
 [tool.coverage.run]
 
@@ -4,17 +4,31 @@
 Automatically discovers benchmarks and integrations from the codebase structure.
 Provides a high-level view of coverage by logical component rather than by file.
 
-Usage:
+By default, runs all tests except ``credentialed`` and ``smoke`` (i.e. includes
+``slow`` and ``live`` tests that don't need API keys).  Use ``--exclude`` to
+skip additional markers.
+
+Usage::
+
+    # Full coverage (default + slow + live)
     uv run python scripts/coverage_by_feature.py
+
+    # Fast-only (skip slow and live tests)
+    uv run python scripts/coverage_by_feature.py --exclude slow,live
 """
 
+import argparse
 import json
 import subprocess
 import sys
 from pathlib import Path
 from typing import Dict, List, Set
 
 
+# Markers that are always excluded (need API keys or are pre-release only)
+ALWAYS_EXCLUDED = ["credentialed", "smoke"]
+
+
 def discover_benchmarks(maseval_dir: Path) -> List[str]:
     """Auto-discover benchmark implementations."""
     benchmark_dir = maseval_dir / "benchmark"
@@ -66,14 +80,30 @@ def discover_integrations(maseval_dir: Path) -> Dict[str, Dict[str, List[str]]]:
     return integrations
 
 
-def run_coverage() -> bool:
-    """Run pytest with coverage collection."""
-    print("Running tests with coverage...")
-    result = subprocess.run(
-        ["pytest", "--cov=maseval", "--cov-report=json", "--quiet"],
-        capture_output=True,
-        text=True,
-    )
+def build_marker_expression(exclude: List[str]) -> str:
+    """Build the pytest ``-m`` expression that deselects unwanted markers.
+
+    Args:
+        exclude: Extra marker names to exclude on top of ``ALWAYS_EXCLUDED``.
+            Surrounding whitespace is stripped; blank and repeated names are
+            ignored so they cannot produce a dangling ``or`` in the result.
+
+    Returns:
+        An expression of the form ``not (a or b or ...)`` that lists the
+        ``ALWAYS_EXCLUDED`` markers first, followed by the extra markers in
+        the order they were given.
+    """
+    candidates = (name.strip() for name in [*ALWAYS_EXCLUDED, *exclude])
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    markers = list(dict.fromkeys(name for name in candidates if name))
+    return "not (" + " or ".join(markers) + ")"
+
+
+def run_coverage(marker_expr: str) -> bool:
+    """Run pytest with coverage collection.
+
+    Args:
+        marker_expr: Pytest marker expression (passed via -m).
+    """
+    cmd = [
+        "pytest",
+        "--override-ini=addopts=",
+        "-m",
+        marker_expr,
+        "--cov=maseval",
+        "--cov-report=json",
+        "--quiet",
+    ]
+
+    print(f"Running tests with coverage  (-m '{marker_expr}') ...")
+    result = subprocess.run(cmd, capture_output=True, text=True)
     if result.returncode != 0:
         print("\nTests failed:")
         print(result.stdout)
@@ -133,13 +163,32 @@ def format_coverage(label: str, stats: Dict[str, float], indent: int = 0) -> str
     return f"{indent_str}{label:<30} {color}{percent:6.2f}%{reset}  ({stats['covered']}/{stats['total']} lines)"
 
 
+def parse_args() -> argparse.Namespace:
+    """Read the script's command-line options.
+
+    Returns:
+        Namespace whose ``exclude`` attribute holds the raw comma-separated
+        marker string passed via ``--exclude`` (an empty string when the
+        option is omitted).
+    """
+    cli = argparse.ArgumentParser(description="Generate test coverage report organized by feature area.")
+    exclude_help = "Comma-separated markers to exclude (e.g. 'slow,live'). 'credentialed' and 'smoke' are always excluded."
+    cli.add_argument("--exclude", type=str, default="", help=exclude_help)
+    namespace = cli.parse_args()
+    return namespace
+
+
 def main():
     """Generate coverage report by feature area."""
+    args = parse_args()
     repo_root = Path(__file__).parent.parent
     maseval_dir = repo_root / "maseval"
 
+    # Build marker expression
+    extra_excludes = [m.strip() for m in args.exclude.split(",") if m.strip()]
+    marker_expr = build_marker_expression(extra_excludes)
+
     # Run coverage
-    if not run_coverage():
+    if not run_coverage(marker_expr):
         print("\nTests failed. Coverage report may be incomplete.")
 
     print("\n" + "=" * 80)