btx anthropic runner

realark · realark · commit 2b8aae363f95 · 2026-06-02T16:12:48.000-06:00
diff --git a/py/noxfile.py b/py/noxfile.py
@@ -279,6 +279,18 @@ def test_btx_openai(session, version):
     _run_tests(session, "braintrust/btx", version=version, env={"BTX_PROVIDER": "openai", "BTX_CLIENT": "openai"})
 
 
+@nox.session()
+@nox.parametrize("version", ANTHROPIC_VERSIONS, ids=ANTHROPIC_VERSIONS)
+def test_btx_anthropic(session, version):
+    """Run the BTX cross-language LLM-span spec tests (Anthropic provider)."""
+    _install_test_deps(session)
+    _install_matrix_dep(session, "anthropic", version)
+    session.install("pyyaml")
+    _run_tests(
+        session, "braintrust/btx", version=version, env={"BTX_PROVIDER": "anthropic", "BTX_CLIENT": "anthropic"}
+    )
+
+
 @nox.session()
 def test_openai_ddtrace(session):
     _install_test_deps(session)
diff --git a/py/src/braintrust/btx/span_validator.py b/py/src/braintrust/btx/span_validator.py
@@ -56,10 +56,21 @@ def _is_reasoning_message(value: Any) -> bool:
     return True
 
 
+def _is_positive_number(value: Any) -> bool:
+    return isinstance(value, (int, float)) and not isinstance(value, bool) and value > 0
+
+
+def _undefined_or_null(value: Any) -> bool:
+    """True if the value is absent (None/null) — used for fields that must not be populated."""
+    return value is None
+
+
 _NAMED_MATCHERS: dict[str, Any] = {
     "is_non_negative_number": _is_non_negative_number,
+    "is_positive_number": _is_positive_number,
     "is_non_empty_string": _is_non_empty_string,
     "is_reasoning_message": _is_reasoning_message,
+    "undefined_or_null": _undefined_or_null,
 }
 
 
diff --git a/py/src/braintrust/btx/spec-ref.txt b/py/src/braintrust/btx/spec-ref.txt
@@ -1 +1 @@
-v0.0.1
+v0.0.5
diff --git a/py/src/braintrust/btx/spec_executor.py b/py/src/braintrust/btx/spec_executor.py
@@ -122,7 +122,7 @@ def _dispatch(spec: LlmSpanSpec, client: Any) -> None:
         _execute_responses(spec.requests, client)
 
     elif provider == "anthropic" and endpoint == "/v1/messages":
-        _execute_anthropic_messages(spec.requests, client)
+        _execute_anthropic_messages(spec.requests, client, extra_headers=spec.headers or {})
 
     else:
         raise NotImplementedError(f"BTX executor: provider={provider!r} endpoint={endpoint!r} not implemented")
@@ -191,7 +191,9 @@ def _execute_responses(requests: list[dict[str, Any]], client: Any) -> None:
 # ---------------------------------------------------------------------------
 
 
-def _execute_anthropic_messages(requests: list[dict[str, Any]], client: Any) -> None:
+def _execute_anthropic_messages(
+    requests: list[dict[str, Any]], client: Any, extra_headers: dict[str, str] | None = None
+) -> None:
     """Execute Anthropic messages requests.
 
     Handles streaming (stream=True) by consuming the stream context manager.
@@ -209,14 +211,20 @@ def _execute_anthropic_messages(requests: list[dict[str, Any]], client: Any) ->
         is_streaming = full_req.get("stream", False)
         conversation_history.extend(req.get("messages", []))
 
+        create_kwargs: dict[str, Any] = dict(full_req)
+        if extra_headers:
+            create_kwargs["extra_headers"] = extra_headers
+
         if is_streaming:
-            with client.messages.create(**full_req) as stream:
-                final = stream.get_final_message()
-            if hasattr(final, "content") and final.content:
-                text_blocks = [b.text for b in final.content if hasattr(b, "text")]
-                conversation_history.append({"role": "assistant", "content": " ".join(text_blocks)})
+            # Iterate the stream to exhaustion — the Braintrust TracedMessageStream
+            # context manager captures metrics and logs the span on __exit__.
+            # We can't call get_final_message() on the traced wrapper, so we
+            # skip history accumulation for streaming (no multi-turn streaming specs).
+            with client.messages.create(**create_kwargs) as stream:
+                for _ in stream:
+                    pass
         else:
-            response = client.messages.create(**full_req)
+            response = client.messages.create(**create_kwargs)
             if hasattr(response, "content") and response.content:
                 text_blocks = [b.text for b in response.content if hasattr(b, "text")]
                 conversation_history.append({"role": "assistant", "content": " ".join(text_blocks)})
diff --git a/py/src/braintrust/btx/spec_loader.py b/py/src/braintrust/btx/spec_loader.py
@@ -1,35 +1,31 @@
 """Load BTX LLM-span spec YAML files.
 
-Handles the three custom YAML tags used in the spec:
+Handles the custom YAML tags used in the spec:
   !fn <name-or-lambda>  — named predicate or arbitrary lambda (eval'd in Python)
   !starts_with <prefix> — string prefix check
   !or [...]             — at-least-one-of validator
+  !gen <generator-name> — value generated by the test runner (e.g. test_runner_client)
 """
 
 from __future__ import annotations
 
 import dataclasses
 import os
+import uuid
 from pathlib import Path
 from typing import Any
 
 import yaml
 
 
 # ---------------------------------------------------------------------------
-# Matcher types (parallel to SpecMatcher.java)
+# Matcher / generator types
 # ---------------------------------------------------------------------------
 
 
 @dataclasses.dataclass
 class FnMatcher:
-    """A named or lambda-expression validator.
-
-    For well-known names (is_non_negative_number, etc.) the span_validator
-    module dispatches them to dedicated functions.  For arbitrary Python
-    expressions the expression string is stored and eval()'d at validation
-    time.
-    """
+    """A named or lambda-expression validator."""
 
     expr: str  # e.g. "is_non_negative_number" or "lambda value: value > 0"
 
@@ -44,6 +40,18 @@ class OrMatcher:
     alternatives: list[Any]
 
 
+@dataclasses.dataclass
+class GenValue:
+    """A value generated by the test runner at execution time.
+
+    The generator name determines what value is produced:
+      test_runner_client — a string identifying this SDK/client (e.g. "python-openai")
+      vcr_nonce          — a random string that changes every run (busts caches)
+    """
+
+    generator: str  # e.g. "test_runner_client", "vcr_nonce"
+
+
 # ---------------------------------------------------------------------------
 # YAML custom constructors
 # ---------------------------------------------------------------------------
@@ -64,6 +72,11 @@ def _or_constructor(loader: yaml.SafeLoader, node: yaml.Node) -> OrMatcher:
     return OrMatcher(alternatives=alternatives)
 
 
+def _gen_constructor(loader: yaml.SafeLoader, node: yaml.Node) -> GenValue:
+    generator = loader.construct_scalar(node)  # type: ignore[arg-type]
+    return GenValue(generator=generator)
+
+
 def _make_loader() -> type:
     """Return a SafeLoader subclass with BTX custom tags registered."""
 
@@ -73,9 +86,57 @@ class BtxLoader(yaml.SafeLoader):
     BtxLoader.add_constructor("!fn", _fn_constructor)
     BtxLoader.add_constructor("!starts_with", _starts_with_constructor)
     BtxLoader.add_constructor("!or", _or_constructor)
+    BtxLoader.add_constructor("!gen", _gen_constructor)
     return BtxLoader
 
 
+# ---------------------------------------------------------------------------
+# Generator resolution
+# ---------------------------------------------------------------------------
+
+# Stable client identifier for this SDK implementation.
+_CLIENT_ID = "python-btx"
+
+# Per-process nonce — constant within a run so cassette body matching is stable,
+# but differs across runs so cache-busting specs actually bust caches.
+_VCR_NONCE = str(uuid.uuid4())[:8]
+
+_GENERATORS: dict[str, str] = {
+    "test_runner_client": _CLIENT_ID,
+    "vcr_nonce": _VCR_NONCE,
+}
+
+
+def _resolve_gen(value: GenValue) -> str:
+    if value.generator in _GENERATORS:
+        return _GENERATORS[value.generator]
+    raise ValueError(f"Unknown !gen generator: {value.generator!r}")
+
+
+def _resolve_variables(variables: dict[str, Any]) -> dict[str, str]:
+    """Resolve all !gen values in the variables map to concrete strings."""
+    resolved: dict[str, str] = {}
+    for key, val in variables.items():
+        if isinstance(val, GenValue):
+            resolved[key] = _resolve_gen(val)
+        else:
+            resolved[key] = str(val)
+    return resolved
+
+
+def _substitute_templates(obj: Any, variables: dict[str, str]) -> Any:
+    """Recursively substitute {{var}} placeholders in strings."""
+    if isinstance(obj, str):
+        for key, value in variables.items():
+            obj = obj.replace(f"{{{{{key}}}}}", value)
+        return obj
+    if isinstance(obj, dict):
+        return {k: _substitute_templates(v, variables) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_substitute_templates(item, variables) for item in obj]
+    return obj
+
+
 # ---------------------------------------------------------------------------
 # Spec dataclass
 # ---------------------------------------------------------------------------
@@ -89,6 +150,7 @@ class LlmSpanSpec:
     endpoint: str
     requests: list[dict[str, Any]]
     expected_brainstore_spans: list[dict[str, Any]]
+    headers: dict[str, str]
     source_path: Path
 
     @property
@@ -98,13 +160,19 @@ def display_name(self) -> str:
 
     @classmethod
     def from_dict(cls, data: dict[str, Any], source_path: Path) -> "LlmSpanSpec":
+        # Resolve variables and substitute templates in requests
+        raw_variables = data.get("variables", {})
+        variables = _resolve_variables(raw_variables)
+        requests = _substitute_templates(data.get("requests", []), variables)
+
         return cls(
             name=data["name"],
             type=data["type"],
             provider=data["provider"],
             endpoint=data["endpoint"],
-            requests=data.get("requests", []),
+            requests=requests,
             expected_brainstore_spans=data.get("expected_brainstore_spans", []),
+            headers=data.get("headers", {}),
             source_path=source_path,
         )
 
@@ -117,13 +185,6 @@ def from_dict(cls, data: dict[str, Any], source_path: Path) -> "LlmSpanSpec":
 
 
 def _spec_root(override: str | None = None) -> Path:
-    """Return the llm_span spec root directory.
-
-    Priority:
-    1. ``override`` argument (used by the pytest fixture after fetching specs)
-    2. ``BTX_SPEC_ROOT`` environment variable
-    3. ``<btx-dir>/spec/test/llm_span`` (local dev snapshot)
-    """
     if override:
         return Path(override)
     env = os.environ.get("BTX_SPEC_ROOT")
@@ -139,10 +200,8 @@ def load_specs(
     """Load all YAML spec files under *spec_root*.
 
     Args:
-        spec_root: Path to the ``test/llm_span`` directory.  Falls back to
-                   :func:`_spec_root` resolution if ``None``.
+        spec_root: Path to the ``test/llm_span`` directory.
         providers: Optional allow-list of provider names (e.g. ``["openai"]``).
-                   If ``None``, all providers are loaded.
 
     Returns:
         Sorted list of :class:`LlmSpanSpec` instances.
@@ -159,7 +218,6 @@ def load_specs(
     specs: list[LlmSpanSpec] = []
 
     for yaml_path in sorted(root.rglob("*.yaml")):
-        # Filter by provider directory if requested
         provider_dir = yaml_path.parent.name
         if providers is not None and provider_dir not in providers:
             continue