Skip to content

Commit 2b8aae3

Browse files
committed
btx anthropic runner
1 parent 0d6c87d commit 2b8aae3

5 files changed

Lines changed: 119 additions & 30 deletions

File tree

py/noxfile.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,18 @@ def test_btx_openai(session, version):
279279
_run_tests(session, "braintrust/btx", version=version, env={"BTX_PROVIDER": "openai", "BTX_CLIENT": "openai"})
280280

281281

282+
@nox.session()
@nox.parametrize("version", ANTHROPIC_VERSIONS, ids=ANTHROPIC_VERSIONS)
def test_btx_anthropic(session, version):
    """Run the BTX cross-language LLM-span spec tests against the Anthropic provider.

    Parametrized over every entry in ``ANTHROPIC_VERSIONS``; each run installs
    the shared test dependencies, the requested ``anthropic`` version and
    ``pyyaml`` before running the ``braintrust/btx`` tests.
    """
    # Both variables select the Anthropic code path in the BTX test suite.
    btx_env = {"BTX_PROVIDER": "anthropic", "BTX_CLIENT": "anthropic"}

    _install_test_deps(session)
    _install_matrix_dep(session, "anthropic", version)
    session.install("pyyaml")
    _run_tests(session, "braintrust/btx", version=version, env=btx_env)
292+
293+
282294
@nox.session()
283295
def test_openai_ddtrace(session):
284296
_install_test_deps(session)

py/src/braintrust/btx/span_validator.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,21 @@ def _is_reasoning_message(value: Any) -> bool:
5656
return True
5757

5858

59+
def _is_positive_number(value: Any) -> bool:
60+
return isinstance(value, (int, float)) and not isinstance(value, bool) and value > 0
61+
62+
63+
def _undefined_or_null(value: Any) -> bool:
64+
"""True if the value is absent (None/null) — used for fields that must not be populated."""
65+
return value is None
66+
67+
5968
# Registry of named matchers: maps a matcher name to the predicate function
# that implements it. Each predicate takes one value and returns a bool.
# NOTE(review): presumably looked up when a spec uses ``!fn <name>`` — confirm
# against the validation code that reads this table.
_NAMED_MATCHERS: dict[str, Any] = {
    "is_non_negative_number": _is_non_negative_number,
    "is_positive_number": _is_positive_number,
    "is_non_empty_string": _is_non_empty_string,
    "is_reasoning_message": _is_reasoning_message,
    "undefined_or_null": _undefined_or_null,
}
6475

6576

py/src/braintrust/btx/spec-ref.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
v0.0.1
1+
v0.0.5

py/src/braintrust/btx/spec_executor.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def _dispatch(spec: LlmSpanSpec, client: Any) -> None:
122122
_execute_responses(spec.requests, client)
123123

124124
elif provider == "anthropic" and endpoint == "/v1/messages":
125-
_execute_anthropic_messages(spec.requests, client)
125+
_execute_anthropic_messages(spec.requests, client, extra_headers=spec.headers or {})
126126

127127
else:
128128
raise NotImplementedError(f"BTX executor: provider={provider!r} endpoint={endpoint!r} not implemented")
@@ -191,7 +191,9 @@ def _execute_responses(requests: list[dict[str, Any]], client: Any) -> None:
191191
# ---------------------------------------------------------------------------
192192

193193

194-
def _execute_anthropic_messages(requests: list[dict[str, Any]], client: Any) -> None:
194+
def _execute_anthropic_messages(
195+
requests: list[dict[str, Any]], client: Any, extra_headers: dict[str, str] | None = None
196+
) -> None:
195197
"""Execute Anthropic messages requests.
196198
197199
Handles streaming (stream=True) by consuming the stream context manager.
@@ -209,14 +211,20 @@ def _execute_anthropic_messages(requests: list[dict[str, Any]], client: Any) ->
209211
is_streaming = full_req.get("stream", False)
210212
conversation_history.extend(req.get("messages", []))
211213

214+
create_kwargs: dict[str, Any] = dict(full_req)
215+
if extra_headers:
216+
create_kwargs["extra_headers"] = extra_headers
217+
212218
if is_streaming:
213-
with client.messages.create(**full_req) as stream:
214-
final = stream.get_final_message()
215-
if hasattr(final, "content") and final.content:
216-
text_blocks = [b.text for b in final.content if hasattr(b, "text")]
217-
conversation_history.append({"role": "assistant", "content": " ".join(text_blocks)})
219+
# Iterate the stream to exhaustion — the Braintrust TracedMessageStream
220+
# context manager captures metrics and logs the span on __exit__.
221+
# We can't call get_final_message() on the traced wrapper, so we
222+
# skip history accumulation for streaming (no multi-turn streaming specs).
223+
with client.messages.create(**create_kwargs) as stream:
224+
for _ in stream:
225+
pass
218226
else:
219-
response = client.messages.create(**full_req)
227+
response = client.messages.create(**create_kwargs)
220228
if hasattr(response, "content") and response.content:
221229
text_blocks = [b.text for b in response.content if hasattr(b, "text")]
222230
conversation_history.append({"role": "assistant", "content": " ".join(text_blocks)})

py/src/braintrust/btx/spec_loader.py

Lines changed: 79 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,31 @@
11
"""Load BTX LLM-span spec YAML files.
22
3-
Handles the three custom YAML tags used in the spec:
3+
Handles the custom YAML tags used in the spec:
44
!fn <name-or-lambda> — named predicate or arbitrary lambda (eval'd in Python)
55
!starts_with <prefix> — string prefix check
66
!or [...] — at-least-one-of validator
7+
!gen <generator-name> — value generated by the test runner (e.g. test_runner_client)
78
"""
89

910
from __future__ import annotations
1011

1112
import dataclasses
1213
import os
14+
import uuid
1315
from pathlib import Path
1416
from typing import Any
1517

1618
import yaml
1719

1820

1921
# ---------------------------------------------------------------------------
20-
# Matcher types (parallel to SpecMatcher.java)
22+
# Matcher / generator types
2123
# ---------------------------------------------------------------------------
2224

2325

2426
@dataclasses.dataclass
2527
class FnMatcher:
26-
"""A named or lambda-expression validator.
27-
28-
For well-known names (is_non_negative_number, etc.) the span_validator
29-
module dispatches them to dedicated functions. For arbitrary Python
30-
expressions the expression string is stored and eval()'d at validation
31-
time.
32-
"""
28+
"""A named or lambda-expression validator."""
3329

3430
expr: str # e.g. "is_non_negative_number" or "lambda value: value > 0"
3531

@@ -44,6 +40,18 @@ class OrMatcher:
4440
alternatives: list[Any]
4541

4642

43+
@dataclasses.dataclass
class GenValue:
    """Placeholder for a value generated by the test runner at execution time.

    Instances are created for ``!gen <generator-name>`` values in a spec file.
    The generator name determines what value is produced:
        test_runner_client — a stable string identifying this SDK/client
                             ("python-btx" in this implementation)
        vcr_nonce          — a random string that is fixed within a process but
                             changes every run (busts caches)
    """

    generator: str  # e.g. "test_runner_client", "vcr_nonce"
53+
54+
4755
# ---------------------------------------------------------------------------
4856
# YAML custom constructors
4957
# ---------------------------------------------------------------------------
@@ -64,6 +72,11 @@ def _or_constructor(loader: yaml.SafeLoader, node: yaml.Node) -> OrMatcher:
6472
return OrMatcher(alternatives=alternatives)
6573

6674

75+
def _gen_constructor(loader: yaml.SafeLoader, node: yaml.Node) -> GenValue:
    """Build a :class:`GenValue` from a ``!gen <generator-name>`` scalar node."""
    return GenValue(generator=loader.construct_scalar(node))  # type: ignore[arg-type]
78+
79+
6780
def _make_loader() -> type:
6881
"""Return a SafeLoader subclass with BTX custom tags registered."""
6982

@@ -73,9 +86,57 @@ class BtxLoader(yaml.SafeLoader):
7386
BtxLoader.add_constructor("!fn", _fn_constructor)
7487
BtxLoader.add_constructor("!starts_with", _starts_with_constructor)
7588
BtxLoader.add_constructor("!or", _or_constructor)
89+
BtxLoader.add_constructor("!gen", _gen_constructor)
7690
return BtxLoader
7791

7892

93+
# ---------------------------------------------------------------------------
94+
# Generator resolution
95+
# ---------------------------------------------------------------------------
96+
97+
# Stable client identifier for this SDK implementation.
98+
_CLIENT_ID = "python-btx"
99+
100+
# Per-process nonce — constant within a run so cassette body matching is stable,
101+
# but differs across runs so cache-busting specs actually bust caches.
102+
_VCR_NONCE = str(uuid.uuid4())[:8]
103+
104+
_GENERATORS: dict[str, str] = {
105+
"test_runner_client": _CLIENT_ID,
106+
"vcr_nonce": _VCR_NONCE,
107+
}
108+
109+
110+
def _resolve_gen(value: GenValue) -> str:
    """Return the concrete string for a ``!gen`` placeholder.

    Raises:
        ValueError: if ``value.generator`` is not a key of ``_GENERATORS``.
    """
    # Every registered generator maps to a str, so None can only mean "missing".
    resolved = _GENERATORS.get(value.generator)
    if resolved is None:
        raise ValueError(f"Unknown !gen generator: {value.generator!r}")
    return resolved
114+
115+
116+
def _resolve_variables(variables: dict[str, Any]) -> dict[str, str]:
117+
"""Resolve all !gen values in the variables map to concrete strings."""
118+
resolved: dict[str, str] = {}
119+
for key, val in variables.items():
120+
if isinstance(val, GenValue):
121+
resolved[key] = _resolve_gen(val)
122+
else:
123+
resolved[key] = str(val)
124+
return resolved
125+
126+
127+
def _substitute_templates(obj: Any, variables: dict[str, str]) -> Any:
128+
"""Recursively substitute {{var}} placeholders in strings."""
129+
if isinstance(obj, str):
130+
for key, value in variables.items():
131+
obj = obj.replace(f"{{{{{key}}}}}", value)
132+
return obj
133+
if isinstance(obj, dict):
134+
return {k: _substitute_templates(v, variables) for k, v in obj.items()}
135+
if isinstance(obj, list):
136+
return [_substitute_templates(item, variables) for item in obj]
137+
return obj
138+
139+
79140
# ---------------------------------------------------------------------------
80141
# Spec dataclass
81142
# ---------------------------------------------------------------------------
@@ -89,6 +150,7 @@ class LlmSpanSpec:
89150
endpoint: str
90151
requests: list[dict[str, Any]]
91152
expected_brainstore_spans: list[dict[str, Any]]
153+
headers: dict[str, str]
92154
source_path: Path
93155

94156
@property
@@ -98,13 +160,19 @@ def display_name(self) -> str:
98160

99161
@classmethod
100162
def from_dict(cls, data: dict[str, Any], source_path: Path) -> "LlmSpanSpec":
163+
# Resolve variables and substitute templates in requests
164+
raw_variables = data.get("variables", {})
165+
variables = _resolve_variables(raw_variables)
166+
requests = _substitute_templates(data.get("requests", []), variables)
167+
101168
return cls(
102169
name=data["name"],
103170
type=data["type"],
104171
provider=data["provider"],
105172
endpoint=data["endpoint"],
106-
requests=data.get("requests", []),
173+
requests=requests,
107174
expected_brainstore_spans=data.get("expected_brainstore_spans", []),
175+
headers=data.get("headers", {}),
108176
source_path=source_path,
109177
)
110178

@@ -117,13 +185,6 @@ def from_dict(cls, data: dict[str, Any], source_path: Path) -> "LlmSpanSpec":
117185

118186

119187
def _spec_root(override: str | None = None) -> Path:
120-
"""Return the llm_span spec root directory.
121-
122-
Priority:
123-
1. ``override`` argument (used by the pytest fixture after fetching specs)
124-
2. ``BTX_SPEC_ROOT`` environment variable
125-
3. ``<btx-dir>/spec/test/llm_span`` (local dev snapshot)
126-
"""
127188
if override:
128189
return Path(override)
129190
env = os.environ.get("BTX_SPEC_ROOT")
@@ -139,10 +200,8 @@ def load_specs(
139200
"""Load all YAML spec files under *spec_root*.
140201
141202
Args:
142-
spec_root: Path to the ``test/llm_span`` directory. Falls back to
143-
:func:`_spec_root` resolution if ``None``.
203+
spec_root: Path to the ``test/llm_span`` directory.
144204
providers: Optional allow-list of provider names (e.g. ``["openai"]``).
145-
If ``None``, all providers are loaded.
146205
147206
Returns:
148207
Sorted list of :class:`LlmSpanSpec` instances.
@@ -159,7 +218,6 @@ def load_specs(
159218
specs: list[LlmSpanSpec] = []
160219

161220
for yaml_path in sorted(root.rglob("*.yaml")):
162-
# Filter by provider directory if requested
163221
provider_dir = yaml_path.parent.name
164222
if providers is not None and provider_dir not in providers:
165223
continue

0 commit comments

Comments
 (0)