Skip to content

Commit 27b71b7

Browse files
authored
spec test runner (#301)
Add the `btx` spec test runner to the Python SDK. For now, the only runner supported is OpenAI. (Side note: I hooked up Anthropic as well, but the specs and the SDK disagree on the span structure; I will address this later.) ### --- usage --- - Run end-to-end and assert against real Brainstore spans: `nox -s "test_btx_openai(latest)" -- --disable-vcr` - https://www.braintrust.dev/app/braintrustdata.com/p/python-unit-test/trace?object_type=project_logs&object_id=955087e9-e7ce-4461-9e2c-5b4f5fedde1d&r=c77a32a2-ff83-44ac-9a86-2e5df0a3ee90&s=850a4e23-b450-4445-b809-1b1dfc476680 - Run offline with VCR and in-memory span assertions: `nox -s "test_btx_openai(1.77.0)" "test_btx_openai(latest)"`
1 parent ec87b5a commit 27b71b7

30 files changed

Lines changed: 5399 additions & 0 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@ Dockerfile.local
2222
CLAUDE.local.md
2323
.zed
2424
tsconfig.tsbuildinfo
25+
26+
# BTX spec cache
27+
py/src/braintrust/btx/.spec-cache/

py/noxfile.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def _pinned_python_version():
123123
CONTRIB_DIR = "braintrust/contrib"
124124
DEVSERVER_DIR = "braintrust/devserver"
125125
TYPE_TESTS_DIR = "braintrust/type_tests"
126+
BTX_DIR = "braintrust/btx"
126127

127128

128129
SILENT_INSTALLS = True
@@ -195,6 +196,16 @@ def test_openai_http2_streaming(session, version):
195196
_run_tests(session, f"{INTEGRATION_DIR}/openai/test_openai_http2.py", version=version)
196197

197198

199+
@nox.session()
@nox.parametrize("version", OPENAI_VERSIONS, ids=OPENAI_VERSIONS)
def test_btx_openai(session, version):
    """Run the BTX cross-language LLM-span spec tests (OpenAI provider).

    Installs the shared test dependencies, the requested ``openai`` version
    from the matrix, and ``pyyaml``, then runs the tests under ``BTX_DIR``
    with ``BTX_PROVIDER`` and ``BTX_CLIENT`` both set to ``openai``.
    """
    _install_test_deps(session)
    _install_matrix_dep(session, "openai", version)
    session.install("pyyaml")
    # Use the shared BTX_DIR constant instead of repeating the path literal so
    # this session and the core-test directory list cannot drift apart.
    _run_tests(
        session,
        BTX_DIR,
        version=version,
        env={"BTX_PROVIDER": "openai", "BTX_CLIENT": "openai"},
    )
207+
208+
198209
@nox.session()
199210
def test_openai_ddtrace(session):
200211
_install_test_deps(session)
@@ -619,6 +630,7 @@ def _run_core_tests(session):
619630
CONTRIB_DIR,
620631
DEVSERVER_DIR,
621632
TYPE_TESTS_DIR,
633+
BTX_DIR,
622634
],
623635
)
624636

py/src/braintrust/btx/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# BTX: Cross-language SDK spec tests for the Braintrust Python SDK.
2+
# Specs are fetched from braintrustdata/braintrust-spec at a pinned ref.

py/src/braintrust/btx/conftest.py

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
"""pytest configuration for BTX tests.
2+
3+
Dual-mode operation:
4+
VCR off (--disable-vcr only; --vcr-record=all still runs under VCR with in-memory spans):
5+
- Real provider API calls
6+
- Spans sent to Braintrust backend via real logger
7+
- Spans fetched back via BTQL for validation
8+
9+
VCR on (default, cassettes present):
10+
- Provider HTTP replayed from cassettes
11+
- Spans captured in memory via _internal_with_memory_background_logger
12+
- No Braintrust backend calls needed
13+
14+
The VCR mode is detected from the pytest-vcr options already present in the
15+
test session (only --disable-vcr switches to VCR-off mode).
16+
"""
17+
18+
from __future__ import annotations
19+
20+
import os
21+
import sys
22+
import tarfile
23+
import tempfile
24+
import urllib.request
25+
from pathlib import Path
26+
27+
import pytest
28+
from braintrust import logger
29+
from braintrust.test_helpers import init_test_logger
30+
31+
32+
# Directory holding this conftest; the spec ref file and the spec cache live
# directly beside it.
_BTX_DIR = Path(__file__).parent
_SPEC_REF_FILE = _BTX_DIR / "spec-ref.txt"
_SPEC_CACHE_DIR = _BTX_DIR / ".spec-cache"

# Project name handed to init_test_logger for in-memory span capture.
_TEST_PROJECT = "btx-test-project"

# Stash keys filled in by pytest_configure before collection:
#   _spec_root_key -> path of the llm_span spec root
#   _vcr_off_key   -> True when VCR is disabled (live mode)
_spec_root_key = pytest.StashKey[Path]()
_vcr_off_key = pytest.StashKey[bool]()
42+
43+
44+
# ---------------------------------------------------------------------------
45+
# Spec fetching — before collection
46+
# ---------------------------------------------------------------------------
47+
48+
49+
def _read_spec_ref() -> str:
    """Return the contents of ``spec-ref.txt`` with surrounding whitespace removed."""
    contents = _SPEC_REF_FILE.read_text()
    return contents.strip()
51+
52+
53+
def _fetch_spec_if_needed(ref: str) -> Path:
    """Download braintrust-spec@ref into the local cache; skip if already present.

    Pure Python implementation — no bash or curl required.

    The archive is extracted into a temporary sibling directory which is then
    renamed onto the final cache directory. If another process wins that
    rename, the rename error is swallowed as long as the expected
    ``test/llm_span`` directory exists afterwards.

    Args:
        ref: Git ref (commit, tag, or branch) of braintrustdata/braintrust-spec.

    Returns:
        Path to ``<cache>/<ref>/test/llm_span``.

    Raises:
        ValueError: The downloaded archive contains no members.
        OSError: Download failed (``urllib.error.URLError`` is an ``OSError``),
            or the rename failed and no usable cache directory exists.
        FileNotFoundError: The archive was fetched but lacks ``test/llm_span``.
    """
    import shutil

    cache_dir = _SPEC_CACHE_DIR / ref
    llm_span_root = cache_dir / "test" / "llm_span"

    if llm_span_root.exists():
        return llm_span_root

    _SPEC_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    print(f"\n[btx] Fetching braintrust-spec@{ref} ...")

    url = f"https://github.com/braintrustdata/braintrust-spec/archive/{ref}.tar.gz"
    # Bound every blocking socket operation so a stalled connection cannot
    # hang test collection forever.
    download_timeout_s = 60

    # Extract into a unique temp directory next to the final cache_dir so the
    # eventual rename stays on one filesystem (no cross-device move).
    tmp_dir = Path(tempfile.mkdtemp(dir=_SPEC_CACHE_DIR, prefix=f"{ref}.tmp."))
    # mkstemp (not the deprecated mktemp) creates the temp tarball atomically.
    tmp_tar_fd, tmp_tar_str = tempfile.mkstemp(suffix=".tar.gz", dir=_SPEC_CACHE_DIR)
    os.close(tmp_tar_fd)
    tmp_tar = Path(tmp_tar_str)

    try:
        with urllib.request.urlopen(url, timeout=download_timeout_s) as response, tmp_tar.open("wb") as out:
            shutil.copyfileobj(response, out)

        with tarfile.open(tmp_tar, "r:gz") as tar:
            members = tar.getmembers()
            if not members:
                raise ValueError(f"braintrust-spec archive for ref {ref!r} is empty: {url}")

            # Strip the top-level directory (e.g. "braintrust-spec-af0e006/").
            top = members[0].name.split("/")[0] + "/"

            # Extraction filters were backported to security releases of older
            # Pythons, so feature-detect instead of checking the version.
            extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

            for member in members:
                # Skip the top-level directory entry itself and anything that
                # is unexpectedly outside of it, rather than mangling its name.
                if not member.name.startswith(top):
                    continue
                member.name = member.name[len(top) :]
                if member.name:
                    tar.extract(member, tmp_dir, **extract_kwargs)  # noqa: S202

        # If another process already populated cache_dir, the rename fails
        # (FileExistsError is an OSError); that is fine as long as the
        # expected directory is there.
        try:
            tmp_dir.rename(cache_dir)
        except OSError:
            if not llm_span_root.exists():
                raise
    finally:
        # Runs on success and failure alike; after a successful rename tmp_dir
        # no longer exists and rmtree(ignore_errors=True) is a no-op.
        tmp_tar.unlink(missing_ok=True)
        shutil.rmtree(tmp_dir, ignore_errors=True)

    if not llm_span_root.exists():
        raise FileNotFoundError(f"Expected llm_span dir not found after fetch: {llm_span_root}")

    print(f"[btx] Spec cached at {llm_span_root}")
    return llm_span_root
121+
122+
123+
def pytest_configure(config: pytest.Config) -> None:
    """Resolve the spec root and record the VCR mode before collection.

    The spec root comes from the ``BTX_SPEC_ROOT`` environment variable when
    it is set; otherwise the pinned ref is read and the spec is fetched into
    the local cache. The resolved path is stored in the config stash and
    exported back through ``BTX_SPEC_ROOT``.
    """
    # --- spec root ---
    spec_root_override = os.environ.get("BTX_SPEC_ROOT")
    if spec_root_override:
        spec_root = Path(spec_root_override)
    else:
        spec_root = _fetch_spec_if_needed(_read_spec_ref())

    config.stash[_spec_root_key] = spec_root
    os.environ["BTX_SPEC_ROOT"] = str(spec_root)

    # --- VCR mode ---
    # Live mode (real API calls, validation via BTQL) is enabled only by
    # --disable-vcr. --vcr-record=all also makes real API calls, but VCR stays
    # active to record cassettes and spans are captured in memory, so the
    # flag stays False in that case.
    disable_vcr_option = config.getoption("--disable-vcr", default=False, skip=True)
    config.stash[_vcr_off_key] = bool(disable_vcr_option)
143+
144+
145+
# ---------------------------------------------------------------------------
146+
# VCR configuration
147+
# ---------------------------------------------------------------------------
148+
149+
# Response headers to drop before writing cassettes. These carry sensitive or
150+
# ephemeral values (session cookies, org/project IDs, per-request trace IDs)
151+
# that should never be committed to source control.
152+
_SCRUB_RESPONSE_HEADERS = {
153+
"set-cookie",
154+
"openai-organization",
155+
"openai-project",
156+
"x-request-id",
157+
"cf-ray",
158+
"cf-cache-status",
159+
"alt-svc",
160+
}
161+
162+
163+
def _scrub_response_headers(response: dict) -> dict:
164+
"""Strip sensitive/ephemeral headers from responses before cassette write."""
165+
response["headers"] = {
166+
k: v for k, v in response.get("headers", {}).items() if k.lower() not in _SCRUB_RESPONSE_HEADERS
167+
}
168+
return response
169+
170+
171+
@pytest.fixture(scope="session")
def vcr_config() -> dict:
    """Build the VCR configuration for the session.

    ``record_mode`` is ``"none"`` when either the ``CI`` or ``GITHUB_ACTIONS``
    environment variable is set to a non-empty value, and ``"once"`` otherwise.
    """
    running_in_ci = os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")

    # Credentials and identity headers that must never land in a cassette.
    secret_headers = [
        "authorization",
        "Authorization",
        "x-api-key",
        "api-key",
        "openai-organization",
        "openai-api-key",
        "x-goog-api-key",
        "x-bt-auth-token",
        "cookie",
        "Cookie",
    ]
    # Stainless SDK telemetry headers — version-specific, not part of the
    # request semantics; stripped so cassettes survive SDK version bumps.
    sdk_telemetry_headers = [
        "user-agent",
        "User-Agent",
        "x-stainless-arch",
        "x-stainless-async",
        "x-stainless-lang",
        "x-stainless-os",
        "x-stainless-package-version",
        "x-stainless-runtime",
        "x-stainless-runtime-version",
        "x-stainless-read-timeout",
        "x-stainless-retry-count",
    ]

    return {
        "record_mode": "none" if running_in_ci else "once",
        "decode_compressed_response": True,
        # Match on method + URI + body: the request payload (model, messages,
        # etc.) decides which cassette response is appropriate. Volatile
        # per-version metadata lives in headers, so it is removed through
        # filter_headers rather than by dropping body from match_on.
        "match_on": ["method", "scheme", "host", "port", "path", "query", "body"],
        "filter_headers": secret_headers + sdk_telemetry_headers,
        "before_record_response": _scrub_response_headers,
    }
210+
211+
212+
def _btx_cassette_path(provider: str, spec_name: str) -> str:
    """Return the cassette file path for *provider* / *spec_name*.

    Cassettes are placed in the provider's integration cassette directory so
    they follow the same version matrix as that provider's other tests:
        integrations/<provider>/cassettes/<version>/btx/<spec_name>.yaml

    The parent directory of the cassette file is created if it is missing.
    The original design note states that returning an absolute path makes
    pytest-vcr bypass vcr_cassette_dir and read/write the cassette at exactly
    this location.
    """
    from braintrust.integrations.conftest import _versioned_cassette_dir

    provider_cassette_root = _BTX_DIR.parent / "integrations" / provider / "cassettes"
    version_root = _versioned_cassette_dir(str(provider_cassette_root))
    cassette_file = Path(version_root) / "btx" / f"{spec_name}.yaml"
    # Create the file's own parent (not just ".../btx"), since spec_name may
    # itself contain path separators.
    cassette_file.parent.mkdir(parents=True, exist_ok=True)
    return str(cassette_file)
229+
230+
231+
@pytest.fixture
def vcr_cassette_name(request: pytest.FixtureRequest) -> str:
    """Return the cassette name/path for the current test node.

    The parametrize ID is expected to look like '<provider>/<spec_name>'
    (e.g. 'openai/completions'). When it does, the cassette is routed to the
    provider's integration directory:
        integrations/<provider>/cassettes/<version>/btx/<spec_name>.yaml
    When the ID contains no '/', the ID itself is returned unchanged.
    """
    node_name = request.node.name  # e.g. "test_btx_spec[openai/completions]"

    # Take the text between the first "[" and the trailing "]" as the spec ID;
    # fall back to the whole node name for non-parametrized tests.
    is_parametrized = "[" in node_name and node_name.endswith("]")
    spec_id = node_name[node_name.index("[") + 1 : -1] if is_parametrized else node_name

    provider, separator, spec_name = spec_id.partition("/")
    if not separator:
        return spec_id
    return _btx_cassette_path(provider, spec_name)
249+
250+
251+
# ---------------------------------------------------------------------------
252+
# Mode-aware fixtures
253+
# ---------------------------------------------------------------------------
254+
255+
256+
@pytest.fixture(scope="session")
def btx_vcr_off(request: pytest.FixtureRequest) -> bool:
    """Report whether the session runs in live (VCR-off) mode; defaults to False."""
    stash = request.config.stash
    return stash.get(_vcr_off_key, False)
260+
261+
262+
@pytest.fixture(scope="session")
def btx_spec_root(request: pytest.FixtureRequest) -> Path:
    """Return the llm_span spec root that pytest_configure stored in the stash."""
    stash = request.config.stash
    return stash[_spec_root_key]
266+
267+
268+
@pytest.fixture(scope="session")
def btx_project_id(btx_vcr_off: bool) -> str | None:
    """Resolve the Braintrust project ID once per session (live mode only).

    Returns None when VCR is active. In live mode the ID is taken from
    ``BRAINTRUST_PROJECT_ID`` or ``BRAINTRUST_DEFAULT_PROJECT_ID`` when either
    is set to a non-empty value; otherwise it is looked up by project name via
    ``fetch_project_id``. Being session-scoped, the lookup happens once rather
    than once per parametrized test.
    """
    if not btx_vcr_off:
        return None

    # An explicitly configured ID wins; the first non-empty variable is used.
    for env_var in ("BRAINTRUST_PROJECT_ID", "BRAINTRUST_DEFAULT_PROJECT_ID"):
        configured_id = os.environ.get(env_var)
        if configured_id:
            return configured_id

    from .span_fetcher import fetch_project_id

    project_name = os.environ.get("BRAINTRUST_PROJECT") or os.environ.get(
        "BRAINTRUST_DEFAULT_PROJECT_NAME", "python-unit-test"
    )
    return fetch_project_id(project_name)
287+
288+
289+
@pytest.fixture
def memory_logger(btx_vcr_off):
    """Provide the span sink for the current mode.

    VCR-off mode: yields None (spans go to the real Braintrust backend).
    VCR-on mode: initialises the test logger for the BTX test project and
    yields the in-memory background logger for the duration of the test.
    """
    if btx_vcr_off:
        yield None
        return

    init_test_logger(_TEST_PROJECT)
    with logger._internal_with_memory_background_logger() as in_memory_logger:
        yield in_memory_logger

0 commit comments

Comments
 (0)