8 changes: 4 additions & 4 deletions ddtrace/llmobs/_experiment.py
@@ -1649,7 +1649,7 @@ def as_dataframe(self) -> None:
             flat_record[("expected_output", "")] = expected_output
             column_tuples.add(("expected_output", ""))

-            metadata = record.get("metadata", {})
+            metadata = (record.get("metadata") or {})
             if isinstance(metadata, dict):
                 for metadata_col, metadata_val in metadata.items():
                     flat_record[("metadata", metadata_col)] = metadata_val
@@ -2000,7 +2000,7 @@ def _prepare_summary_evaluator_data(
             record: DatasetRecord = self._dataset[idx]
             inputs.append(record["input_data"])
             expected_outputs.append(record["expected_output"])
-            record_metadata = record.get("metadata", {})
+            record_metadata = (record.get("metadata") or {})
             metadata_list.append({**record_metadata, "experiment_config": self._config})

             eval_result_at_idx_by_name = eval_results[idx]["evaluations"]
@@ -2237,7 +2237,7 @@ async def _process_record(
         tags["dataset_record_canonical_id"] = canonical_id
         output_data = None
         last_exc_info = None
-        record_metadata = record.get("metadata", {})
+        record_metadata = (record.get("metadata") or {})
         task_args: list = [input_data, self._config]
         if self._task_accepts_metadata:
             task_args.append(record_metadata)
@@ -2358,7 +2358,7 @@ async def _evaluate_record(
         input_data = record["input_data"]
         output_data = task_result["output"]
         expected_output = record["expected_output"]
-        metadata = record.get("metadata", {})
+        metadata = (record.get("metadata") or {})

         async def _run_single_evaluator(
             evaluator: Union[EvaluatorType, AsyncEvaluatorType],
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    LLM Observability: Fixes a ``TypeError: 'NoneType' object is not a mapping`` raised
+    when running an experiment over dataset records whose ``metadata`` field is ``None``
+    (a common shape for records serialized with an explicit ``null``). Metadata reads in
+    ``ddtrace/llmobs/_experiment.py`` now fall back to an empty dict for both missing and
+    ``None`` values, so summary evaluators, task argument plumbing, and per-record evaluator
+    context no longer crash on these records.
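To illustrate the pitfall the note describes, here is a minimal standalone sketch (not part of the PR's diff) of why `dict.get("metadata", {})` is insufficient: the default applies only when the key is absent, so a key stored with an explicit `None` passes through and breaks dict unpacking.

```python
record = {"input_data": {"prompt": "hi"}, "metadata": None}

# dict.get returns its default only when the key is missing;
# a key present with an explicit None is returned as None.
metadata = record.get("metadata", {})
assert metadata is None

# Unpacking None into a dict literal reproduces the reported crash:
try:
    {**metadata, "experiment_config": {}}
except TypeError as e:
    print(e)  # 'NoneType' object is not a mapping

# The fix normalizes both a missing key and an explicit None to {}:
metadata = record.get("metadata") or {}
merged = {**metadata, "experiment_config": {}}  # safe
```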
39 changes: 39 additions & 0 deletions tests/llmobs/test_experiments.py
@@ -1753,6 +1753,45 @@ def test_experiment_run_summary_evaluators(llmobs, test_dataset_one_record):
     }


+def test_experiment_run_summary_evaluators_handles_none_metadata(llmobs):
+    """Regression: records whose `metadata` field is explicitly None must not crash
+    the summary evaluator path. `dict.get("metadata", {})` returns the stored None
+    (not the default), which then fails when spread into a dict literal. See
+    _prepare_summary_evaluator_data and the per-record paths that build
+    `{**metadata, "experiment_config": ...}`.
+    """
+    dataset = Dataset(
+        name="test_dataset_none_metadata",
+        project={"name": "test_project", "_id": "proj_123"},
+        dataset_id="ds_none_metadata",
+        records=[
+            {
+                "record_id": "rec_1",
+                "input_data": {"prompt": "What is the capital of France?"},
+                "expected_output": {"answer": "Paris"},
+                "metadata": None,
+            }
+        ],
+        description="Dataset whose records carry metadata=None",
+        latest_version=1,
+        version=1,
+        _dne_client=None,
+    )
+    exp = llmobs.experiment(
+        "test_experiment",
+        dummy_task,
+        dataset,
+        [dummy_evaluator],
+        summary_evaluators=[dummy_summary_evaluator],
+    )
+    task_results = asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False))
+    eval_results = asyncio.run(exp._experiment._run_evaluators(task_results, raise_errors=False))
+    summary_eval_results = asyncio.run(
+        exp._experiment._run_summary_evaluators(task_results, eval_results, raise_errors=True)
+    )
+    assert summary_eval_results[0]["evaluations"]["dummy_summary_evaluator"]["error"] is None
+
+
 def test_experiment_run_evaluators_error(llmobs, test_dataset_one_record):
     exp = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [faulty_evaluator])
     task_results = asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False))