diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 4fe80f52b3d..174c0489f14 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -1649,7 +1649,7 @@ def as_dataframe(self) -> None:
             flat_record[("expected_output", "")] = expected_output
             column_tuples.add(("expected_output", ""))

-            metadata = record.get("metadata", {})
+            metadata = (record.get("metadata") or {})
             if isinstance(metadata, dict):
                 for metadata_col, metadata_val in metadata.items():
                     flat_record[("metadata", metadata_col)] = metadata_val
@@ -2000,7 +2000,7 @@ def _prepare_summary_evaluator_data(
             record: DatasetRecord = self._dataset[idx]
             inputs.append(record["input_data"])
             expected_outputs.append(record["expected_output"])
-            record_metadata = record.get("metadata", {})
+            record_metadata = (record.get("metadata") or {})
             metadata_list.append({**record_metadata, "experiment_config": self._config})

             eval_result_at_idx_by_name = eval_results[idx]["evaluations"]
@@ -2237,7 +2237,7 @@ async def _process_record(
         tags["dataset_record_canonical_id"] = canonical_id
         output_data = None
         last_exc_info = None
-        record_metadata = record.get("metadata", {})
+        record_metadata = (record.get("metadata") or {})
         task_args: list = [input_data, self._config]
         if self._task_accepts_metadata:
             task_args.append(record_metadata)
@@ -2358,7 +2358,7 @@ async def _evaluate_record(
         input_data = record["input_data"]
         output_data = task_result["output"]
         expected_output = record["expected_output"]
-        metadata = record.get("metadata", {})
+        metadata = (record.get("metadata") or {})

         async def _run_single_evaluator(
             evaluator: Union[EvaluatorType, AsyncEvaluatorType],
diff --git a/releasenotes/notes/fix-llmobs-experiment-none-metadata-7d4e2f9a8c1b3e5f.yaml b/releasenotes/notes/fix-llmobs-experiment-none-metadata-7d4e2f9a8c1b3e5f.yaml
new file mode 100644
index 00000000000..5fe5bdee73d
--- /dev/null
+++ b/releasenotes/notes/fix-llmobs-experiment-none-metadata-7d4e2f9a8c1b3e5f.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    LLM Observability: Fixes a ``TypeError: 'NoneType' object is not a mapping`` raised
+    when running an experiment over dataset records whose ``metadata`` field is ``None``
+    (a common shape for records serialized with an explicit ``null``). Metadata reads in
+    ``ddtrace/llmobs/_experiment.py`` now fall back to an empty dict for both missing and
+    ``None`` values, so summary evaluators, task argument plumbing, and per-record evaluator
+    context no longer crash on these records.
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 6c58137a618..1995e38943b 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -1753,6 +1753,45 @@ def test_experiment_run_summary_evaluators(llmobs, test_dataset_one_record):
     }


+def test_experiment_run_summary_evaluators_handles_none_metadata(llmobs):
+    """Regression: records whose `metadata` field is explicitly None must not crash
+    the summary evaluator path. `dict.get("metadata", {})` returns the stored None
+    (not the default), which then fails when spread into a dict literal. See
+    _prepare_summary_evaluator_data and the per-record paths that build
+    `{**metadata, "experiment_config": ...}`.
+    """
+    dataset = Dataset(
+        name="test_dataset_none_metadata",
+        project={"name": "test_project", "_id": "proj_123"},
+        dataset_id="ds_none_metadata",
+        records=[
+            {
+                "record_id": "rec_1",
+                "input_data": {"prompt": "What is the capital of France?"},
+                "expected_output": {"answer": "Paris"},
+                "metadata": None,
+            }
+        ],
+        description="Dataset whose records carry metadata=None",
+        latest_version=1,
+        version=1,
+        _dne_client=None,
+    )
+    exp = llmobs.experiment(
+        "test_experiment",
+        dummy_task,
+        dataset,
+        [dummy_evaluator],
+        summary_evaluators=[dummy_summary_evaluator],
+    )
+    task_results = asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False))
+    eval_results = asyncio.run(exp._experiment._run_evaluators(task_results, raise_errors=False))
+    summary_eval_results = asyncio.run(
+        exp._experiment._run_summary_evaluators(task_results, eval_results, raise_errors=True)
+    )
+    assert summary_eval_results[0]["evaluations"]["dummy_summary_evaluator"]["error"] is None
+
+
 def test_experiment_run_evaluators_error(llmobs, test_dataset_one_record):
     exp = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [faulty_evaluator])
     task_results = asyncio.run(exp._experiment._run_task(1, run=run_info_with_stable_id(0), raise_errors=False))