Skip to content

Commit 4079ffa

Browse files
authored
task(logger): accept serializable objects as metadata (#437)
### Description

Run `metadata` through `bt_safe_deep_copy` before validating it so Pydantic models and other objects that dump to dictionaries can be logged as metadata.

This change covers:

- `Logger.log`
- `Experiment.log`
- `Span.log`
- `Logger.log_feedback`
- `Experiment.log_feedback`
- `update_span`
- `Dataset.__init__`
- `Dataset.insert`
- `Dataset.update`

### Testing

- Added type tests
- Added unit tests for the following:
  - Mocked Pydantic `model_dump` behavior to avoid needing dependency:
    - `span.log`
    - `logger.log`
    - `experiment.log`
    - `logger.log_feedback`
    - `experiment.log_feedback`
    - `dataset.insert`
    - `dataset.update`
  - reject `metadata` with non-string keys
  - reject `metadata` that does not serialize into dict
  - test `span.log` with actual pydantic model
- Manually tested the repro script
1 parent d9e0737 commit 4079ffa

4 files changed

Lines changed: 239 additions & 38 deletions

File tree

py/src/braintrust/logger.py

Lines changed: 44 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1800,7 +1800,8 @@ def init_dataset(
18001800
key is specified, will prompt the user to login.
18011801
:param org_name: (Optional) The name of a specific organization to connect to. This is useful if you belong to multiple.
18021802
:param project_id: The id of the project to create the dataset in. This takes precedence over `project` if specified.
1803-
:param metadata: (Optional) a dictionary with additional data about the dataset. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
1803+
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the dataset. The values in `metadata` can be any
1804+
JSON-serializable type, but its keys must be strings.
18041805
:param use_output: (Deprecated) If True, records will be fetched from this dataset in the legacy format, with the "expected" field renamed to "output". This option will be removed in a future version of Braintrust.
18051806
:param _internal_btql: (Internal) If specified, the dataset will be created with the given BTQL filters.
18061807
:param state: (Internal) The Braintrust state to use. If not specified, will use the global state. For advanced use only.
@@ -2777,8 +2778,24 @@ def _helper(v: Any) -> Any:
27772778
return event
27782779

27792780

2781+
def _validate_and_sanitize_metadata(metadata: Metadata) -> dict[str, Any]:
2782+
if isinstance(metadata, dict):
2783+
sanitized_metadata = metadata
2784+
else:
2785+
sanitized_metadata = bt_safe_deep_copy(metadata)
2786+
if not isinstance(sanitized_metadata, dict):
2787+
raise ValueError("metadata must be a dictionary or serialize to a dictionary")
2788+
2789+
for key in sanitized_metadata.keys():
2790+
if not isinstance(key, str):
2791+
raise ValueError("metadata keys must be strings")
2792+
2793+
return sanitized_metadata
2794+
2795+
27802796
def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
2781-
scores = event.get("scores")
2797+
sanitized_event = dict(event)
2798+
scores = sanitized_event.get("scores")
27822799
if scores:
27832800
for name, score in scores.items():
27842801
if not isinstance(name, str):
@@ -2796,15 +2813,10 @@ def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any])
27962813
if score < 0 or score > 1:
27972814
raise ValueError("score values must be between 0 and 1")
27982815

2799-
metadata = event.get("metadata")
2800-
if metadata:
2801-
if not isinstance(metadata, dict):
2802-
raise ValueError("metadata must be a dictionary")
2803-
for key in metadata.keys():
2804-
if not isinstance(key, str):
2805-
raise ValueError("metadata keys must be strings")
2816+
if "metadata" in sanitized_event and sanitized_event["metadata"] is not None:
2817+
sanitized_event["metadata"] = _validate_and_sanitize_metadata(sanitized_event["metadata"])
28062818

2807-
metrics = event.get("metrics")
2819+
metrics = sanitized_event.get("metrics")
28082820
if metrics:
28092821
if not isinstance(metrics, dict):
28102822
raise ValueError("metrics must be a dictionary")
@@ -2816,26 +2828,26 @@ def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any])
28162828
if not isinstance(value, (int, float)):
28172829
raise ValueError("metric values must be numbers")
28182830

2819-
tags = event.get("tags")
2831+
tags = sanitized_event.get("tags")
28202832
if tags:
28212833
validate_tags(tags)
28222834

2823-
span_attributes = event.get("span_attributes")
2835+
span_attributes = sanitized_event.get("span_attributes")
28242836
if span_attributes:
28252837
if not isinstance(span_attributes, dict):
28262838
raise ValueError("span_attributes must be a dictionary")
28272839
for key in span_attributes.keys():
28282840
if not isinstance(key, str):
28292841
raise ValueError("span_attributes keys must be strings")
28302842

2831-
input = event.get("input")
2832-
inputs = event.get("inputs")
2843+
input = sanitized_event.get("input")
2844+
inputs = sanitized_event.get("inputs")
28332845
if input is not None and inputs is not None:
28342846
raise ValueError("Only one of input or inputs (deprecated) can be specified. Prefer input.")
28352847
if inputs is not None:
2836-
return dict(**{k: v for k, v in event.items() if k not in ["input", "inputs"]}, input=inputs)
2848+
return dict(**{k: v for k, v in sanitized_event.items() if k not in ["input", "inputs"]}, input=inputs)
28372849
else:
2838-
return {k: v for k, v in event.items()}
2850+
return {k: v for k, v in sanitized_event.items()}
28392851

28402852

28412853
# Note that this only checks properties that are expected of a complete event.
@@ -3832,7 +3844,7 @@ def log(
38323844
:param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
38333845
:param error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
38343846
:param scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
3835-
:param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
3847+
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
38363848
:param tags: (Optional) a list of strings that you can use to filter and group records later.
38373849
:param metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
38383850
:param id: (Optional) a unique identifier for the event. If you don't provide one, Braintrust will generate one for you.
@@ -3881,7 +3893,7 @@ def log_feedback(
38813893
:param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not.
38823894
:param tags: (Optional) a list of strings that you can use to filter and group records later.
38833895
:param comment: (Optional) an optional comment string to log about the event.
3884-
:param metadata: (Optional) a dictionary with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event.
3896+
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
38853897
:param source: (Optional) the source of the feedback. Must be one of "external" (default), "app", or "api".
38863898
"""
38873899
return _log_feedback_impl(
@@ -4653,18 +4665,10 @@ def _get_state(self) -> BraintrustState:
46534665

46544666
def _validate_event(
46554667
self,
4656-
metadata: Metadata | None = None,
46574668
expected: Any | None = None,
46584669
output: Any | None = None,
46594670
tags: Sequence[str] | None = None,
4660-
):
4661-
if metadata is not None:
4662-
if not isinstance(metadata, dict):
4663-
raise ValueError("metadata must be a dictionary")
4664-
for key in metadata.keys():
4665-
if not isinstance(key, str):
4666-
raise ValueError("metadata keys must be strings")
4667-
4671+
) -> None:
46684672
if expected is not None and output is not None:
46694673
raise ValueError("Only one of expected or output (deprecated) can be specified. Prefer expected.")
46704674

@@ -4717,15 +4721,18 @@ def insert(
47174721
:param input: The argument that uniquely defines an input case (an arbitrary, JSON serializable object).
47184722
:param expected: The output of your application, including post-processing (an arbitrary, JSON serializable object).
47194723
:param tags: (Optional) a list of strings that you can use to filter and group records later.
4720-
:param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just
4721-
about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the
4724+
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with
4725+
additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help
4726+
find and analyze examples later. For example, you could log the
47224727
`prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any
47234728
JSON-serializable type, but its keys must be strings.
47244729
:param id: (Optional) a unique identifier for the event. If you don't provide one, Braintrust will generate one for you.
47254730
:param output: (Deprecated) The output of your application. Use `expected` instead.
47264731
:returns: The `id` of the logged record.
47274732
"""
4728-
self._validate_event(metadata=metadata, expected=expected, output=output, tags=tags)
4733+
if metadata is not None:
4734+
metadata = _validate_and_sanitize_metadata(metadata)
4735+
self._validate_event(expected=expected, output=output, tags=tags)
47294736

47304737
row_id = id or str(uuid.uuid4())
47314738

@@ -4760,11 +4767,13 @@ def update(
47604767
:param input: (Optional) The new input value for the record (an arbitrary, JSON serializable object).
47614768
:param expected: (Optional) The new expected output value for the record (an arbitrary, JSON serializable object).
47624769
:param tags: (Optional) A list of strings to update the tags of the record.
4763-
:param metadata: (Optional) A dictionary to update the metadata of the record. The values in `metadata` can be any
4764-
JSON-serializable type, but its keys must be strings.
4770+
:param metadata: (Optional) A dictionary, or an object that serializes to a dictionary (such as a Pydantic model), to update
4771+
the metadata of the record. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
47654772
:returns: The `id` of the updated record.
47664773
"""
4767-
self._validate_event(metadata=metadata, expected=expected, tags=tags)
4774+
if metadata is not None:
4775+
metadata = _validate_and_sanitize_metadata(metadata)
4776+
self._validate_event(expected=expected, tags=tags)
47684777

47694778
args = self._create_args(
47704779
id=id,
@@ -5265,7 +5274,7 @@ def log(
52655274
:param error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
52665275
:param tags: (Optional) a list of strings that you can use to filter and group records later.
52675276
:param scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
5268-
:param metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
5277+
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
52695278
:param metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
52705279
:param id: (Optional) a unique identifier for the event. If you don't provide one, Braintrust will generate one for you.
52715280
:param allow_concurrent_with_spans: (Optional) in rare cases where you need to log at the top level separately from using spans on the logger elsewhere, set this to True.
@@ -5313,7 +5322,7 @@ def log_feedback(
53135322
:param expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not.
53145323
:param tags: (Optional) a list of strings that you can use to filter and group records later.
53155324
:param comment: (Optional) an optional comment string to log about the event.
5316-
:param metadata: (Optional) a dictionary with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event.
5325+
:param metadata: (Optional) a dictionary, or an object that serializes to a dictionary (such as a Pydantic model), with additional data about the feedback. If you have a `user_id`, you can log it here and access it in the Braintrust UI. Note, this metadata does not correspond to the main event itself, but rather the audit log attached to the event. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
53175326
:param source: (Optional) the source of the feedback. Must be one of "external" (default), "app", or "api".
53185327
"""
53195328
return _log_feedback_impl(

0 commit comments

Comments
 (0)