Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@ class SamplerColumnConfig(SingleColumnConfig):
conditional_params: Optional dictionary for conditional parameters. The dict keys
are the conditions that must be met (e.g., "age > 21") for the conditional parameters
to be used. The values of dict are the parameters to use when the condition is met.
convert_to: Optional type conversion to apply after sampling. Must be one of "float", "int", or "str".
Useful for converting numerical samples to strings or other types.
convert_to: Optional type conversion to apply after sampling. For numerical samplers,
must be one of "float", "int", or "str". For datetime and timedelta samplers, accepts
a strftime format string (e.g., ``"%Y-%m-%d"``, ``"%m/%d/%Y %H:%M"``). When omitted,
datetime/timedelta columns default to ISO-8601 format (e.g., ``2024-01-15T09:30:00``).
Inherited Attributes:
name (required): Unique name of the column to be generated.
Expand All @@ -70,7 +72,12 @@ class SamplerColumnConfig(SingleColumnConfig):
description="Optional dictionary for conditional parameters; keys are conditions, values are params to use when met",
)
convert_to: str | None = Field(
default=None, description="Optional type conversion after sampling: 'float', 'int', or 'str'"
default=None,
description=(
"Optional type conversion after sampling: 'float', 'int', or 'str' for numerical samplers; "
"a strftime format string (e.g., '%Y-%m-%d') for datetime/timedelta samplers. "
"Datetime/timedelta columns default to ISO-8601 (e.g., 2024-01-15T09:30:00) when omitted."
),
)
column_type: Literal["sampler"] = "sampler"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,13 @@ def validate_data_conversion(convert_to: str | None) -> None:


class DatetimeFormatMixin:
"""Pre/post-processing mixin for datetime and timedelta samplers.

Formatting behavior:
- With ``convert_to``: formats using the given strftime string.
- Without ``convert_to``: returns ISO-8601 strings (e.g., ``2024-01-15T09:30:00``).
"""

@staticmethod
def preproc(series: pd.Series, convert_to: str | None) -> pd.Series:
return series
Expand All @@ -98,15 +105,7 @@ def preproc(series: pd.Series, convert_to: str | None) -> pd.Series:
def postproc(series: pd.Series, convert_to: str | None) -> pd.Series:
if convert_to is not None:
return series.dt.strftime(convert_to)
if series.dt.month.nunique() == 1:
return series.apply(lambda dt: dt.year).astype(str)
if series.dt.day.nunique() == 1:
return series.apply(lambda dt: dt.strftime("%Y-%m"))
if series.dt.hour.sum() > 0 or series.dt.minute.sum() > 0:
return series.apply(lambda dt: dt.isoformat()).astype(str)
if series.dt.second.sum() == 0:
return series.apply(lambda dt: dt.date()).astype(str)
return series.apply(lambda dt: dt.isoformat()).astype(str)
return series.dt.strftime("%Y-%m-%dT%H:%M:%S")

@staticmethod
def validate_data_conversion(convert_to: str | None) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from __future__ import annotations

from datetime import datetime
from unittest.mock import Mock

import pytest
Expand Down Expand Up @@ -143,6 +144,43 @@ def test_datetime_format_mixin_validate_data_conversion_valid_format():
DatetimeFormatMixin.validate_data_conversion(None)


def test_datetime_format_mixin_postproc_no_convert_to_returns_isoformat():
    """Without ``convert_to``, postproc must emit full ISO-8601 timestamp strings."""
    sampled = lazy.pd.Series(lazy.pd.date_range("2023-01-01", periods=3))
    formatted = DatetimeFormatMixin.postproc(sampled, None)
    want = lazy.pd.Series(
        ["2023-01-01T00:00:00", "2023-01-02T00:00:00", "2023-01-03T00:00:00"],
        dtype="str",
    )
    lazy.pd.testing.assert_series_equal(formatted, want)


def test_datetime_format_mixin_postproc_single_record():
    """A one-element series still yields a full ISO-8601 string, not a truncated form."""
    single = lazy.pd.Series(lazy.pd.to_datetime(["2024-06-15 14:30:00"]))
    formatted = DatetimeFormatMixin.postproc(single, None)
    lazy.pd.testing.assert_series_equal(
        formatted,
        lazy.pd.Series(["2024-06-15T14:30:00"], dtype="str"),
    )


def test_datetime_format_mixin_postproc_same_month_records():
    """Records that all share one month must not collapse to abbreviated date strings."""
    dates = ["2024-03-01", "2024-03-15", "2024-03-28"]
    formatted = DatetimeFormatMixin.postproc(lazy.pd.Series(lazy.pd.to_datetime(dates)), None)
    want = lazy.pd.Series([f"{d}T00:00:00" for d in dates], dtype="str")
    lazy.pd.testing.assert_series_equal(formatted, want)


def test_datetime_format_mixin_postproc_stdlib_fromisoformat():
    """Output must be parseable by Python stdlib datetime.fromisoformat, not just pandas."""
    raw = ["2024-06-15 14:30:00", "2025-01-01 00:00:00"]
    formatted = DatetimeFormatMixin.postproc(lazy.pd.Series(lazy.pd.to_datetime(raw)), None)
    for item in formatted:
        # fromisoformat raises ValueError on anything that is not strict ISO-8601.
        datetime.fromisoformat(item)


def test_datetime_format_mixin_postproc_round_trip_preserves_values():
    """Output can be parsed back to the original timestamps."""
    original = lazy.pd.Series(lazy.pd.to_datetime(["2024-03-15 09:30:00", "2024-11-01 18:45:00"]))
    as_strings = DatetimeFormatMixin.postproc(original, None)
    reparsed = lazy.pd.to_datetime(as_strings)
    lazy.pd.testing.assert_series_equal(reparsed, original, check_names=False, check_dtype=False)

Comment thread
johnnygreco marked this conversation as resolved.

def test_datetime_format_mixin_validate_data_conversion_invalid_format():
    """A string that is not a recognized strftime format must raise ValueError."""
    with pytest.raises(ValueError, match="Invalid datetime format"):
        DatetimeFormatMixin.validate_data_conversion("invalid_format")
Comment thread
johnnygreco marked this conversation as resolved.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from __future__ import annotations

from datetime import datetime
from decimal import Decimal
from functools import partial

Expand Down Expand Up @@ -67,7 +68,7 @@ def test_datetime_formats(stub_schema_builder):
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
dataset = generator.generate(100)

assert dataset["year"].str.match(r"\d{4}").all()
assert dataset["year"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
assert dataset["datetime"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()


Expand Down Expand Up @@ -96,7 +97,7 @@ def test_timedelta(stub_schema_builder):
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
dataset = generator.generate(100)

assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}").all()
assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()

dt = lazy.pd.to_datetime(dataset["new_date"]) - lazy.pd.to_datetime(dataset["reference_date"])
assert (dt <= lazy.pd.Timedelta(days=10)).all()
Expand Down Expand Up @@ -142,6 +143,136 @@ def test_dataset_column_convert_datetime_format(stub_schema_builder):
assert lazy.pd.to_datetime(dataset["col_1"], format="%m/%d/%Y").notna().all()


def test_datetime_single_record_returns_isoformat(stub_schema_builder):
    """Reproducer for issue #484: single-record preview must return full ISO-8601."""
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-01-01", "end": "2026-06-30", "unit": "h"},
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(1)
    sampled = frame["ts"].iloc[0]
    assert "T" in sampled, f"Expected ISO-8601 format but got: {sampled}"
    # Raises ValueError if the string is not strict ISO-8601.
    datetime.fromisoformat(sampled)


@pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s"])
def test_datetime_all_units_preview_size(stub_schema_builder, unit):
    """Every unit granularity must return valid ISO-8601 even at preview sizes (1-5 records)."""
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2020-01-01", "end": "2025-12-31", "unit": unit},
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(3)
    for cell in frame["ts"]:
        assert "T" in cell, f"unit={unit!r}: expected ISO-8601, got: {cell}"
        datetime.fromisoformat(cell)


def test_datetime_output_round_trips_through_pd_to_datetime(stub_schema_builder):
    """Output strings must survive pd.to_datetime() for downstream DataFrame joins/filters."""
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2020-01-01", "end": "2025-01-01", "unit": "s"},
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(50)
    reparsed = lazy.pd.to_datetime(frame["ts"])
    assert reparsed.notna().all()
    assert reparsed.dtype == "datetime64[ns]"


def test_timedelta_single_record(stub_schema_builder):
    """TimeDelta columns must also produce valid ISO-8601 for single-record previews."""
    stub_schema_builder.add_column(
        name="order_date",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-01-01", "end": "2024-12-31", "unit": "D"},
    )
    stub_schema_builder.add_column(
        name="delivery_date",
        sampler_type=SamplerType.TIMEDELTA,
        params={"dt_min": 1, "dt_max": 5, "reference_column_name": "order_date", "unit": "D"},
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(1)
    for column_name in ("order_date", "delivery_date"):
        cell = frame[column_name].iloc[0]
        assert "T" in cell, f"{column_name}: expected ISO-8601, got: {cell}"
        # Raises ValueError if the string is not strict ISO-8601.
        datetime.fromisoformat(cell)


def test_timedelta_hourly_units(stub_schema_builder):
    """TimeDelta with sub-day units must produce valid ISO-8601."""
    stub_schema_builder.add_column(
        name="event_start",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-06-01", "end": "2024-06-30", "unit": "h"},
    )
    stub_schema_builder.add_column(
        name="event_end",
        sampler_type=SamplerType.TIMEDELTA,
        params={"dt_min": 1, "dt_max": 4, "reference_column_name": "event_start", "unit": "h"},
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(20)
    for column_name in ("event_start", "event_end"):
        for cell in frame[column_name]:
            datetime.fromisoformat(cell)
    # Verify the timedelta relationship holds.
    gaps = lazy.pd.to_datetime(frame["event_end"]) - lazy.pd.to_datetime(frame["event_start"])
    assert (gaps >= lazy.pd.Timedelta(hours=1)).all()
    assert (gaps < lazy.pd.Timedelta(hours=4)).all()


def test_multiple_datetime_columns_independent(stub_schema_builder):
    """Multiple datetime columns with different configs don't contaminate each other."""
    stub_schema_builder.add_column(
        name="created_at",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2020-01-01", "end": "2020-12-31", "unit": "D"},
    )
    stub_schema_builder.add_column(
        name="logged_at",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-06-01", "end": "2024-06-30", "unit": "s"},
        convert_to="%Y-%m-%d %H:%M:%S",
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(10)
    # created_at has no convert_to, so it must come back as ISO-8601 with a T separator.
    for cell in frame["created_at"]:
        assert "T" in cell
        datetime.fromisoformat(cell)
    # logged_at has an explicit convert_to, so it uses a space separator and no T.
    for cell in frame["logged_at"]:
        assert "T" not in cell
        lazy.pd.to_datetime(cell, format="%Y-%m-%d %H:%M:%S")


def test_datetime_narrow_range_single_day(stub_schema_builder):
    """Sampling within a single day must still return full ISO-8601 timestamps."""
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-07-04 00:00:00", "end": "2024-07-04 23:59:59", "unit": "s"},
    )
    frame = DatasetGenerator(
        sampler_columns=stub_schema_builder.to_sampler_columns()
    ).generate(5)
    for cell in frame["ts"]:
        assert "T" in cell, f"Expected ISO-8601 format but got: {cell}"
        stamp = datetime.fromisoformat(cell)
        # Every sample must land on the single configured day.
        assert stamp.year == 2024
        assert stamp.month == 7
        assert stamp.day == 4


def test_dataset_with_conditionals(stub_schema_builder):
stub_schema_builder.add_column(
name="col_1",
Expand Down
33 changes: 32 additions & 1 deletion packages/data-designer/tests/interface/test_data_designer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch
Expand All @@ -20,7 +21,7 @@
from data_designer.config.models import ModelProvider
from data_designer.config.processors import DropColumnsProcessorConfig
from data_designer.config.run_config import RunConfig
from data_designer.config.sampler_params import CategorySamplerParams, SamplerType
from data_designer.config.sampler_params import CategorySamplerParams, DatetimeSamplerParams, SamplerType
from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy
from data_designer.config.seed_source import (
AgentRolloutFormat,
Expand Down Expand Up @@ -702,6 +703,36 @@ def test_preview_raises_generation_error_when_dataset_is_empty(
data_designer.preview(stub_sampler_only_config_builder, num_records=1)


def test_preview_datetime_single_record_returns_iso8601(
    stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
):
    """Regression test for #484: single-record datetime preview must return ISO-8601, not a bare year."""
    builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    builder.add_column(
        SamplerColumnConfig(
            name="ts",
            sampler_type=SamplerType.DATETIME,
            params=DatetimeSamplerParams(start="2024-01-01", end="2026-06-30", unit="h"),
        ),
    )

    designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    preview = designer.preview(builder, num_records=1)
    sampled = preview.dataset["ts"].iloc[0]

    # Must be a full ISO-8601 timestamp, not a bare year like "2025".
    assert "T" in sampled, f"Expected ISO-8601 timestamp, got: {sampled!r}"
    stamp = datetime.fromisoformat(sampled)
    assert stamp.year >= 2024
    assert stamp.year <= 2026


def test_preview_with_dropped_columns(
stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
):
Expand Down
Loading