Skip to content

Commit 4a28136

Browse files
authored
fix: always return ISO-8601 from datetime postproc (#484) (#512)
* fix: always return ISO-8601 from datetime postproc (#484)

  The DatetimeFormatMixin.postproc heuristics inferred output format from value distribution, silently stripping date/time components for small datasets or narrow date ranges. Replace with deterministic ISO-8601 output via vectorized strftime. Users who need custom formats can still set convert_to on the SamplerColumnConfig.

* docs: update convert_to docstring and add DatetimeFormatMixin docstring

  The SamplerColumnConfig.convert_to docstring incorrectly stated that only "float", "int", or "str" are accepted. Datetime/timedelta samplers accept strftime format strings. Also document the ISO-8601 default.

* test: add regression test for #484 via DataDesigner.preview API

  Captures the exact reproducer from the issue: a single-record datetime preview through the public DataDesigner.preview() interface must return a full ISO-8601 timestamp, not a bare year string.

* test: trim redundant datetime tests, align reproducer with issue #484

  - Remove postproc_same_day_records (subsumed by same_month + no_convert_to)
  - Remove postproc_always_parseable (subsumed by stdlib_fromisoformat)
  - Remove all_same_month integration test (subsumed by narrow_range_single_day)
  - Update single_record test to use unit="h" matching the issue reproducer

* fix: address review nits — move datetime import to module scope, drop redundant isinstance
1 parent fdd5ebb commit 4a28136

5 files changed

Lines changed: 221 additions & 15 deletions

File tree

packages/data-designer-config/src/data_designer/config/column_configs.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,10 @@ class SamplerColumnConfig(SingleColumnConfig):
4444
conditional_params: Optional dictionary for conditional parameters. The dict keys
4545
are the conditions that must be met (e.g., "age > 21") for the conditional parameters
4646
to be used. The values of dict are the parameters to use when the condition is met.
47-
convert_to: Optional type conversion to apply after sampling. Must be one of "float", "int", or "str".
48-
Useful for converting numerical samples to strings or other types.
47+
convert_to: Optional type conversion to apply after sampling. For numerical samplers,
48+
must be one of "float", "int", or "str". For datetime and timedelta samplers, accepts
49+
a strftime format string (e.g., ``"%Y-%m-%d"``, ``"%m/%d/%Y %H:%M"``). When omitted,
50+
datetime/timedelta columns default to ISO-8601 format (e.g., ``2024-01-15T09:30:00``).
4951
5052
Inherited Attributes:
5153
name (required): Unique name of the column to be generated.
@@ -70,7 +72,12 @@ class SamplerColumnConfig(SingleColumnConfig):
7072
description="Optional dictionary for conditional parameters; keys are conditions, values are params to use when met",
7173
)
7274
convert_to: str | None = Field(
73-
default=None, description="Optional type conversion after sampling: 'float', 'int', or 'str'"
75+
default=None,
76+
description=(
77+
"Optional type conversion after sampling: 'float', 'int', or 'str' for numerical samplers; "
78+
"a strftime format string (e.g., '%Y-%m-%d') for datetime/timedelta samplers. "
79+
"Datetime/timedelta columns default to ISO-8601 (e.g., 2024-01-15T09:30:00) when omitted."
80+
),
7481
)
7582
column_type: Literal["sampler"] = "sampler"
7683

packages/data-designer-engine/src/data_designer/engine/sampling_gen/data_sources/base.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,13 @@ def validate_data_conversion(convert_to: str | None) -> None:
9090

9191

9292
class DatetimeFormatMixin:
93+
"""Pre/post-processing mixin for datetime and timedelta samplers.
94+
95+
Formatting behavior:
96+
- With ``convert_to``: formats using the given strftime string.
97+
- Without ``convert_to``: returns ISO-8601 strings (e.g., ``2024-01-15T09:30:00``).
98+
"""
99+
93100
@staticmethod
94101
def preproc(series: pd.Series, convert_to: str | None) -> pd.Series:
95102
return series
@@ -98,15 +105,7 @@ def preproc(series: pd.Series, convert_to: str | None) -> pd.Series:
98105
def postproc(series: pd.Series, convert_to: str | None) -> pd.Series:
99106
if convert_to is not None:
100107
return series.dt.strftime(convert_to)
101-
if series.dt.month.nunique() == 1:
102-
return series.apply(lambda dt: dt.year).astype(str)
103-
if series.dt.day.nunique() == 1:
104-
return series.apply(lambda dt: dt.strftime("%Y-%m"))
105-
if series.dt.hour.sum() > 0 or series.dt.minute.sum() > 0:
106-
return series.apply(lambda dt: dt.isoformat()).astype(str)
107-
if series.dt.second.sum() == 0:
108-
return series.apply(lambda dt: dt.date()).astype(str)
109-
return series.apply(lambda dt: dt.isoformat()).astype(str)
108+
return series.dt.strftime("%Y-%m-%dT%H:%M:%S")
110109

111110
@staticmethod
112111
def validate_data_conversion(convert_to: str | None) -> None:

packages/data-designer-engine/tests/engine/sampling_gen/data_sources/test_sources.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from __future__ import annotations
55

6+
from datetime import datetime
67
from unittest.mock import Mock
78

89
import pytest
@@ -143,6 +144,43 @@ def test_datetime_format_mixin_validate_data_conversion_valid_format():
143144
DatetimeFormatMixin.validate_data_conversion(None)
144145

145146

147+
def test_datetime_format_mixin_postproc_no_convert_to_returns_isoformat():
148+
series = lazy.pd.Series(lazy.pd.date_range("2023-01-01", periods=3))
149+
result = DatetimeFormatMixin.postproc(series, None)
150+
expected = lazy.pd.Series(["2023-01-01T00:00:00", "2023-01-02T00:00:00", "2023-01-03T00:00:00"], dtype="str")
151+
lazy.pd.testing.assert_series_equal(result, expected)
152+
153+
154+
def test_datetime_format_mixin_postproc_single_record():
155+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-06-15 14:30:00"]))
156+
result = DatetimeFormatMixin.postproc(series, None)
157+
expected = lazy.pd.Series(["2024-06-15T14:30:00"], dtype="str")
158+
lazy.pd.testing.assert_series_equal(result, expected)
159+
160+
161+
def test_datetime_format_mixin_postproc_same_month_records():
162+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-03-01", "2024-03-15", "2024-03-28"]))
163+
result = DatetimeFormatMixin.postproc(series, None)
164+
expected = lazy.pd.Series(["2024-03-01T00:00:00", "2024-03-15T00:00:00", "2024-03-28T00:00:00"], dtype="str")
165+
lazy.pd.testing.assert_series_equal(result, expected)
166+
167+
168+
def test_datetime_format_mixin_postproc_stdlib_fromisoformat():
169+
"""Output must be parseable by Python stdlib datetime.fromisoformat, not just pandas."""
170+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-06-15 14:30:00", "2025-01-01 00:00:00"]))
171+
result = DatetimeFormatMixin.postproc(series, None)
172+
for val in result:
173+
datetime.fromisoformat(val)
174+
175+
176+
def test_datetime_format_mixin_postproc_round_trip_preserves_values():
177+
"""Output can be parsed back to the original timestamps."""
178+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-03-15 09:30:00", "2024-11-01 18:45:00"]))
179+
result = DatetimeFormatMixin.postproc(series, None)
180+
round_tripped = lazy.pd.to_datetime(result)
181+
lazy.pd.testing.assert_series_equal(round_tripped, series, check_names=False, check_dtype=False)
182+
183+
146184
def test_datetime_format_mixin_validate_data_conversion_invalid_format():
147185
with pytest.raises(ValueError, match="Invalid datetime format"):
148186
DatetimeFormatMixin.validate_data_conversion("invalid_format")

packages/data-designer-engine/tests/engine/sampling_gen/test_generator.py

Lines changed: 133 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from __future__ import annotations
55

6+
from datetime import datetime
67
from decimal import Decimal
78
from functools import partial
89

@@ -67,7 +68,7 @@ def test_datetime_formats(stub_schema_builder):
6768
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
6869
dataset = generator.generate(100)
6970

70-
assert dataset["year"].str.match(r"\d{4}").all()
71+
assert dataset["year"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
7172
assert dataset["datetime"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
7273

7374

@@ -96,7 +97,7 @@ def test_timedelta(stub_schema_builder):
9697
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
9798
dataset = generator.generate(100)
9899

99-
assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}").all()
100+
assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
100101

101102
dt = lazy.pd.to_datetime(dataset["new_date"]) - lazy.pd.to_datetime(dataset["reference_date"])
102103
assert (dt <= lazy.pd.Timedelta(days=10)).all()
@@ -142,6 +143,136 @@ def test_dataset_column_convert_datetime_format(stub_schema_builder):
142143
assert lazy.pd.to_datetime(dataset["col_1"], format="%m/%d/%Y").notna().all()
143144

144145

146+
def test_datetime_single_record_returns_isoformat(stub_schema_builder):
147+
"""Reproducer for issue #484: single-record preview must return full ISO-8601."""
148+
stub_schema_builder.add_column(
149+
name="ts",
150+
sampler_type=SamplerType.DATETIME,
151+
params={"start": "2024-01-01", "end": "2026-06-30", "unit": "h"},
152+
)
153+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
154+
dataset = generator.generate(1)
155+
value = dataset["ts"].iloc[0]
156+
assert "T" in value, f"Expected ISO-8601 format but got: {value}"
157+
datetime.fromisoformat(value)
158+
159+
160+
@pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s"])
161+
def test_datetime_all_units_preview_size(stub_schema_builder, unit):
162+
"""Every unit granularity must return valid ISO-8601 even at preview sizes (1-5 records)."""
163+
stub_schema_builder.add_column(
164+
name="ts",
165+
sampler_type=SamplerType.DATETIME,
166+
params={"start": "2020-01-01", "end": "2025-12-31", "unit": unit},
167+
)
168+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
169+
dataset = generator.generate(3)
170+
for value in dataset["ts"]:
171+
assert "T" in value, f"unit={unit!r}: expected ISO-8601, got: {value}"
172+
datetime.fromisoformat(value)
173+
174+
175+
def test_datetime_output_round_trips_through_pd_to_datetime(stub_schema_builder):
176+
"""Output strings must survive pd.to_datetime() for downstream DataFrame joins/filters."""
177+
stub_schema_builder.add_column(
178+
name="ts",
179+
sampler_type=SamplerType.DATETIME,
180+
params={"start": "2020-01-01", "end": "2025-01-01", "unit": "s"},
181+
)
182+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
183+
dataset = generator.generate(50)
184+
parsed = lazy.pd.to_datetime(dataset["ts"])
185+
assert parsed.notna().all()
186+
assert parsed.dtype == "datetime64[ns]"
187+
188+
189+
def test_timedelta_single_record(stub_schema_builder):
190+
"""TimeDelta columns must also produce valid ISO-8601 for single-record previews."""
191+
stub_schema_builder.add_column(
192+
name="order_date",
193+
sampler_type=SamplerType.DATETIME,
194+
params={"start": "2024-01-01", "end": "2024-12-31", "unit": "D"},
195+
)
196+
stub_schema_builder.add_column(
197+
name="delivery_date",
198+
sampler_type=SamplerType.TIMEDELTA,
199+
params={"dt_min": 1, "dt_max": 5, "reference_column_name": "order_date", "unit": "D"},
200+
)
201+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
202+
dataset = generator.generate(1)
203+
for col in ["order_date", "delivery_date"]:
204+
value = dataset[col].iloc[0]
205+
assert "T" in value, f"{col}: expected ISO-8601, got: {value}"
206+
datetime.fromisoformat(value)
207+
208+
209+
def test_timedelta_hourly_units(stub_schema_builder):
210+
"""TimeDelta with sub-day units must produce valid ISO-8601."""
211+
stub_schema_builder.add_column(
212+
name="event_start",
213+
sampler_type=SamplerType.DATETIME,
214+
params={"start": "2024-06-01", "end": "2024-06-30", "unit": "h"},
215+
)
216+
stub_schema_builder.add_column(
217+
name="event_end",
218+
sampler_type=SamplerType.TIMEDELTA,
219+
params={"dt_min": 1, "dt_max": 4, "reference_column_name": "event_start", "unit": "h"},
220+
)
221+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
222+
dataset = generator.generate(20)
223+
for col in ["event_start", "event_end"]:
224+
for value in dataset[col]:
225+
datetime.fromisoformat(value)
226+
# Verify the timedelta relationship holds.
227+
starts = lazy.pd.to_datetime(dataset["event_start"])
228+
ends = lazy.pd.to_datetime(dataset["event_end"])
229+
deltas = ends - starts
230+
assert (deltas >= lazy.pd.Timedelta(hours=1)).all()
231+
assert (deltas < lazy.pd.Timedelta(hours=4)).all()
232+
233+
234+
def test_multiple_datetime_columns_independent(stub_schema_builder):
235+
"""Multiple datetime columns with different configs don't contaminate each other."""
236+
stub_schema_builder.add_column(
237+
name="created_at",
238+
sampler_type=SamplerType.DATETIME,
239+
params={"start": "2020-01-01", "end": "2020-12-31", "unit": "D"},
240+
)
241+
stub_schema_builder.add_column(
242+
name="logged_at",
243+
sampler_type=SamplerType.DATETIME,
244+
params={"start": "2024-06-01", "end": "2024-06-30", "unit": "s"},
245+
convert_to="%Y-%m-%d %H:%M:%S",
246+
)
247+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
248+
dataset = generator.generate(10)
249+
# created_at: no convert_to → ISO-8601 with T separator.
250+
for value in dataset["created_at"]:
251+
assert "T" in value
252+
datetime.fromisoformat(value)
253+
# logged_at: explicit convert_to → space separator, no T.
254+
for value in dataset["logged_at"]:
255+
assert "T" not in value
256+
lazy.pd.to_datetime(value, format="%Y-%m-%d %H:%M:%S")
257+
258+
259+
def test_datetime_narrow_range_single_day(stub_schema_builder):
260+
"""Sampling within a single day must still return full ISO-8601 timestamps."""
261+
stub_schema_builder.add_column(
262+
name="ts",
263+
sampler_type=SamplerType.DATETIME,
264+
params={"start": "2024-07-04 00:00:00", "end": "2024-07-04 23:59:59", "unit": "s"},
265+
)
266+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
267+
dataset = generator.generate(5)
268+
for value in dataset["ts"]:
269+
assert "T" in value, f"Expected ISO-8601 format but got: {value}"
270+
parsed = datetime.fromisoformat(value)
271+
assert parsed.year == 2024
272+
assert parsed.month == 7
273+
assert parsed.day == 4
274+
275+
145276
def test_dataset_with_conditionals(stub_schema_builder):
146277
stub_schema_builder.add_column(
147278
name="col_1",

packages/data-designer/tests/interface/test_data_designer.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import json
77
import logging
8+
from datetime import datetime
89
from pathlib import Path
910
from typing import Any
1011
from unittest.mock import MagicMock, patch
@@ -20,7 +21,7 @@
2021
from data_designer.config.models import ModelProvider
2122
from data_designer.config.processors import DropColumnsProcessorConfig
2223
from data_designer.config.run_config import RunConfig
23-
from data_designer.config.sampler_params import CategorySamplerParams, SamplerType
24+
from data_designer.config.sampler_params import CategorySamplerParams, DatetimeSamplerParams, SamplerType
2425
from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy
2526
from data_designer.config.seed_source import (
2627
AgentRolloutFormat,
@@ -702,6 +703,36 @@ def test_preview_raises_generation_error_when_dataset_is_empty(
702703
data_designer.preview(stub_sampler_only_config_builder, num_records=1)
703704

704705

706+
def test_preview_datetime_single_record_returns_iso8601(
707+
stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
708+
):
709+
"""Regression test for #484: single-record datetime preview must return ISO-8601, not a bare year."""
710+
config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
711+
config_builder.add_column(
712+
SamplerColumnConfig(
713+
name="ts",
714+
sampler_type=SamplerType.DATETIME,
715+
params=DatetimeSamplerParams(start="2024-01-01", end="2026-06-30", unit="h"),
716+
),
717+
)
718+
719+
data_designer = DataDesigner(
720+
artifact_path=stub_artifact_path,
721+
model_providers=stub_model_providers,
722+
secret_resolver=PlaintextResolver(),
723+
managed_assets_path=stub_managed_assets_path,
724+
)
725+
726+
result = data_designer.preview(config_builder, num_records=1)
727+
ts_value = result.dataset["ts"].iloc[0]
728+
729+
# Must be a full ISO-8601 timestamp, not a bare year like "2025".
730+
assert "T" in ts_value, f"Expected ISO-8601 timestamp, got: {ts_value!r}"
731+
parsed = datetime.fromisoformat(ts_value)
732+
assert parsed.year >= 2024
733+
assert parsed.year <= 2026
734+
735+
705736
def test_preview_with_dropped_columns(
706737
stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
707738
):

0 commit comments

Comments
 (0)