Skip to content

Commit 722c33b

Browse files
committed
fix: always return ISO-8601 from datetime postproc (#484)
The DatetimeFormatMixin.postproc heuristics inferred the output format from the distribution of values (for example, emitting only the year when every value fell in the same month), silently stripping date/time components for small datasets or narrow date ranges. Replace them with deterministic ISO-8601 output (`%Y-%m-%dT%H:%M:%S`) via vectorized strftime. Users who need a custom format can still set convert_to on the SamplerColumnConfig.
1 parent 5f04e5d commit 722c33b

3 files changed

Lines changed: 202 additions & 11 deletions

File tree

packages/data-designer-engine/src/data_designer/engine/sampling_gen/data_sources/base.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,7 @@ def preproc(series: pd.Series, convert_to: str | None) -> pd.Series:
9898
def postproc(series: pd.Series, convert_to: str | None) -> pd.Series:
9999
if convert_to is not None:
100100
return series.dt.strftime(convert_to)
101-
if series.dt.month.nunique() == 1:
102-
return series.apply(lambda dt: dt.year).astype(str)
103-
if series.dt.day.nunique() == 1:
104-
return series.apply(lambda dt: dt.strftime("%Y-%m"))
105-
if series.dt.hour.sum() > 0 or series.dt.minute.sum() > 0:
106-
return series.apply(lambda dt: dt.isoformat()).astype(str)
107-
if series.dt.second.sum() == 0:
108-
return series.apply(lambda dt: dt.date()).astype(str)
109-
return series.apply(lambda dt: dt.isoformat()).astype(str)
101+
return series.dt.strftime("%Y-%m-%dT%H:%M:%S")
110102

111103
@staticmethod
112104
def validate_data_conversion(convert_to: str | None) -> None:

packages/data-designer-engine/tests/engine/sampling_gen/data_sources/test_sources.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,61 @@ def test_datetime_format_mixin_validate_data_conversion_valid_format():
143143
DatetimeFormatMixin.validate_data_conversion(None)
144144

145145

146+
def test_datetime_format_mixin_postproc_no_convert_to_returns_isoformat():
147+
series = lazy.pd.Series(lazy.pd.date_range("2023-01-01", periods=3))
148+
result = DatetimeFormatMixin.postproc(series, None)
149+
expected = lazy.pd.Series(["2023-01-01T00:00:00", "2023-01-02T00:00:00", "2023-01-03T00:00:00"], dtype="str")
150+
lazy.pd.testing.assert_series_equal(result, expected)
151+
152+
153+
def test_datetime_format_mixin_postproc_single_record():
154+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-06-15 14:30:00"]))
155+
result = DatetimeFormatMixin.postproc(series, None)
156+
expected = lazy.pd.Series(["2024-06-15T14:30:00"], dtype="str")
157+
lazy.pd.testing.assert_series_equal(result, expected)
158+
159+
160+
def test_datetime_format_mixin_postproc_same_month_records():
161+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-03-01", "2024-03-15", "2024-03-28"]))
162+
result = DatetimeFormatMixin.postproc(series, None)
163+
expected = lazy.pd.Series(["2024-03-01T00:00:00", "2024-03-15T00:00:00", "2024-03-28T00:00:00"], dtype="str")
164+
lazy.pd.testing.assert_series_equal(result, expected)
165+
166+
167+
def test_datetime_format_mixin_postproc_same_day_records():
168+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-01-01 08:00:00", "2024-02-01 12:00:00"]))
169+
result = DatetimeFormatMixin.postproc(series, None)
170+
expected = lazy.pd.Series(["2024-01-01T08:00:00", "2024-02-01T12:00:00"], dtype="str")
171+
lazy.pd.testing.assert_series_equal(result, expected)
172+
173+
174+
def test_datetime_format_mixin_postproc_always_parseable():
175+
"""All postproc outputs without convert_to must be parseable by fromisoformat."""
176+
series = lazy.pd.Series(lazy.pd.date_range("2023-06-01", periods=5, freq="h"))
177+
result = DatetimeFormatMixin.postproc(series, None)
178+
for val in result:
179+
lazy.pd.Timestamp.fromisoformat(val)
180+
181+
182+
def test_datetime_format_mixin_postproc_stdlib_fromisoformat():
183+
"""Output must be parseable by Python stdlib datetime.fromisoformat, not just pandas."""
184+
from datetime import datetime
185+
186+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-06-15 14:30:00", "2025-01-01 00:00:00"]))
187+
result = DatetimeFormatMixin.postproc(series, None)
188+
for val in result:
189+
parsed = datetime.fromisoformat(val)
190+
assert isinstance(parsed, datetime)
191+
192+
193+
def test_datetime_format_mixin_postproc_round_trip_preserves_values():
194+
"""Output can be parsed back to the original timestamps."""
195+
series = lazy.pd.Series(lazy.pd.to_datetime(["2024-03-15 09:30:00", "2024-11-01 18:45:00"]))
196+
result = DatetimeFormatMixin.postproc(series, None)
197+
round_tripped = lazy.pd.to_datetime(result)
198+
lazy.pd.testing.assert_series_equal(round_tripped, series, check_names=False, check_dtype=False)
199+
200+
146201
def test_datetime_format_mixin_validate_data_conversion_invalid_format():
147202
with pytest.raises(ValueError, match="Invalid datetime format"):
148203
DatetimeFormatMixin.validate_data_conversion("invalid_format")

packages/data-designer-engine/tests/engine/sampling_gen/test_generator.py

Lines changed: 146 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from __future__ import annotations
55

6+
from datetime import datetime
67
from decimal import Decimal
78
from functools import partial
89

@@ -67,7 +68,7 @@ def test_datetime_formats(stub_schema_builder):
6768
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
6869
dataset = generator.generate(100)
6970

70-
assert dataset["year"].str.match(r"\d{4}").all()
71+
assert dataset["year"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
7172
assert dataset["datetime"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
7273

7374

@@ -96,7 +97,7 @@ def test_timedelta(stub_schema_builder):
9697
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
9798
dataset = generator.generate(100)
9899

99-
assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}").all()
100+
assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all()
100101

101102
dt = lazy.pd.to_datetime(dataset["new_date"]) - lazy.pd.to_datetime(dataset["reference_date"])
102103
assert (dt <= lazy.pd.Timedelta(days=10)).all()
@@ -142,6 +143,149 @@ def test_dataset_column_convert_datetime_format(stub_schema_builder):
142143
assert lazy.pd.to_datetime(dataset["col_1"], format="%m/%d/%Y").notna().all()
143144

144145

146+
def test_datetime_single_record_returns_isoformat(stub_schema_builder):
147+
"""Reproducer for issue #484: single-record preview must return full ISO-8601."""
148+
stub_schema_builder.add_column(
149+
name="ts",
150+
sampler_type=SamplerType.DATETIME,
151+
params={"start": "2024-01-01", "end": "2026-12-31", "unit": "D"},
152+
)
153+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
154+
dataset = generator.generate(1)
155+
value = dataset["ts"].iloc[0]
156+
assert "T" in value, f"Expected ISO-8601 format but got: {value}"
157+
datetime.fromisoformat(value)
158+
159+
160+
def test_datetime_all_same_month_returns_isoformat(stub_schema_builder):
161+
stub_schema_builder.add_column(
162+
name="ts",
163+
sampler_type=SamplerType.DATETIME,
164+
params={"start": "2024-03-01", "end": "2024-03-31", "unit": "D"},
165+
)
166+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
167+
dataset = generator.generate(10)
168+
for value in dataset["ts"]:
169+
assert "T" in value, f"Expected ISO-8601 format but got: {value}"
170+
datetime.fromisoformat(value)
171+
172+
173+
@pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s"])
174+
def test_datetime_all_units_preview_size(stub_schema_builder, unit):
175+
"""Every unit granularity must return valid ISO-8601 even at preview sizes (1-5 records)."""
176+
stub_schema_builder.add_column(
177+
name="ts",
178+
sampler_type=SamplerType.DATETIME,
179+
params={"start": "2020-01-01", "end": "2025-12-31", "unit": unit},
180+
)
181+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
182+
dataset = generator.generate(3)
183+
for value in dataset["ts"]:
184+
assert "T" in value, f"unit={unit!r}: expected ISO-8601, got: {value}"
185+
datetime.fromisoformat(value)
186+
187+
188+
def test_datetime_output_round_trips_through_pd_to_datetime(stub_schema_builder):
189+
"""Output strings must survive pd.to_datetime() for downstream DataFrame joins/filters."""
190+
stub_schema_builder.add_column(
191+
name="ts",
192+
sampler_type=SamplerType.DATETIME,
193+
params={"start": "2020-01-01", "end": "2025-01-01", "unit": "s"},
194+
)
195+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
196+
dataset = generator.generate(50)
197+
parsed = lazy.pd.to_datetime(dataset["ts"])
198+
assert parsed.notna().all()
199+
assert parsed.dtype == "datetime64[ns]"
200+
201+
202+
def test_timedelta_single_record(stub_schema_builder):
203+
"""TimeDelta columns must also produce valid ISO-8601 for single-record previews."""
204+
stub_schema_builder.add_column(
205+
name="order_date",
206+
sampler_type=SamplerType.DATETIME,
207+
params={"start": "2024-01-01", "end": "2024-12-31", "unit": "D"},
208+
)
209+
stub_schema_builder.add_column(
210+
name="delivery_date",
211+
sampler_type=SamplerType.TIMEDELTA,
212+
params={"dt_min": 1, "dt_max": 5, "reference_column_name": "order_date", "unit": "D"},
213+
)
214+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
215+
dataset = generator.generate(1)
216+
for col in ["order_date", "delivery_date"]:
217+
value = dataset[col].iloc[0]
218+
assert "T" in value, f"{col}: expected ISO-8601, got: {value}"
219+
datetime.fromisoformat(value)
220+
221+
222+
def test_timedelta_hourly_units(stub_schema_builder):
223+
"""TimeDelta with sub-day units must produce valid ISO-8601."""
224+
stub_schema_builder.add_column(
225+
name="event_start",
226+
sampler_type=SamplerType.DATETIME,
227+
params={"start": "2024-06-01", "end": "2024-06-30", "unit": "h"},
228+
)
229+
stub_schema_builder.add_column(
230+
name="event_end",
231+
sampler_type=SamplerType.TIMEDELTA,
232+
params={"dt_min": 1, "dt_max": 4, "reference_column_name": "event_start", "unit": "h"},
233+
)
234+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
235+
dataset = generator.generate(20)
236+
for col in ["event_start", "event_end"]:
237+
for value in dataset[col]:
238+
datetime.fromisoformat(value)
239+
# Verify the timedelta relationship holds.
240+
starts = lazy.pd.to_datetime(dataset["event_start"])
241+
ends = lazy.pd.to_datetime(dataset["event_end"])
242+
deltas = ends - starts
243+
assert (deltas >= lazy.pd.Timedelta(hours=1)).all()
244+
assert (deltas < lazy.pd.Timedelta(hours=4)).all()
245+
246+
247+
def test_multiple_datetime_columns_independent(stub_schema_builder):
248+
"""Multiple datetime columns with different configs don't contaminate each other."""
249+
stub_schema_builder.add_column(
250+
name="created_at",
251+
sampler_type=SamplerType.DATETIME,
252+
params={"start": "2020-01-01", "end": "2020-12-31", "unit": "D"},
253+
)
254+
stub_schema_builder.add_column(
255+
name="logged_at",
256+
sampler_type=SamplerType.DATETIME,
257+
params={"start": "2024-06-01", "end": "2024-06-30", "unit": "s"},
258+
convert_to="%Y-%m-%d %H:%M:%S",
259+
)
260+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
261+
dataset = generator.generate(10)
262+
# created_at: no convert_to → ISO-8601 with T separator.
263+
for value in dataset["created_at"]:
264+
assert "T" in value
265+
datetime.fromisoformat(value)
266+
# logged_at: explicit convert_to → space separator, no T.
267+
for value in dataset["logged_at"]:
268+
assert "T" not in value
269+
lazy.pd.to_datetime(value, format="%Y-%m-%d %H:%M:%S")
270+
271+
272+
def test_datetime_narrow_range_single_day(stub_schema_builder):
273+
"""Sampling within a single day must still return full ISO-8601 timestamps."""
274+
stub_schema_builder.add_column(
275+
name="ts",
276+
sampler_type=SamplerType.DATETIME,
277+
params={"start": "2024-07-04 00:00:00", "end": "2024-07-04 23:59:59", "unit": "s"},
278+
)
279+
generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
280+
dataset = generator.generate(5)
281+
for value in dataset["ts"]:
282+
assert "T" in value, f"Expected ISO-8601 format but got: {value}"
283+
parsed = datetime.fromisoformat(value)
284+
assert parsed.year == 2024
285+
assert parsed.month == 7
286+
assert parsed.day == 4
287+
288+
145289
def test_dataset_with_conditionals(stub_schema_builder):
146290
stub_schema_builder.add_column(
147291
name="col_1",

0 commit comments

Comments
 (0)