|
3 | 3 |
|
4 | 4 | from __future__ import annotations |
5 | 5 |
|
| 6 | +from datetime import datetime |
6 | 7 | from decimal import Decimal |
7 | 8 | from functools import partial |
8 | 9 |
|
@@ -67,7 +68,7 @@ def test_datetime_formats(stub_schema_builder): |
67 | 68 | generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns()) |
68 | 69 | dataset = generator.generate(100) |
69 | 70 |
|
70 | | - assert dataset["year"].str.match(r"\d{4}").all() |
| 71 | + assert dataset["year"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all() |
71 | 72 | assert dataset["datetime"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all() |
72 | 73 |
|
73 | 74 |
|
@@ -96,7 +97,7 @@ def test_timedelta(stub_schema_builder): |
96 | 97 | generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns()) |
97 | 98 | dataset = generator.generate(100) |
98 | 99 |
|
99 | | - assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}").all() |
| 100 | + assert dataset["new_date"].str.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}").all() |
100 | 101 |
|
101 | 102 | dt = lazy.pd.to_datetime(dataset["new_date"]) - lazy.pd.to_datetime(dataset["reference_date"]) |
102 | 103 | assert (dt <= lazy.pd.Timedelta(days=10)).all() |
@@ -142,6 +143,149 @@ def test_dataset_column_convert_datetime_format(stub_schema_builder): |
142 | 143 | assert lazy.pd.to_datetime(dataset["col_1"], format="%m/%d/%Y").notna().all() |
143 | 144 |
|
144 | 145 |
|
def test_datetime_single_record_returns_isoformat(stub_schema_builder):
    """Regression test for issue #484: a one-row sample is still full ISO-8601.

    Generates a single record from a day-granularity DATETIME sampler and
    checks that the value carries the ``T`` separator and parses with
    ``datetime.fromisoformat``.
    """
    sampler_params = {"start": "2024-01-01", "end": "2026-12-31", "unit": "D"}
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params=sampler_params,
    )

    sampler_columns = stub_schema_builder.to_sampler_columns()
    frame = DatasetGenerator(sampler_columns=sampler_columns).generate(1)

    value = frame["ts"].iloc[0]
    assert "T" in value, f"Expected ISO-8601 format but got: {value}"
    # Raises ValueError if the string is not parseable as ISO-8601.
    datetime.fromisoformat(value)
| 158 | + |
| 159 | + |
def test_datetime_all_same_month_returns_isoformat(stub_schema_builder):
    """Values sampled from a range confined to March 2024 keep full ISO-8601 form.

    Every one of the ten generated strings must contain the ``T`` separator
    and be accepted by ``datetime.fromisoformat``.
    """
    march_only = {"start": "2024-03-01", "end": "2024-03-31", "unit": "D"}
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params=march_only,
    )

    sampler_columns = stub_schema_builder.to_sampler_columns()
    frame = DatasetGenerator(sampler_columns=sampler_columns).generate(10)

    timestamps = frame["ts"]
    for value in timestamps:
        assert "T" in value, f"Expected ISO-8601 format but got: {value}"
        # Raises ValueError if the string is not parseable as ISO-8601.
        datetime.fromisoformat(value)
| 171 | + |
| 172 | + |
@pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s"])
def test_datetime_all_units_preview_size(stub_schema_builder, unit):
    """Each sampling unit yields parseable ISO-8601 strings for a small sample.

    Runs once per unit, generates three records, and checks every value for
    the ``T`` separator and ``datetime.fromisoformat`` compatibility.
    """
    sampler_params = {"start": "2020-01-01", "end": "2025-12-31", "unit": unit}
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params=sampler_params,
    )

    sampler_columns = stub_schema_builder.to_sampler_columns()
    frame = DatasetGenerator(sampler_columns=sampler_columns).generate(3)

    timestamps = frame["ts"]
    for value in timestamps:
        assert "T" in value, f"unit={unit!r}: expected ISO-8601, got: {value}"
        # Raises ValueError if the string is not parseable as ISO-8601.
        datetime.fromisoformat(value)
| 186 | + |
| 187 | + |
def test_datetime_output_round_trips_through_pd_to_datetime(stub_schema_builder):
    """Output strings must survive pd.to_datetime() for downstream DataFrame joins/filters.

    Generates fifty second-granularity timestamps, parses the whole column
    with ``pd.to_datetime`` and checks that no value became ``NaT`` and that
    the result is a timezone-naive datetime64 column.
    """
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2020-01-01", "end": "2025-01-01", "unit": "s"},
    )
    generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
    dataset = generator.generate(50)

    parsed = lazy.pd.to_datetime(dataset["ts"])
    assert parsed.notna().all()
    # Check "is a naive datetime64 column" rather than pinning the exact
    # "datetime64[ns]" dtype: the resolution pd.to_datetime infers for parsed
    # strings is a pandas-version detail, not something this test is about.
    assert lazy.pd.api.types.is_datetime64_dtype(parsed)
| 200 | + |
| 201 | + |
def test_timedelta_single_record(stub_schema_builder):
    """A one-row sample yields ISO-8601 strings for DATETIME and TIMEDELTA columns.

    ``delivery_date`` is a TIMEDELTA column that references ``order_date``;
    both values of the single generated record must contain the ``T``
    separator and parse with ``datetime.fromisoformat``.
    """
    order_params = {"start": "2024-01-01", "end": "2024-12-31", "unit": "D"}
    delivery_params = {
        "dt_min": 1,
        "dt_max": 5,
        "reference_column_name": "order_date",
        "unit": "D",
    }
    stub_schema_builder.add_column(
        name="order_date",
        sampler_type=SamplerType.DATETIME,
        params=order_params,
    )
    stub_schema_builder.add_column(
        name="delivery_date",
        sampler_type=SamplerType.TIMEDELTA,
        params=delivery_params,
    )

    sampler_columns = stub_schema_builder.to_sampler_columns()
    frame = DatasetGenerator(sampler_columns=sampler_columns).generate(1)

    for col in ("order_date", "delivery_date"):
        value = frame[col].iloc[0]
        assert "T" in value, f"{col}: expected ISO-8601, got: {value}"
        # Raises ValueError if the string is not parseable as ISO-8601.
        datetime.fromisoformat(value)
| 220 | + |
| 221 | + |
def test_timedelta_hourly_units(stub_schema_builder):
    """TimeDelta with sub-day units must produce valid ISO-8601.

    ``event_end`` is a TIMEDELTA column offset from ``event_start`` by a
    number of hours. Every value in both columns must be a full ISO-8601
    timestamp, and the parsed difference must stay within the bounds asserted
    below (at least 1 hour, strictly less than 4 hours).
    """
    stub_schema_builder.add_column(
        name="event_start",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-06-01", "end": "2024-06-30", "unit": "h"},
    )
    stub_schema_builder.add_column(
        name="event_end",
        sampler_type=SamplerType.TIMEDELTA,
        params={"dt_min": 1, "dt_max": 4, "reference_column_name": "event_start", "unit": "h"},
    )
    generator = DatasetGenerator(sampler_columns=stub_schema_builder.to_sampler_columns())
    dataset = generator.generate(20)

    for col in ["event_start", "event_end"]:
        for value in dataset[col]:
            # fromisoformat() alone also accepts date-only strings such as
            # "2024-06-01", so require the "T" separator explicitly to make
            # sure the time component was not dropped.
            assert "T" in value, f"{col}: expected ISO-8601, got: {value}"
            datetime.fromisoformat(value)

    # Verify the timedelta relationship holds.
    starts = lazy.pd.to_datetime(dataset["event_start"])
    ends = lazy.pd.to_datetime(dataset["event_end"])
    deltas = ends - starts
    assert (deltas >= lazy.pd.Timedelta(hours=1)).all()
    assert (deltas < lazy.pd.Timedelta(hours=4)).all()
| 245 | + |
| 246 | + |
def test_multiple_datetime_columns_independent(stub_schema_builder):
    """Two DATETIME columns with different settings keep their own output format.

    ``created_at`` has no ``convert_to`` and must come out as ISO-8601 with a
    ``T`` separator; ``logged_at`` sets ``convert_to`` to a space-separated
    format and must follow that format instead.
    """
    logged_at_format = "%Y-%m-%d %H:%M:%S"
    stub_schema_builder.add_column(
        name="created_at",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2020-01-01", "end": "2020-12-31", "unit": "D"},
    )
    stub_schema_builder.add_column(
        name="logged_at",
        sampler_type=SamplerType.DATETIME,
        params={"start": "2024-06-01", "end": "2024-06-30", "unit": "s"},
        convert_to=logged_at_format,
    )

    sampler_columns = stub_schema_builder.to_sampler_columns()
    frame = DatasetGenerator(sampler_columns=sampler_columns).generate(10)

    # Default formatting: ISO-8601, so the "T" separator must be present.
    for created in frame["created_at"]:
        assert "T" in created
        datetime.fromisoformat(created)

    # Explicit convert_to: space-separated, so no "T" may appear and the
    # value must parse with exactly the configured format.
    for logged in frame["logged_at"]:
        assert "T" not in logged
        lazy.pd.to_datetime(logged, format=logged_at_format)
| 270 | + |
| 271 | + |
def test_datetime_narrow_range_single_day(stub_schema_builder):
    """A range confined to 2024-07-04 still yields full ISO-8601 timestamps.

    Each of the five generated values must contain the ``T`` separator, parse
    with ``datetime.fromisoformat`` and fall on 4 July 2024.
    """
    single_day = {
        "start": "2024-07-04 00:00:00",
        "end": "2024-07-04 23:59:59",
        "unit": "s",
    }
    stub_schema_builder.add_column(
        name="ts",
        sampler_type=SamplerType.DATETIME,
        params=single_day,
    )

    sampler_columns = stub_schema_builder.to_sampler_columns()
    frame = DatasetGenerator(sampler_columns=sampler_columns).generate(5)

    for value in frame["ts"]:
        assert "T" in value, f"Expected ISO-8601 format but got: {value}"
        moment = datetime.fromisoformat(value)
        # The sampled range never leaves this calendar day.
        assert (moment.year, moment.month, moment.day) == (2024, 7, 4)
| 287 | + |
| 288 | + |
145 | 289 | def test_dataset_with_conditionals(stub_schema_builder): |
146 | 290 | stub_schema_builder.add_column( |
147 | 291 | name="col_1", |
|
0 commit comments