|
17 | 17 | from data_designer.config.preview_results import PreviewResults |
18 | 18 | from data_designer.config.utils.errors import DatasetSampleDisplayError |
19 | 19 | from data_designer.config.utils.visualization import display_sample_record as display_fn |
| 20 | +from data_designer.engine.dataset_builders.errors import ArtifactStorageError |
20 | 21 | from data_designer.engine.storage.artifact_storage import ArtifactStorage |
21 | 22 | from data_designer.interface.results import DatasetCreationResults |
22 | 23 |
|
@@ -262,62 +263,129 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_ |
262 | 263 | stub_artifact_storage.load_dataset.assert_called_once() |
263 | 264 |
|
264 | 265 |
|
@pytest.fixture
def stub_batch_dir(stub_dataframe, tmp_path):
    """Directory holding stub_dataframe split across two batch parquet files.

    Writing two batches ensures export() exercises its multi-batch streaming path.
    """
    out_dir = tmp_path / "parquet-files"
    out_dir.mkdir()
    split_at = len(stub_dataframe) // 2
    # First half goes into batch 0, remainder into batch 1.
    stub_dataframe.iloc[:split_at].to_parquet(out_dir / "batch_00000.parquet", index=False)
    stub_dataframe.iloc[split_at:].to_parquet(out_dir / "batch_00001.parquet", index=False)
    return out_dir
| 279 | + |
@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"])
def test_export_writes_file(stub_dataset_creation_results, stub_batch_dir, tmp_path, fmt) -> None:
    """export() writes a non-empty file for each supported format."""
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    target = tmp_path / f"out.{fmt}"
    returned = stub_dataset_creation_results.export(target)
    # The call reports the destination it wrote, and the file has real content.
    assert returned == target
    assert target.exists()
    assert target.stat().st_size > 0
273 | 289 |
|
274 | 290 |
|
def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None:
    """JSONL export writes one valid JSON object per line, covering all records."""
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    target = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(target)
    exported_lines = target.read_text(encoding="utf-8").splitlines()
    assert len(exported_lines) == len(stub_dataframe)
    # json.loads raises on any line that is not standalone JSON.
    for raw_line in exported_lines:
        json.loads(raw_line)
284 | 300 |
|
285 | 301 |
|
def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None:
    """CSV export produces a single header row and one data row per record."""
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    target = tmp_path / "out.csv"
    stub_dataset_creation_results.export(target)
    round_tripped = lazy.pd.read_csv(target)
    # Same row count and identical column ordering as the source frame.
    assert len(round_tripped) == len(stub_dataframe)
    assert list(round_tripped.columns) == list(stub_dataframe.columns)
293 | 310 |
|
294 | 311 |
|
def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None:
    """Parquet export round-trips to the original DataFrame across two batches."""
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    target = tmp_path / "out.parquet"
    stub_dataset_creation_results.export(target)
    # Index values may differ after batching, so compare positionally.
    round_tripped = lazy.pd.read_parquet(target).reset_index(drop=True)
    expected = stub_dataframe.reset_index(drop=True)
    lazy.pd.testing.assert_frame_equal(round_tripped, expected)
301 | 322 |
|
302 | 323 |
|
def test_export_infers_format_from_extension(stub_dataset_creation_results, stub_batch_dir, tmp_path) -> None:
    """export() infers the output format from the file extension when format is omitted.

    Writes to a .jsonl path without a format= argument and checks the result
    parses line-by-line as JSON.
    """
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    out = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(out)
    lines = out.read_text(encoding="utf-8").splitlines()
    # Guard against a vacuous pass: with zero lines the loop below asserts nothing.
    assert lines
    for line in lines:
        json.loads(line)
311 | 332 |
|
312 | 333 |
|
def test_export_explicit_format_overrides_extension(
    stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path
) -> None:
    """Passing format= explicitly overrides extension-based inference."""
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    # .txt would not be inferred as JSONL, so the explicit format must win.
    target = tmp_path / "data.txt"
    stub_dataset_creation_results.export(target, format="jsonl")
    exported_lines = target.read_text(encoding="utf-8").splitlines()
    assert len(exported_lines) == len(stub_dataframe)
    for raw_line in exported_lines:
        json.loads(raw_line)
| 345 | + |
| 346 | + |
def test_export_parquet_schema_unification(stub_dataset_creation_results, tmp_path) -> None:
    """Parquet export unifies schemas across batches with diverging column types."""
    batch_dir = tmp_path / "parquet-files"
    batch_dir.mkdir()
    # Simulate type drift: batch 0 stores 'value' as int64, batch 1 as float64.
    int_batch = lazy.pd.DataFrame({"value": lazy.pd.array([1, 2], dtype="int64")})
    float_batch = lazy.pd.DataFrame({"value": lazy.pd.array([3.0, 4.0], dtype="float64")})
    int_batch.to_parquet(batch_dir / "batch_00000.parquet", index=False)
    float_batch.to_parquet(batch_dir / "batch_00001.parquet", index=False)
    stub_dataset_creation_results.artifact_storage.final_dataset_path = batch_dir
    out = tmp_path / "out.parquet"
    stub_dataset_creation_results.export(out)
    loaded = lazy.pd.read_parquet(out)
    assert list(loaded["value"]) == [1.0, 2.0, 3.0, 4.0]
| 363 | + |
| 364 | + |
def test_export_unknown_extension_raises(stub_dataset_creation_results, tmp_path) -> None:
    """export() raises InvalidFileFormatError when the extension is not a supported format."""
    destination = tmp_path / "out.xyz"
    with pytest.raises(InvalidFileFormatError, match="Unsupported export format"):
        stub_dataset_creation_results.export(destination)
| 369 | + |
| 370 | + |
def test_export_unsupported_explicit_format_raises(stub_dataset_creation_results, tmp_path) -> None:
    """export() raises InvalidFileFormatError for an explicit unsupported format override."""
    destination = tmp_path / "out.jsonl"
    with pytest.raises(InvalidFileFormatError, match="Unsupported export format"):
        stub_dataset_creation_results.export(destination, format="xlsx")  # type: ignore[arg-type]
| 375 | + |
| 376 | + |
def test_export_no_batch_files_raises(stub_dataset_creation_results, tmp_path) -> None:
    """export() raises ArtifactStorageError when the batch directory is empty."""
    # A directory that exists but contains no batch files at all.
    batch_dir = tmp_path / "parquet-files"
    batch_dir.mkdir()
    stub_dataset_creation_results.artifact_storage.final_dataset_path = batch_dir
    with pytest.raises(ArtifactStorageError, match="No batch parquet files found"):
        stub_dataset_creation_results.export(tmp_path / "out.jsonl")
317 | 384 |
|
318 | 385 |
|
def test_export_returns_path_object(stub_dataset_creation_results, stub_batch_dir, tmp_path) -> None:
    """export() returns a Path regardless of whether str or Path was passed.

    The docstring contract covers both input types, so both are exercised here
    (the original only tested the str input).
    """
    stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
    out = tmp_path / "out.jsonl"
    # str input: exercises the coercion path.
    result_from_str = stub_dataset_creation_results.export(str(out))
    assert isinstance(result_from_str, Path)
    # Path input: must come back as a Path unchanged in type.
    result_from_path = stub_dataset_creation_results.export(out)
    assert isinstance(result_from_path, Path)
|
0 commit comments