From 68557c15b0eebf15a9173d98adae90cb26d2a594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 21:22:01 +0200 Subject: [PATCH 1/6] feat(results): add export() method and --output-format CLI flag Adds DatasetCreationResults.export(path, format=) supporting jsonl, csv, and parquet. The CLI create command gains --output-format / -f which writes dataset.{format} alongside the parquet batch files. --- .../src/data_designer/cli/commands/create.py | 12 ++++ .../cli/controllers/generation_controller.py | 21 ++++++ .../src/data_designer/interface/results.py | 41 ++++++++++- .../tests/cli/commands/test_create_command.py | 31 +++++++- packages/data-designer/tests/cli/test_main.py | 1 + .../tests/interface/test_results.py | 71 +++++++++++++++++++ 6 files changed, 174 insertions(+), 3 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/create.py b/packages/data-designer/src/data_designer/cli/commands/create.py index 3bf3265f0..1f9506dca 100644 --- a/packages/data-designer/src/data_designer/cli/commands/create.py +++ b/packages/data-designer/src/data_designer/cli/commands/create.py @@ -7,6 +7,7 @@ from data_designer.cli.controllers.generation_controller import GenerationController from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS +from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS def create_command( @@ -35,6 +36,16 @@ def create_command( "-o", help="Path where generated artifacts will be stored. Defaults to ./artifacts.", ), + output_format: str | None = typer.Option( + None, + "--output-format", + "-f", + help=( + f"Export the dataset to a single file after generation. " + f"Supported formats: {', '.join(SUPPORTED_EXPORT_FORMATS)}. " + "The file is written to //dataset.." + ), + ), ) -> None: """Create a full dataset and save results to disk. 
@@ -60,4 +71,5 @@ def create_command( num_records=num_records, dataset_name=dataset_name, artifact_path=artifact_path, + output_format=output_format, ) diff --git a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py index 74a44c3cd..bdac0a95f 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py @@ -116,6 +116,7 @@ def run_create( num_records: int, dataset_name: str, artifact_path: str | None, + output_format: str | None = None, ) -> None: """Load config, create a full dataset, and save results to disk. @@ -124,6 +125,8 @@ def run_create( num_records: Number of records to generate. dataset_name: Name for the generated dataset folder. artifact_path: Path where generated artifacts will be stored, or None for default. + output_format: If set, export the dataset to a single file in this format after + generation. One of 'jsonl', 'csv', 'parquet'. """ config_builder = self._load_config(config_source) @@ -157,6 +160,24 @@ def run_create( console.print() print_success(f"Dataset created — {len(dataset)} record(s) generated") console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]") + + if output_format is not None: + from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS + + if output_format not in SUPPORTED_EXPORT_FORMATS: + print_error( + f"Unsupported export format: {output_format!r}. " + f"Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." 
+ ) + raise typer.Exit(code=1) + export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}" + try: + results.export(export_path, format=output_format) # type: ignore[arg-type] + except Exception as e: + print_error(f"Export failed: {e}") + raise typer.Exit(code=1) + console.print(f" Exported to: [bold]{export_path}[/bold]") + console.print() def _load_config(self, config_source: str) -> DataDesignerConfigBuilder: diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 07692ff00..fce35f8d3 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -4,7 +4,7 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -19,6 +19,9 @@ from data_designer.engine.dataset_builders.utils.task_model import TaskTrace +ExportFormat = Literal["jsonl", "csv", "parquet"] +SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = ("jsonl", "csv", "parquet") + class DatasetCreationResults(WithRecordSamplerMixin): """Results container for a Data Designer dataset creation run. @@ -95,6 +98,42 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name + def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path: + """Export the generated dataset to a single file. + + Args: + path: Output file path. The extension is not inferred from *format* — + the exact path is used as-is. + format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``. + Defaults to ``'jsonl'``. 
+ + Returns: + Path to the written file. + + Raises: + ValueError: If an unsupported format is requested. + + Example: + >>> results = data_designer.create(config, num_records=1000) + >>> results.export("output.jsonl") + PosixPath('output.jsonl') + >>> results.export("output.csv", format="csv") + PosixPath('output.csv') + """ + if format not in SUPPORTED_EXPORT_FORMATS: + raise ValueError( + f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." + ) + path = Path(path) + df = self.load_dataset() + if format == "jsonl": + df.to_json(path, orient="records", lines=True, force_ascii=False) + elif format == "csv": + df.to_csv(path, index=False) + elif format == "parquet": + df.to_parquet(path, index=False) + return path + def push_to_hub( self, repo_id: str, diff --git a/packages/data-designer/tests/cli/commands/test_create_command.py b/packages/data-designer/tests/cli/commands/test_create_command.py index 30cae9bc7..cde4fff06 100644 --- a/packages/data-designer/tests/cli/commands/test_create_command.py +++ b/packages/data-designer/tests/cli/commands/test_create_command.py @@ -18,7 +18,7 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None) + create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None) mock_ctrl_cls.assert_called_once() mock_ctrl.run_create.assert_called_once_with( @@ -26,6 +26,7 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non num_records=10, dataset_name="dataset", artifact_path=None, + output_format=None, ) @@ -40,6 +41,7 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None: num_records=100, dataset_name="my_data", artifact_path="/custom/output", + output_format=None, ) 
mock_ctrl.run_create.assert_called_once_with( @@ -47,6 +49,7 @@ def test_create_command_passes_custom_options(mock_ctrl_cls: MagicMock) -> None: num_records=100, dataset_name="my_data", artifact_path="/custom/output", + output_format=None, ) @@ -56,11 +59,35 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock) mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None) + create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None) mock_ctrl.run_create.assert_called_once_with( config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, + output_format=None, + ) + + +@patch("data_designer.cli.commands.create.GenerationController") +def test_create_command_passes_output_format(mock_ctrl_cls: MagicMock) -> None: + """Test create_command forwards --output-format to the controller.""" + mock_ctrl = MagicMock() + mock_ctrl_cls.return_value = mock_ctrl + + create_command( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + output_format="jsonl", + ) + + mock_ctrl.run_create.assert_called_once_with( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + output_format="jsonl", ) diff --git a/packages/data-designer/tests/cli/test_main.py b/packages/data-designer/tests/cli/test_main.py index 7a4ec5553..83866c22e 100644 --- a/packages/data-designer/tests/cli/test_main.py +++ b/packages/data-designer/tests/cli/test_main.py @@ -40,4 +40,5 @@ def test_app_dispatches_lazy_create_command(mock_controller_cls: Mock) -> None: num_records=DEFAULT_NUM_RECORDS, dataset_name="dataset", artifact_path=None, + output_format=None, ) diff --git a/packages/data-designer/tests/interface/test_results.py b/packages/data-designer/tests/interface/test_results.py index a28dd987e..802cfff6c 100644 --- 
a/packages/data-designer/tests/interface/test_results.py +++ b/packages/data-designer/tests/interface/test_results.py @@ -259,6 +259,77 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_ stub_artifact_storage.load_dataset.assert_called_once() +@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"]) +def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt): + """export() writes a file in the requested format.""" + out = tmp_path / f"out.{fmt}" + result = stub_dataset_creation_results.export(out, format=fmt) + assert result == out + assert out.exists() + assert out.stat().st_size > 0 + + +def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path): + """JSONL export writes one JSON object per line.""" + import json + + out = tmp_path / "out.jsonl" + stub_dataset_creation_results.export(out, format="jsonl") + lines = out.read_text(encoding="utf-8").splitlines() + assert len(lines) == len(stub_dataframe) + # Each line must be valid JSON + for line in lines: + json.loads(line) + + +def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path): + """CSV export has a header row and one data row per record.""" + import data_designer.lazy_heavy_imports as lazy + + out = tmp_path / "out.csv" + stub_dataset_creation_results.export(out, format="csv") + loaded = lazy.pd.read_csv(out) + assert list(loaded.columns) == list(stub_dataframe.columns) + assert len(loaded) == len(stub_dataframe) + + +def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path): + """Parquet export round-trips to the original DataFrame.""" + import data_designer.lazy_heavy_imports as lazy + + out = tmp_path / "out.parquet" + stub_dataset_creation_results.export(out, format="parquet") + loaded = lazy.pd.read_parquet(out) + lazy.pd.testing.assert_frame_equal(loaded.reset_index(drop=True), stub_dataframe.reset_index(drop=True)) + + +def 
test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path): + """export() defaults to JSONL when no format is given.""" + import json + + out = tmp_path / "out.jsonl" + stub_dataset_creation_results.export(out) + lines = out.read_text(encoding="utf-8").splitlines() + # All lines must be valid JSON + for line in lines: + json.loads(line) + + +def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path): + """export() raises ValueError for unknown formats.""" + with pytest.raises(ValueError, match="Unsupported export format"): + stub_dataset_creation_results.export(tmp_path / "out.xyz", format="xlsx") # type: ignore[arg-type] + + +def test_export_returns_path_object(stub_dataset_creation_results, tmp_path): + """export() returns a Path regardless of whether str or Path was passed.""" + from pathlib import Path + + out = tmp_path / "out.jsonl" + result = stub_dataset_creation_results.export(str(out)) + assert isinstance(result, Path) + + def test_preview_results_dataset_metadata() -> None: """Test that PreviewResults uses DatasetMetadata in display_sample_record.""" config_builder = MagicMock(spec=DataDesignerConfigBuilder) From bbd5ac96a06ca3b2aa2d457500369740c4c6b265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 22:37:54 +0200 Subject: [PATCH 2/6] fix(cli): validate output_format before dataset generation --- .../cli/controllers/generation_controller.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py index bdac0a95f..5b3498dfd 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py @@ -128,6 +128,14 @@ def run_create( output_format: If set, export the dataset to a 
single file in this format after generation. One of 'jsonl', 'csv', 'parquet'. """ + from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS + + if output_format is not None and output_format not in SUPPORTED_EXPORT_FORMATS: + print_error( + f"Unsupported export format: {output_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." + ) + raise typer.Exit(code=1) + config_builder = self._load_config(config_source) resolved_artifact_path = Path(artifact_path) if artifact_path else Path.cwd() / "artifacts" @@ -162,14 +170,6 @@ def run_create( console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]") if output_format is not None: - from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS - - if output_format not in SUPPORTED_EXPORT_FORMATS: - print_error( - f"Unsupported export format: {output_format!r}. " - f"Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." - ) - raise typer.Exit(code=1) export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}" try: results.export(export_path, format=output_format) # type: ignore[arg-type] From 56fbe31f7c320cee311b14fd66a303db6940d881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Mon, 13 Apr 2026 23:14:04 +0200 Subject: [PATCH 3/6] fix(cli): remove top-level results import from create.py to preserve lazy loading --- .../data-designer/src/data_designer/cli/commands/create.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/create.py b/packages/data-designer/src/data_designer/cli/commands/create.py index 1f9506dca..686d8031c 100644 --- a/packages/data-designer/src/data_designer/cli/commands/create.py +++ b/packages/data-designer/src/data_designer/cli/commands/create.py @@ -7,7 +7,6 @@ from data_designer.cli.controllers.generation_controller import GenerationController from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS 
-from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS def create_command( @@ -41,8 +40,8 @@ def create_command( "--output-format", "-f", help=( - f"Export the dataset to a single file after generation. " - f"Supported formats: {', '.join(SUPPORTED_EXPORT_FORMATS)}. " + "Export the dataset to a single file after generation. " + "Supported formats: jsonl, csv, parquet. " "The file is written to //dataset.." ), ), From 68fa965322bece8ee932487422f1e33eebc99090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Tue, 21 Apr 2026 21:39:53 +0200 Subject: [PATCH 4/6] =?UTF-8?q?fix(results):=20address=20andreatgretel=20r?= =?UTF-8?q?eview=20=E2=80=94=20error=20types,=20UX=20ordering,=20import=20?= =?UTF-8?q?hygiene?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Derive SUPPORTED_EXPORT_FORMATS from get_args(ExportFormat) so the two can't drift apart - Replace ValueError with InvalidFileFormatError in export() — consistent with project error conventions - Add date_format="iso" to to_json() for consistent datetime serialization across formats - Add click.Choice(SUPPORTED_EXPORT_FORMATS) to --output-format CLI option for parse-time validation, better --help output, and tab completion - Fix double load_dataset() in run_create: inline len() so the DataFrame ref dies before export - Move success message after the export block to avoid "Dataset created" followed by "Export failed" - Move imports to module level in test_results.py (json, Path, lazy already imported) - Add controller-level tests for output_format happy path, bad format rejection, and export failure --- .../src/data_designer/cli/commands/create.py | 3 + .../cli/controllers/generation_controller.py | 5 +- .../src/data_designer/interface/results.py | 9 +-- .../tests/cli/commands/test_create_command.py | 8 ++- .../controllers/test_generation_controller.py | 62 +++++++++++++++++++ .../tests/interface/test_results.py | 17 ++--- 6 files changed, 84 
insertions(+), 20 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/create.py b/packages/data-designer/src/data_designer/cli/commands/create.py index 686d8031c..ec0aa3570 100644 --- a/packages/data-designer/src/data_designer/cli/commands/create.py +++ b/packages/data-designer/src/data_designer/cli/commands/create.py @@ -3,10 +3,12 @@ from __future__ import annotations +import click import typer from data_designer.cli.controllers.generation_controller import GenerationController from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS +from data_designer.interface.results import SUPPORTED_EXPORT_FORMATS def create_command( @@ -39,6 +41,7 @@ def create_command( None, "--output-format", "-f", + click_type=click.Choice(list(SUPPORTED_EXPORT_FORMATS)), help=( "Export the dataset to a single file after generation. " "Supported formats: jsonl, csv, parquet. " diff --git a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py index 5b3498dfd..88096ed54 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py @@ -158,7 +158,7 @@ def run_create( print_error(f"Dataset creation failed: {e}") raise typer.Exit(code=1) - dataset = results.load_dataset() + num_records = len(results.load_dataset()) analysis = results.load_analysis() if analysis is not None: @@ -166,7 +166,6 @@ def run_create( analysis.to_report() console.print() - print_success(f"Dataset created — {len(dataset)} record(s) generated") console.print(f" Artifacts saved to: [bold]{results.artifact_storage.base_dataset_path}[/bold]") if output_format is not None: @@ -179,6 +178,8 @@ def run_create( console.print(f" Exported to: [bold]{export_path}[/bold]") console.print() + print_success(f"Dataset created — {num_records} record(s) generated") + 
console.print() def _load_config(self, config_source: str) -> DataDesignerConfigBuilder: """Load a config builder from the given source, exiting on failure. diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index fce35f8d3..4d33ae59e 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -4,11 +4,12 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Literal, get_args from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata +from data_designer.config.errors import InvalidFileFormatError from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.errors import ArtifactStorageError from data_designer.engine.storage.artifact_storage import ArtifactStorage @@ -20,7 +21,7 @@ from data_designer.engine.dataset_builders.utils.task_model import TaskTrace ExportFormat = Literal["jsonl", "csv", "parquet"] -SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = ("jsonl", "csv", "parquet") +SUPPORTED_EXPORT_FORMATS: tuple[str, ...] = get_args(ExportFormat) class DatasetCreationResults(WithRecordSamplerMixin): @@ -121,13 +122,13 @@ def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path: PosixPath('output.csv') """ if format not in SUPPORTED_EXPORT_FORMATS: - raise ValueError( + raise InvalidFileFormatError( f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." 
) path = Path(path) df = self.load_dataset() if format == "jsonl": - df.to_json(path, orient="records", lines=True, force_ascii=False) + df.to_json(path, orient="records", lines=True, force_ascii=False, date_format="iso") elif format == "csv": df.to_csv(path, index=False) elif format == "parquet": diff --git a/packages/data-designer/tests/cli/commands/test_create_command.py b/packages/data-designer/tests/cli/commands/test_create_command.py index cde4fff06..fc779df7c 100644 --- a/packages/data-designer/tests/cli/commands/test_create_command.py +++ b/packages/data-designer/tests/cli/commands/test_create_command.py @@ -18,7 +18,9 @@ def test_create_command_delegates_to_controller(mock_ctrl_cls: MagicMock) -> Non mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None) + create_command( + config_source="config.yaml", num_records=10, dataset_name="dataset", artifact_path=None, output_format=None + ) mock_ctrl_cls.assert_called_once() mock_ctrl.run_create.assert_called_once_with( @@ -59,7 +61,9 @@ def test_create_command_default_artifact_path_is_none(mock_ctrl_cls: MagicMock) mock_ctrl = MagicMock() mock_ctrl_cls.return_value = mock_ctrl - create_command(config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None) + create_command( + config_source="config.yaml", num_records=5, dataset_name="ds", artifact_path=None, output_format=None + ) mock_ctrl.run_create.assert_called_once_with( config_source="config.yaml", diff --git a/packages/data-designer/tests/cli/controllers/test_generation_controller.py b/packages/data-designer/tests/cli/controllers/test_generation_controller.py index de4918cff..3f40cc76f 100644 --- a/packages/data-designer/tests/cli/controllers/test_generation_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_generation_controller.py @@ -772,3 +772,65 @@ def 
test_run_create_skips_report_when_analysis_is_none(mock_load_config: MagicMo # load_analysis() returns None, so to_report() must not be called. # If the code ignores the None check, an AttributeError propagates and the test fails. mock_results.load_analysis.assert_called_once() + + +@patch(f"{_CTRL}.DataDesigner") +@patch(f"{_CTRL}.load_config_builder") +def test_run_create_with_output_format_happy_path(mock_load_config: MagicMock, mock_dd_cls: MagicMock) -> None: + """export() is called with the correct path and format when --output-format is given.""" + mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) + mock_dd = MagicMock() + mock_dd_cls.return_value = mock_dd + mock_results = _make_mock_create_results(5) + mock_dd.create.return_value = mock_results + + controller = GenerationController() + controller.run_create( + config_source="config.yaml", + num_records=5, + dataset_name="dataset", + artifact_path=None, + output_format="jsonl", + ) + + mock_results.export.assert_called_once_with( + Path("/output/artifacts/dataset") / "dataset.jsonl", + format="jsonl", + ) + + +def test_run_create_invalid_output_format_exits() -> None: + """Bad --output-format exits with code 1 before generation starts.""" + controller = GenerationController() + with pytest.raises(typer.Exit) as exc_info: + controller.run_create( + config_source="config.yaml", + num_records=10, + dataset_name="dataset", + artifact_path=None, + output_format="xlsx", + ) + assert exc_info.value.exit_code == 1 + + +@patch(f"{_CTRL}.DataDesigner") +@patch(f"{_CTRL}.load_config_builder") +def test_run_create_export_failure_exits(mock_load_config: MagicMock, mock_dd_cls: MagicMock) -> None: + """If export() raises, run_create exits with code 1.""" + mock_load_config.return_value = MagicMock(spec=DataDesignerConfigBuilder) + mock_dd = MagicMock() + mock_dd_cls.return_value = mock_dd + mock_results = _make_mock_create_results(5) + mock_results.export.side_effect = RuntimeError("disk full") + 
mock_dd.create.return_value = mock_results + + controller = GenerationController() + with pytest.raises(typer.Exit) as exc_info: + controller.run_create( + config_source="config.yaml", + num_records=5, + dataset_name="dataset", + artifact_path=None, + output_format="csv", + ) + assert exc_info.value.exit_code == 1 diff --git a/packages/data-designer/tests/interface/test_results.py b/packages/data-designer/tests/interface/test_results.py index 802cfff6c..715741cf1 100644 --- a/packages/data-designer/tests/interface/test_results.py +++ b/packages/data-designer/tests/interface/test_results.py @@ -3,6 +3,8 @@ from __future__ import annotations +import json +from pathlib import Path from unittest.mock import MagicMock, patch import pytest @@ -11,6 +13,7 @@ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata +from data_designer.config.errors import InvalidFileFormatError from data_designer.config.preview_results import PreviewResults from data_designer.config.utils.errors import DatasetSampleDisplayError from data_designer.config.utils.visualization import display_sample_record as display_fn @@ -271,8 +274,6 @@ def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt): def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path): """JSONL export writes one JSON object per line.""" - import json - out = tmp_path / "out.jsonl" stub_dataset_creation_results.export(out, format="jsonl") lines = out.read_text(encoding="utf-8").splitlines() @@ -284,8 +285,6 @@ def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path): """CSV export has a header row and one data row per record.""" - import data_designer.lazy_heavy_imports as lazy - out = tmp_path / "out.csv" 
stub_dataset_creation_results.export(out, format="csv") loaded = lazy.pd.read_csv(out) @@ -295,8 +294,6 @@ def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_p def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path): """Parquet export round-trips to the original DataFrame.""" - import data_designer.lazy_heavy_imports as lazy - out = tmp_path / "out.parquet" stub_dataset_creation_results.export(out, format="parquet") loaded = lazy.pd.read_parquet(out) @@ -305,8 +302,6 @@ def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, t def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path): """export() defaults to JSONL when no format is given.""" - import json - out = tmp_path / "out.jsonl" stub_dataset_creation_results.export(out) lines = out.read_text(encoding="utf-8").splitlines() @@ -316,15 +311,13 @@ def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path) def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path): - """export() raises ValueError for unknown formats.""" - with pytest.raises(ValueError, match="Unsupported export format"): + """export() raises InvalidFileFormatError for unknown formats.""" + with pytest.raises(InvalidFileFormatError, match="Unsupported export format"): stub_dataset_creation_results.export(tmp_path / "out.xyz", format="xlsx") # type: ignore[arg-type] def test_export_returns_path_object(stub_dataset_creation_results, tmp_path): """export() returns a Path regardless of whether str or Path was passed.""" - from pathlib import Path - out = tmp_path / "out.jsonl" result = stub_dataset_creation_results.export(str(out)) assert isinstance(result, Path) From 50a93d98c093c1db36b13dba506bedaeccd2e1b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Tue, 21 Apr 2026 21:43:53 +0200 Subject: [PATCH 5/6] =?UTF-8?q?fix(results):=20correct=20Raises=20docstrin?= 
=?UTF-8?q?g=20=E2=80=94=20ValueError=20->=20InvalidFileFormatError?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/data-designer/src/data_designer/interface/results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 4d33ae59e..782f42060 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -112,7 +112,7 @@ def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path: Path to the written file. Raises: - ValueError: If an unsupported format is requested. + InvalidFileFormatError: If an unsupported format is requested. Example: >>> results = data_designer.create(config, num_records=1000) From 6c1b5034acffd6830ed9df83e09c01f9ae6bf107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw?= Date: Wed, 22 Apr 2026 20:56:15 +0200 Subject: [PATCH 6/6] feat(results): stream batch files in export() to avoid OOM on large datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rewrite export() to read batch parquet files one at a time instead of materialising the full dataset via load_dataset(); peak memory is now proportional to a single batch regardless of dataset size - Infer output format from file extension by default; format= parameter kept as an explicit override (e.g. writing .txt as JSONL) - _export_parquet unifies schemas across batches (pa.unify_schemas) to handle type drift (e.g. 
int64 vs float64 in the same column) - Drop format= from the controller's export() call — path already carries the correct extension - Rewrite export tests around real batch parquet files (stub_batch_dir fixture); add tests for multi-batch output, schema unification, unknown extension, empty batch directory, and explicit format override --- .../cli/controllers/generation_controller.py | 2 +- .../src/data_designer/interface/results.py | 86 +++++++++++--- .../controllers/test_generation_controller.py | 1 - .../tests/interface/test_results.py | 110 ++++++++++++++---- 4 files changed, 159 insertions(+), 40 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py index 88096ed54..07f8c9957 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/generation_controller.py @@ -171,7 +171,7 @@ def run_create( if output_format is not None: export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}" try: - results.export(export_path, format=output_format) # type: ignore[arg-type] + results.export(export_path) except Exception as e: print_error(f"Export failed: {e}") raise typer.Exit(code=1) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 782f42060..5670f3548 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal, get_args +import data_designer.lazy_heavy_imports as lazy from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder from 
data_designer.config.dataset_metadata import DatasetMetadata @@ -99,40 +100,55 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name - def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path: - """Export the generated dataset to a single file. + def export(self, path: Path | str, *, format: ExportFormat | None = None) -> Path: + """Export the generated dataset to a single file by streaming batch files. + + The output format is inferred from the file extension when *format* is + omitted. Pass *format* explicitly to override the extension (e.g. write a + ``.txt`` file as JSONL). + + Unlike :meth:`load_dataset`, this method never materialises the full dataset + in memory — it reads batch parquet files one at a time and appends each to + the output file, keeping peak memory proportional to a single batch. Args: - path: Output file path. The extension is not inferred from *format* — - the exact path is used as-is. + path: Output file path. The exact path is used as-is; the extension is + not rewritten. format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``. - Defaults to ``'jsonl'``. + When omitted, the format is inferred from the file extension. Returns: Path to the written file. Raises: - InvalidFileFormatError: If an unsupported format is requested. + InvalidFileFormatError: If the format cannot be determined or is not + one of the supported values. + ArtifactStorageError: If no batch parquet files are found. 
Example: >>> results = data_designer.create(config, num_records=1000) >>> results.export("output.jsonl") PosixPath('output.jsonl') - >>> results.export("output.csv", format="csv") + >>> results.export("output.csv") PosixPath('output.csv') + >>> results.export("output.txt", format="jsonl") + PosixPath('output.txt') """ - if format not in SUPPORTED_EXPORT_FORMATS: + path = Path(path) + resolved_format: str = format if format is not None else path.suffix.lstrip(".") + if resolved_format not in SUPPORTED_EXPORT_FORMATS: raise InvalidFileFormatError( - f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." + f"Unsupported export format: {resolved_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." ) - path = Path(path) - df = self.load_dataset() - if format == "jsonl": - df.to_json(path, orient="records", lines=True, force_ascii=False, date_format="iso") - elif format == "csv": - df.to_csv(path, index=False) - elif format == "parquet": - df.to_parquet(path, index=False) + batch_files = sorted(self.artifact_storage.final_dataset_path.glob("batch_*.parquet")) + if not batch_files: + raise ArtifactStorageError("No batch parquet files found to export.") + if resolved_format == "jsonl": + _export_jsonl(batch_files, path) + elif resolved_format == "csv": + _export_csv(batch_files, path) + elif resolved_format == "parquet": + _export_parquet(batch_files, path) return path def push_to_hub( @@ -180,3 +196,39 @@ def push_to_hub( description=description, tags=tags, ) + + +def _export_jsonl(batch_files: list[Path], output: Path) -> None: + """Write *batch_files* to *output* as JSONL, one record per line. + + Each batch is appended in turn so peak memory stays proportional to one batch. 
+ """ + with output.open("w", encoding="utf-8") as f: + for batch_file in batch_files: + chunk = lazy.pd.read_parquet(batch_file) + content = chunk.to_json(orient="records", lines=True, force_ascii=False, date_format="iso") + f.write(content) + if not content.endswith("\n"): + f.write("\n") + + +def _export_csv(batch_files: list[Path], output: Path) -> None: + """Write *batch_files* to *output* as CSV with a single header row.""" + for i, batch_file in enumerate(batch_files): + chunk = lazy.pd.read_parquet(batch_file) + chunk.to_csv(output, mode="a" if i > 0 else "w", header=(i == 0), index=False) + + +def _export_parquet(batch_files: list[Path], output: Path) -> None: + """Write *batch_files* to *output* as a single Parquet file. + + Schemas are unified across batches before writing so that columns with minor + type drift (e.g. ``int64`` vs ``float64`` across batches) are cast to a + consistent schema rather than causing a write error. + """ + schemas = [lazy.pq.read_schema(f) for f in batch_files] + unified_schema = lazy.pa.unify_schemas(schemas) + with lazy.pq.ParquetWriter(output, unified_schema) as writer: + for batch_file in batch_files: + table = lazy.pq.read_table(batch_file) + writer.write_table(table.cast(unified_schema)) diff --git a/packages/data-designer/tests/cli/controllers/test_generation_controller.py b/packages/data-designer/tests/cli/controllers/test_generation_controller.py index 3f40cc76f..87408bde4 100644 --- a/packages/data-designer/tests/cli/controllers/test_generation_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_generation_controller.py @@ -795,7 +795,6 @@ def test_run_create_with_output_format_happy_path(mock_load_config: MagicMock, m mock_results.export.assert_called_once_with( Path("/output/artifacts/dataset") / "dataset.jsonl", - format="jsonl", ) diff --git a/packages/data-designer/tests/interface/test_results.py b/packages/data-designer/tests/interface/test_results.py index 715741cf1..f0946ef40 100644 --- 
a/packages/data-designer/tests/interface/test_results.py +++ b/packages/data-designer/tests/interface/test_results.py @@ -17,6 +17,7 @@ from data_designer.config.preview_results import PreviewResults from data_designer.config.utils.errors import DatasetSampleDisplayError from data_designer.config.utils.visualization import display_sample_record as display_fn +from data_designer.engine.dataset_builders.errors import ArtifactStorageError from data_designer.engine.storage.artifact_storage import ArtifactStorage from data_designer.interface.results import DatasetCreationResults @@ -262,62 +263,129 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_ stub_artifact_storage.load_dataset.assert_called_once() +@pytest.fixture +def stub_batch_dir(stub_dataframe, tmp_path): + """Directory with two batch parquet files split from stub_dataframe. + + Splitting into two batches exercises the multi-batch streaming path in export(). + """ + batch_dir = tmp_path / "parquet-files" + batch_dir.mkdir() + mid = len(stub_dataframe) // 2 + stub_dataframe.iloc[:mid].to_parquet(batch_dir / "batch_00000.parquet", index=False) + stub_dataframe.iloc[mid:].to_parquet(batch_dir / "batch_00001.parquet", index=False) + return batch_dir + + @pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"]) -def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt): - """export() writes a file in the requested format.""" +def test_export_writes_file(stub_dataset_creation_results, stub_batch_dir, tmp_path, fmt) -> None: + """export() writes a non-empty file for each supported format.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir out = tmp_path / f"out.{fmt}" - result = stub_dataset_creation_results.export(out, format=fmt) + result = stub_dataset_creation_results.export(out) assert result == out assert out.exists() assert out.stat().st_size > 0 -def test_export_jsonl_content(stub_dataset_creation_results, 
stub_dataframe, tmp_path): - """JSONL export writes one JSON object per line.""" +def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None: + """JSONL export writes one valid JSON object per line, covering all records.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir out = tmp_path / "out.jsonl" - stub_dataset_creation_results.export(out, format="jsonl") + stub_dataset_creation_results.export(out) lines = out.read_text(encoding="utf-8").splitlines() assert len(lines) == len(stub_dataframe) - # Each line must be valid JSON for line in lines: json.loads(line) -def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path): - """CSV export has a header row and one data row per record.""" +def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None: + """CSV export produces a single header row and one data row per record.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir out = tmp_path / "out.csv" - stub_dataset_creation_results.export(out, format="csv") + stub_dataset_creation_results.export(out) loaded = lazy.pd.read_csv(out) assert list(loaded.columns) == list(stub_dataframe.columns) assert len(loaded) == len(stub_dataframe) -def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path): - """Parquet export round-trips to the original DataFrame.""" +def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None: + """Parquet export round-trips to the original DataFrame across two batches.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir out = tmp_path / "out.parquet" - stub_dataset_creation_results.export(out, format="parquet") + stub_dataset_creation_results.export(out) loaded = lazy.pd.read_parquet(out) - 
lazy.pd.testing.assert_frame_equal(loaded.reset_index(drop=True), stub_dataframe.reset_index(drop=True)) + lazy.pd.testing.assert_frame_equal( + loaded.reset_index(drop=True), + stub_dataframe.reset_index(drop=True), + ) -def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path): - """export() defaults to JSONL when no format is given.""" +def test_export_infers_format_from_extension(stub_dataset_creation_results, stub_batch_dir, tmp_path) -> None: + """export() infers the output format from the file extension when format is omitted.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir out = tmp_path / "out.jsonl" stub_dataset_creation_results.export(out) lines = out.read_text(encoding="utf-8").splitlines() - # All lines must be valid JSON for line in lines: json.loads(line) -def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path): - """export() raises InvalidFileFormatError for unknown formats.""" +def test_export_explicit_format_overrides_extension( + stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path +) -> None: + """Passing format= explicitly overrides extension-based inference.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir + out = tmp_path / "data.txt" + stub_dataset_creation_results.export(out, format="jsonl") + lines = out.read_text(encoding="utf-8").splitlines() + assert len(lines) == len(stub_dataframe) + for line in lines: + json.loads(line) + + +def test_export_parquet_schema_unification(stub_dataset_creation_results, tmp_path) -> None: + """Parquet export unifies schemas across batches with diverging column types.""" + batch_dir = tmp_path / "parquet-files" + batch_dir.mkdir() + # Batch 0: 'value' as int64; Batch 1: 'value' as float64 (type drift) + lazy.pd.DataFrame({"value": lazy.pd.array([1, 2], dtype="int64")}).to_parquet( + batch_dir / "batch_00000.parquet", index=False + ) + lazy.pd.DataFrame({"value": 
lazy.pd.array([3.0, 4.0], dtype="float64")}).to_parquet( + batch_dir / "batch_00001.parquet", index=False + ) + stub_dataset_creation_results.artifact_storage.final_dataset_path = batch_dir + out = tmp_path / "out.parquet" + stub_dataset_creation_results.export(out) + loaded = lazy.pd.read_parquet(out) + assert list(loaded["value"]) == [1.0, 2.0, 3.0, 4.0] + + +def test_export_unknown_extension_raises(stub_dataset_creation_results, tmp_path) -> None: + """export() raises InvalidFileFormatError when the extension is not a supported format.""" with pytest.raises(InvalidFileFormatError, match="Unsupported export format"): - stub_dataset_creation_results.export(tmp_path / "out.xyz", format="xlsx") # type: ignore[arg-type] + stub_dataset_creation_results.export(tmp_path / "out.xyz") + + +def test_export_unsupported_explicit_format_raises(stub_dataset_creation_results, tmp_path) -> None: + """export() raises InvalidFileFormatError for an explicit unsupported format override.""" + with pytest.raises(InvalidFileFormatError, match="Unsupported export format"): + stub_dataset_creation_results.export(tmp_path / "out.jsonl", format="xlsx") # type: ignore[arg-type] + + +def test_export_no_batch_files_raises(stub_dataset_creation_results, tmp_path) -> None: + """export() raises ArtifactStorageError when the batch directory is empty.""" + empty_dir = tmp_path / "parquet-files" + empty_dir.mkdir() + stub_dataset_creation_results.artifact_storage.final_dataset_path = empty_dir + with pytest.raises(ArtifactStorageError, match="No batch parquet files found"): + stub_dataset_creation_results.export(tmp_path / "out.jsonl") -def test_export_returns_path_object(stub_dataset_creation_results, tmp_path): +def test_export_returns_path_object(stub_dataset_creation_results, stub_batch_dir, tmp_path) -> None: """export() returns a Path regardless of whether str or Path was passed.""" + stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir out = tmp_path / 
"out.jsonl" result = stub_dataset_creation_results.export(str(out)) assert isinstance(result, Path)