Skip to content

Commit 6c1b503

Browse files
committed
feat(results): stream batch files in export() to avoid OOM on large datasets
- Rewrite export() to read batch parquet files one at a time instead of materialising the full dataset via load_dataset(); peak memory is now proportional to a single batch regardless of dataset size
- Infer output format from file extension by default; format= parameter kept as an explicit override (e.g. writing .txt as JSONL)
- _export_parquet unifies schemas across batches (pa.unify_schemas) to handle type drift (e.g. int64 vs float64 in the same column)
- Drop format= from the controller's export() call — path already carries the correct extension
- Rewrite export tests around real batch parquet files (stub_batch_dir fixture); add tests for multi-batch output, schema unification, unknown extension, empty batch directory, and explicit format override
1 parent 50a93d9 commit 6c1b503

4 files changed

Lines changed: 159 additions & 40 deletions

File tree

packages/data-designer/src/data_designer/cli/controllers/generation_controller.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def run_create(
171171
if output_format is not None:
172172
export_path = results.artifact_storage.base_dataset_path / f"dataset.{output_format}"
173173
try:
174-
results.export(export_path, format=output_format) # type: ignore[arg-type]
174+
results.export(export_path)
175175
except Exception as e:
176176
print_error(f"Export failed: {e}")
177177
raise typer.Exit(code=1)

packages/data-designer/src/data_designer/interface/results.py

Lines changed: 69 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from typing import TYPE_CHECKING, Literal, get_args
88

9+
import data_designer.lazy_heavy_imports as lazy
910
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
1011
from data_designer.config.config_builder import DataDesignerConfigBuilder
1112
from data_designer.config.dataset_metadata import DatasetMetadata
@@ -99,40 +100,55 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path:
99100
raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.")
100101
return self.artifact_storage.processors_outputs_path / processor_name
101102

102-
def export(self, path: Path | str, *, format: ExportFormat = "jsonl") -> Path:
103-
"""Export the generated dataset to a single file.
103+
def export(self, path: Path | str, *, format: ExportFormat | None = None) -> Path:
104+
"""Export the generated dataset to a single file by streaming batch files.
105+
106+
The output format is inferred from the file extension when *format* is
107+
omitted. Pass *format* explicitly to override the extension (e.g. write a
108+
``.txt`` file as JSONL).
109+
110+
Unlike :meth:`load_dataset`, this method never materialises the full dataset
111+
in memory — it reads batch parquet files one at a time and appends each to
112+
the output file, keeping peak memory proportional to a single batch.
104113
105114
Args:
106-
path: Output file path. The extension is not inferred from *format* —
107-
the exact path is used as-is.
115+
path: Output file path. The exact path is used as-is; the extension is
116+
not rewritten.
108117
format: Output format. One of ``'jsonl'``, ``'csv'``, or ``'parquet'``.
109-
Defaults to ``'jsonl'``.
118+
When omitted, the format is inferred from the file extension.
110119
111120
Returns:
112121
Path to the written file.
113122
114123
Raises:
115-
InvalidFileFormatError: If an unsupported format is requested.
124+
InvalidFileFormatError: If the format cannot be determined or is not
125+
one of the supported values.
126+
ArtifactStorageError: If no batch parquet files are found.
116127
117128
Example:
118129
>>> results = data_designer.create(config, num_records=1000)
119130
>>> results.export("output.jsonl")
120131
PosixPath('output.jsonl')
121-
>>> results.export("output.csv", format="csv")
132+
>>> results.export("output.csv")
122133
PosixPath('output.csv')
134+
>>> results.export("output.txt", format="jsonl")
135+
PosixPath('output.txt')
123136
"""
124-
if format not in SUPPORTED_EXPORT_FORMATS:
137+
path = Path(path)
138+
resolved_format: str = format if format is not None else path.suffix.lstrip(".")
139+
if resolved_format not in SUPPORTED_EXPORT_FORMATS:
125140
raise InvalidFileFormatError(
126-
f"Unsupported export format: {format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}."
141+
f"Unsupported export format: {resolved_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}."
127142
)
128-
path = Path(path)
129-
df = self.load_dataset()
130-
if format == "jsonl":
131-
df.to_json(path, orient="records", lines=True, force_ascii=False, date_format="iso")
132-
elif format == "csv":
133-
df.to_csv(path, index=False)
134-
elif format == "parquet":
135-
df.to_parquet(path, index=False)
143+
batch_files = sorted(self.artifact_storage.final_dataset_path.glob("batch_*.parquet"))
144+
if not batch_files:
145+
raise ArtifactStorageError("No batch parquet files found to export.")
146+
if resolved_format == "jsonl":
147+
_export_jsonl(batch_files, path)
148+
elif resolved_format == "csv":
149+
_export_csv(batch_files, path)
150+
elif resolved_format == "parquet":
151+
_export_parquet(batch_files, path)
136152
return path
137153

138154
def push_to_hub(
@@ -180,3 +196,39 @@ def push_to_hub(
180196
description=description,
181197
tags=tags,
182198
)
199+
200+
201+
def _export_jsonl(batch_files: list[Path], output: Path) -> None:
202+
"""Write *batch_files* to *output* as JSONL, one record per line.
203+
204+
Each batch is appended in turn so peak memory stays proportional to one batch.
205+
"""
206+
with output.open("w", encoding="utf-8") as f:
207+
for batch_file in batch_files:
208+
chunk = lazy.pd.read_parquet(batch_file)
209+
content = chunk.to_json(orient="records", lines=True, force_ascii=False, date_format="iso")
210+
f.write(content)
211+
if not content.endswith("\n"):
212+
f.write("\n")
213+
214+
215+
def _export_csv(batch_files: list[Path], output: Path) -> None:
216+
"""Write *batch_files* to *output* as CSV with a single header row."""
217+
for i, batch_file in enumerate(batch_files):
218+
chunk = lazy.pd.read_parquet(batch_file)
219+
chunk.to_csv(output, mode="a" if i > 0 else "w", header=(i == 0), index=False)
220+
221+
222+
def _export_parquet(batch_files: list[Path], output: Path) -> None:
223+
"""Write *batch_files* to *output* as a single Parquet file.
224+
225+
Schemas are unified across batches before writing so that columns with minor
226+
type drift (e.g. ``int64`` vs ``float64`` across batches) are cast to a
227+
consistent schema rather than causing a write error.
228+
"""
229+
schemas = [lazy.pq.read_schema(f) for f in batch_files]
230+
unified_schema = lazy.pa.unify_schemas(schemas)
231+
with lazy.pq.ParquetWriter(output, unified_schema) as writer:
232+
for batch_file in batch_files:
233+
table = lazy.pq.read_table(batch_file)
234+
writer.write_table(table.cast(unified_schema))

packages/data-designer/tests/cli/controllers/test_generation_controller.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,6 @@ def test_run_create_with_output_format_happy_path(mock_load_config: MagicMock, m
795795

796796
mock_results.export.assert_called_once_with(
797797
Path("/output/artifacts/dataset") / "dataset.jsonl",
798-
format="jsonl",
799798
)
800799

801800

packages/data-designer/tests/interface/test_results.py

Lines changed: 89 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from data_designer.config.preview_results import PreviewResults
1818
from data_designer.config.utils.errors import DatasetSampleDisplayError
1919
from data_designer.config.utils.visualization import display_sample_record as display_fn
20+
from data_designer.engine.dataset_builders.errors import ArtifactStorageError
2021
from data_designer.engine.storage.artifact_storage import ArtifactStorage
2122
from data_designer.interface.results import DatasetCreationResults
2223

@@ -262,62 +263,129 @@ def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_
262263
stub_artifact_storage.load_dataset.assert_called_once()
263264

264265

266+
@pytest.fixture
267+
def stub_batch_dir(stub_dataframe, tmp_path):
268+
"""Directory with two batch parquet files split from stub_dataframe.
269+
270+
Splitting into two batches exercises the multi-batch streaming path in export().
271+
"""
272+
batch_dir = tmp_path / "parquet-files"
273+
batch_dir.mkdir()
274+
mid = len(stub_dataframe) // 2
275+
stub_dataframe.iloc[:mid].to_parquet(batch_dir / "batch_00000.parquet", index=False)
276+
stub_dataframe.iloc[mid:].to_parquet(batch_dir / "batch_00001.parquet", index=False)
277+
return batch_dir
278+
279+
265280
@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"])
266-
def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt):
267-
"""export() writes a file in the requested format."""
281+
def test_export_writes_file(stub_dataset_creation_results, stub_batch_dir, tmp_path, fmt) -> None:
282+
"""export() writes a non-empty file for each supported format."""
283+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
268284
out = tmp_path / f"out.{fmt}"
269-
result = stub_dataset_creation_results.export(out, format=fmt)
285+
result = stub_dataset_creation_results.export(out)
270286
assert result == out
271287
assert out.exists()
272288
assert out.stat().st_size > 0
273289

274290

275-
def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
276-
"""JSONL export writes one JSON object per line."""
291+
def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None:
292+
"""JSONL export writes one valid JSON object per line, covering all records."""
293+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
277294
out = tmp_path / "out.jsonl"
278-
stub_dataset_creation_results.export(out, format="jsonl")
295+
stub_dataset_creation_results.export(out)
279296
lines = out.read_text(encoding="utf-8").splitlines()
280297
assert len(lines) == len(stub_dataframe)
281-
# Each line must be valid JSON
282298
for line in lines:
283299
json.loads(line)
284300

285301

286-
def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
287-
"""CSV export has a header row and one data row per record."""
302+
def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None:
303+
"""CSV export produces a single header row and one data row per record."""
304+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
288305
out = tmp_path / "out.csv"
289-
stub_dataset_creation_results.export(out, format="csv")
306+
stub_dataset_creation_results.export(out)
290307
loaded = lazy.pd.read_csv(out)
291308
assert list(loaded.columns) == list(stub_dataframe.columns)
292309
assert len(loaded) == len(stub_dataframe)
293310

294311

295-
def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
296-
"""Parquet export round-trips to the original DataFrame."""
312+
def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path) -> None:
313+
"""Parquet export round-trips to the original DataFrame across two batches."""
314+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
297315
out = tmp_path / "out.parquet"
298-
stub_dataset_creation_results.export(out, format="parquet")
316+
stub_dataset_creation_results.export(out)
299317
loaded = lazy.pd.read_parquet(out)
300-
lazy.pd.testing.assert_frame_equal(loaded.reset_index(drop=True), stub_dataframe.reset_index(drop=True))
318+
lazy.pd.testing.assert_frame_equal(
319+
loaded.reset_index(drop=True),
320+
stub_dataframe.reset_index(drop=True),
321+
)
301322

302323

303-
def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path):
304-
"""export() defaults to JSONL when no format is given."""
324+
def test_export_infers_format_from_extension(stub_dataset_creation_results, stub_batch_dir, tmp_path) -> None:
325+
"""export() infers the output format from the file extension when format is omitted."""
326+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
305327
out = tmp_path / "out.jsonl"
306328
stub_dataset_creation_results.export(out)
307329
lines = out.read_text(encoding="utf-8").splitlines()
308-
# All lines must be valid JSON
309330
for line in lines:
310331
json.loads(line)
311332

312333

313-
def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path):
314-
"""export() raises InvalidFileFormatError for unknown formats."""
334+
def test_export_explicit_format_overrides_extension(
335+
stub_dataset_creation_results, stub_dataframe, stub_batch_dir, tmp_path
336+
) -> None:
337+
"""Passing format= explicitly overrides extension-based inference."""
338+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
339+
out = tmp_path / "data.txt"
340+
stub_dataset_creation_results.export(out, format="jsonl")
341+
lines = out.read_text(encoding="utf-8").splitlines()
342+
assert len(lines) == len(stub_dataframe)
343+
for line in lines:
344+
json.loads(line)
345+
346+
347+
def test_export_parquet_schema_unification(stub_dataset_creation_results, tmp_path) -> None:
348+
"""Parquet export unifies schemas across batches with diverging column types."""
349+
batch_dir = tmp_path / "parquet-files"
350+
batch_dir.mkdir()
351+
# Batch 0: 'value' as int64; Batch 1: 'value' as float64 (type drift)
352+
lazy.pd.DataFrame({"value": lazy.pd.array([1, 2], dtype="int64")}).to_parquet(
353+
batch_dir / "batch_00000.parquet", index=False
354+
)
355+
lazy.pd.DataFrame({"value": lazy.pd.array([3.0, 4.0], dtype="float64")}).to_parquet(
356+
batch_dir / "batch_00001.parquet", index=False
357+
)
358+
stub_dataset_creation_results.artifact_storage.final_dataset_path = batch_dir
359+
out = tmp_path / "out.parquet"
360+
stub_dataset_creation_results.export(out)
361+
loaded = lazy.pd.read_parquet(out)
362+
assert list(loaded["value"]) == [1.0, 2.0, 3.0, 4.0]
363+
364+
365+
def test_export_unknown_extension_raises(stub_dataset_creation_results, tmp_path) -> None:
366+
"""export() raises InvalidFileFormatError when the extension is not a supported format."""
315367
with pytest.raises(InvalidFileFormatError, match="Unsupported export format"):
316-
stub_dataset_creation_results.export(tmp_path / "out.xyz", format="xlsx") # type: ignore[arg-type]
368+
stub_dataset_creation_results.export(tmp_path / "out.xyz")
369+
370+
371+
def test_export_unsupported_explicit_format_raises(stub_dataset_creation_results, tmp_path) -> None:
372+
"""export() raises InvalidFileFormatError for an explicit unsupported format override."""
373+
with pytest.raises(InvalidFileFormatError, match="Unsupported export format"):
374+
stub_dataset_creation_results.export(tmp_path / "out.jsonl", format="xlsx") # type: ignore[arg-type]
375+
376+
377+
def test_export_no_batch_files_raises(stub_dataset_creation_results, tmp_path) -> None:
378+
"""export() raises ArtifactStorageError when the batch directory is empty."""
379+
empty_dir = tmp_path / "parquet-files"
380+
empty_dir.mkdir()
381+
stub_dataset_creation_results.artifact_storage.final_dataset_path = empty_dir
382+
with pytest.raises(ArtifactStorageError, match="No batch parquet files found"):
383+
stub_dataset_creation_results.export(tmp_path / "out.jsonl")
317384

318385

319-
def test_export_returns_path_object(stub_dataset_creation_results, tmp_path):
386+
def test_export_returns_path_object(stub_dataset_creation_results, stub_batch_dir, tmp_path) -> None:
320387
"""export() returns a Path regardless of whether str or Path was passed."""
388+
stub_dataset_creation_results.artifact_storage.final_dataset_path = stub_batch_dir
321389
out = tmp_path / "out.jsonl"
322390
result = stub_dataset_creation_results.export(str(out))
323391
assert isinstance(result, Path)

0 commit comments

Comments (0)