Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ class RunConfig(ConfigBase):
Default is False.
progress_interval: How often (in seconds) the async progress reporter emits a
consolidated log block. Must be > 0. Default is 5.0.
preserve_dropped_columns: If True, write columns removed by drop processors to
separate dropped-column parquet files. Set to False to omit those artifacts
while still removing dropped columns from the final dataset. Default is True.
jinja_rendering_engine: Template renderer used for engine-side Jinja evaluation.
``native`` uses Jinja2's built-in sandbox with the standard filter set and
fewer Data Designer-specific restrictions. ``secure`` uses Data Designer's
Expand All @@ -139,6 +142,12 @@ class RunConfig(ConfigBase):
async_trace: bool = False
progress_bar: bool = False
progress_interval: float = Field(default=5.0, gt=0.0)
preserve_dropped_columns: bool = Field(
default=True,
description=(
"Whether columns removed by drop processors are preserved in separate dropped-column parquet files."
),
)
jinja_rendering_engine: JinjaRenderingEngine = Field(
default=JinjaRenderingEngine.SECURE,
description=(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ def test_run_config_accepts_native_renderer() -> None:
assert JinjaRenderingEngine(run_config.jinja_rendering_engine) == JinjaRenderingEngine.NATIVE


def test_run_config_preserves_dropped_columns_by_default() -> None:
    """A ``RunConfig`` built with no arguments has ``preserve_dropped_columns`` set to True."""
    default_config = RunConfig()
    assert default_config.preserve_dropped_columns is True


def test_run_config_accepts_disabled_dropped_column_preservation() -> None:
    """Passing ``preserve_dropped_columns=False`` is accepted and stored unchanged."""
    assert RunConfig(preserve_dropped_columns=False).preserve_dropped_columns is False


def test_throttle_config_accepts_rampup_seconds() -> None:
config = ThrottleConfig(rampup_seconds=30.0)
assert config.rampup_seconds == 30.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def _resolve_columns(self, available: pd.Index) -> list[str]:
def process_after_batch(self, data: pd.DataFrame, *, current_batch_number: int | None) -> pd.DataFrame:
logger.info(f"🙈 Dropping columns: {self.config.column_names}")
resolved = self._resolve_columns(data.columns)
if current_batch_number is not None:
if current_batch_number is not None and self.resource_provider.run_config.preserve_dropped_columns:
Comment thread
nabinchha marked this conversation as resolved.
self._save_dropped_columns(data, resolved, current_batch_number)
if resolved:
data.drop(columns=resolved, inplace=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import data_designer.lazy_heavy_imports as lazy
from data_designer.config.processors import DropColumnsProcessorConfig
from data_designer.config.run_config import RunConfig
from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
from data_designer.engine.storage.artifact_storage import BatchStage

Expand All @@ -24,6 +25,7 @@ def stub_processor(stub_processor_config):
mock_resource_provider.artifact_storage = Mock()
mock_resource_provider.artifact_storage.create_batch_file_path = Mock()
mock_resource_provider.artifact_storage.create_batch_file_path.return_value.name = "dropped.parquet"
mock_resource_provider.run_config = RunConfig()
processor = DropColumnsProcessor(
config=stub_processor_config,
resource_provider=mock_resource_provider,
Expand Down Expand Up @@ -181,3 +183,15 @@ def test_process_after_batch_preview_mode_does_not_save(stub_processor, stub_sam

# But no file should be written
stub_processor.artifact_storage.write_parquet_file.assert_not_called()


def test_process_after_batch_does_not_save_when_preservation_disabled(stub_processor, stub_sample_dataframe):
    """With preservation turned off, the listed columns are removed and no parquet file is written."""
    stub_processor.config.column_names = ["col1", "col2"]
    # Swap in a run config that disables the dropped-column artifact.
    disabled_run_config = RunConfig(preserve_dropped_columns=False)
    stub_processor.resource_provider.run_config = disabled_run_config

    batch = stub_sample_dataframe.copy()
    result = stub_processor.process_after_batch(batch, current_batch_number=0)

    # The configured columns are gone from the returned frame ...
    for removed_name in ("col1", "col2"):
        assert removed_name not in result.columns
    # ... while the column that was not listed is still there.
    assert "col3" in result.columns
    stub_processor.artifact_storage.write_parquet_file.assert_not_called()
79 changes: 79 additions & 0 deletions packages/data-designer/tests/interface/test_data_designer.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,85 @@ def test_create_dataset_e2e_using_only_sampler_columns(
analysis.to_report()


def test_create_with_drop_true_can_skip_dropped_column_artifacts(
    stub_artifact_path,
    stub_model_providers,
    stub_model_configs,
    stub_managed_assets_path,
):
    """A ``drop=True`` column is absent from the dataset and, with preservation off, no dropped-column path exists."""
    config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    column_configs = (
        SamplerColumnConfig(
            name="uuid",
            sampler_type="uuid",
            params={"prefix": "id_", "short_form": True, "uppercase": False},
        ),
        SamplerColumnConfig(
            name="hidden_category",
            sampler_type="category",
            params={"values": ["private"]},
            drop=True,
        ),
    )
    for column_config in column_configs:
        config_builder.add_column(column_config)

    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )
    # Disable the dropped-column artifact for this run.
    run_config = RunConfig(preserve_dropped_columns=False)
    data_designer.set_run_config(run_config)

    results = data_designer.create(config_builder, num_records=3)

    df = results.load_dataset()
    assert "uuid" in df.columns
    assert "hidden_category" not in df.columns
    dropped_path = results.artifact_storage.dropped_columns_dataset_path
    assert not dropped_path.exists()


def test_create_with_drop_true_preserves_columns_only_in_dropped_artifacts(
    stub_artifact_path,
    stub_model_providers,
    stub_model_configs,
    stub_managed_assets_path,
):
    """Without a custom run config, a ``drop=True`` column appears only in the dropped-columns parquet data."""
    config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    column_configs = (
        SamplerColumnConfig(
            name="uuid",
            sampler_type="uuid",
            params={"prefix": "id_", "short_form": True, "uppercase": False},
        ),
        SamplerColumnConfig(
            name="hidden_category",
            sampler_type="category",
            params={"values": ["private"]},
            drop=True,
        ),
    )
    for column_config in column_configs:
        config_builder.add_column(column_config)

    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    results = data_designer.create(config_builder, num_records=3)

    main_df = results.load_dataset()
    dropped_path = results.artifact_storage.dropped_columns_dataset_path
    dropped_df = lazy.pd.read_parquet(dropped_path)
    # The kept column is in the main dataset only.
    assert "uuid" in main_df.columns
    assert "hidden_category" not in main_df.columns
    # The dropped column is in the dropped-columns data only.
    assert "hidden_category" in dropped_df.columns
    assert "uuid" not in dropped_df.columns


def test_create_raises_error_when_builder_fails(
stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder, stub_managed_assets_path
):
Expand Down
Loading