1111
1212import data_designer .engine .dataset_builders .dataset_builder as builder_mod
1313import data_designer .lazy_heavy_imports as lazy
14+ from data_designer .config .base import SkipConfig
1415from data_designer .config .column_configs import CustomColumnConfig , LLMTextColumnConfig , SamplerColumnConfig
1516from data_designer .config .config_builder import DataDesignerConfigBuilder
1617from data_designer .config .custom_column import custom_column_generator
@@ -962,6 +963,21 @@ def fn(df: pd.DataFrame) -> pd.DataFrame:
962963 return fn
963964
964965
def _make_label_generator_with_side_effect(label: str, side_effect_label: str, *required: str):
    """Build a FULL_COLUMN custom generator that fills two columns at once.

    The returned function assigns the constant ``generated_<label>`` to the
    ``label`` column and ``generated_<side_effect_label>`` to the
    ``side_effect_label`` column. ``side_effect_label`` is declared to the
    decorator as a side-effect column, and ``required`` is forwarded as the
    generator's required columns.
    """
    required_columns = list(required)
    side_effect_columns = [side_effect_label]

    @custom_column_generator(required_columns=required_columns, side_effect_columns=side_effect_columns)
    def fn(df: pd.DataFrame) -> pd.DataFrame:
        # Primary column first, then the side-effect column; each gets its own constant.
        constant_columns = {name: f"generated_{name}" for name in (label, side_effect_label)}
        return df.assign(**constant_columns)

    return fn
979+
980+
965981def test_skip_metadata_preserved_across_non_skip_aware_full_column (
966982 stub_resource_provider , stub_model_configs , seed_data_setup
967983):
@@ -972,8 +988,6 @@ def test_skip_metadata_preserved_across_non_skip_aware_full_column(
972988 Before the fix, summary's replace_buffer erased __internal_skipped_columns,
973989 causing complaint to generate for rows that should have been skipped.
974990 """
975- from data_designer .config .base import SkipConfig
976-
977991 config_builder = DataDesignerConfigBuilder (model_configs = stub_model_configs )
978992 config_builder .with_seed_dataset (LocalFileSeedSource (path = str (seed_data_setup ["seed_path" ])))
979993
@@ -1031,8 +1045,6 @@ def test_skip_metadata_preserved_when_no_rows_skipped_for_current_column(
10311045 own expression (it has none). The has_skipped=False fallthrough must still
10321046 preserve review's skip metadata so propagation works.
10331047 """
1034- from data_designer .config .base import SkipConfig
1035-
10361048 config_builder = DataDesignerConfigBuilder (model_configs = stub_model_configs )
10371049 config_builder .with_seed_dataset (LocalFileSeedSource (path = str (seed_data_setup ["seed_path" ])))
10381050
@@ -1069,6 +1081,53 @@ def test_skip_metadata_preserved_when_no_rows_skipped_for_current_column(
10691081 assert row ["analysis" ] == "generated_analysis" , f"seed_id={ row ['seed_id' ]} : analysis should be generated"
10701082
10711083
def test_skip_propagation_resolves_side_effect_dependencies_in_sync_builder(
    stub_resource_provider, stub_model_configs, seed_data_setup
):
    """Skipping a column must also skip consumers of its side-effect column.

    Pipeline under test:
      * ``review`` carries ``skip.when = seed_id < 3`` and also emits the
        side-effect column ``review_side_effect``.
      * ``analysis`` requires ``review_side_effect`` and sets
        ``propagate_skip=True``.

    Rows with seed_id 1 and 2 are expected to have both ``review_side_effect``
    and ``analysis`` empty; every other row must carry a generated ``analysis``.
    """

    def _is_missing(value) -> bool:
        # Treat both a literal None and any pandas NA marker as "no value".
        return value is None or lazy.pd.isna(value)

    config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    seed_source = LocalFileSeedSource(path=str(seed_data_setup["seed_path"]))
    config_builder.with_seed_dataset(seed_source)

    # Upstream column: conditionally skipped, produces an extra side-effect column.
    review_column = CustomColumnConfig(
        name="review",
        generator_function=_make_label_generator_with_side_effect("review", "review_side_effect", "seed_id"),
        generation_strategy=GenerationStrategy.FULL_COLUMN,
        skip=SkipConfig(when="{{ seed_id < 3 }}"),
    )
    config_builder.add_column(review_column)

    # Downstream column: depends only on the side-effect column, not on "review" itself.
    analysis_column = CustomColumnConfig(
        name="analysis",
        generator_function=_make_label_generator("analysis", "review_side_effect"),
        generation_strategy=GenerationStrategy.FULL_COLUMN,
        propagate_skip=True,
    )
    config_builder.add_column(analysis_column)

    builder = DatasetBuilder(
        data_designer_config=config_builder.build(),
        resource_provider=stub_resource_provider,
    )
    preview = builder.build_preview(num_records=5)

    skipped_seed_ids = {1, 2}
    for _, record in preview.iterrows():
        seed_id = record["seed_id"]

        if seed_id not in skipped_seed_ids:
            assert record["analysis"] == "generated_analysis", f"seed_id={seed_id}: analysis should be generated"
            continue

        assert _is_missing(record["review_side_effect"]), (
            f"seed_id={seed_id}: review_side_effect should be cleared when review is skipped"
        )
        assert _is_missing(record["analysis"]), f"seed_id={seed_id}: analysis should propagate skip from review"
1129+
1130+
10721131def test_allow_resize_column_not_blocked_by_upstream_skip (stub_resource_provider , stub_model_configs , seed_data_setup ):
10731132 """An allow_resize=True column depending on a skippable upstream must not
10741133 enter the skip-aware branch (which enforces 1:1 row counts).
@@ -1077,8 +1136,6 @@ def test_allow_resize_column_not_blocked_by_upstream_skip(stub_resource_provider
10771136 with propagate_skip=True and required_columns pointing to a skippable
10781137 upstream, causing a DatasetGenerationError on the row-count check.
10791138 """
1080- from data_designer .config .base import SkipConfig
1081-
10821139 config_builder = DataDesignerConfigBuilder (model_configs = stub_model_configs )
10831140 config_builder .with_seed_dataset (LocalFileSeedSource (path = str (seed_data_setup ["seed_path" ])))
10841141
0 commit comments