|
42 | 42 | "dataset_name": {"type": "string", "required": True, "description": "Dataset name"}, |
43 | 43 | }, |
44 | 44 | }, |
| 45 | + "join_datasets": { |
| 46 | + "type": "transform", |
| 47 | + "params": { |
| 48 | + "dataset_name": {"type": "string", "required": True, "description": "Dataset name"}, |
| 49 | + "join_key": {"type": "string", "required": False}, |
| 50 | + "join_type": {"type": "string", "required": False}, |
| 51 | + }, |
| 52 | + }, |
45 | 53 | } |
46 | 54 | } |
47 | 55 |
|
@@ -171,6 +179,62 @@ def test_step_missing_service_field(self): |
171 | 179 | errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY) |
172 | 180 | assert any("service" in e.lower() for e in errors) |
173 | 181 |
|
def test_non_extract_requires_depends_on(self):
    """A ``clean_nan`` step without ``depends_on`` yields a "requires depends_on" error."""
    extract_step = {
        "id": "extract",
        "service": "extract_csv",
        "params": {"file_path": "/data/f.csv"},
    }
    # The second step deliberately omits the "depends_on" key.
    clean_step = {"id": "clean", "service": "clean_nan"}

    errors, _warnings = validate_pipeline(
        _pipeline([extract_step, clean_step]), MINIMAL_REGISTRY
    )

    # Case-insensitive match on the error text.
    assert any("requires depends_on" in message.lower() for message in errors)
| 189 | + |
def test_extract_cannot_have_depends_on(self):
    """An ``extract_csv`` step carrying ``depends_on`` yields a "must not have depends_on" error."""
    extract_step = {
        "id": "extract",
        "service": "extract_csv",
        "params": {"file_path": "/data/f.csv"},
        "depends_on": ["x"],
    }

    errors, _warnings = validate_pipeline(_pipeline([extract_step]), MINIMAL_REGISTRY)

    # Case-insensitive match on the error text.
    assert any("must not have depends_on" in message.lower() for message in errors)
| 196 | + |
def test_params_must_be_dict(self):
    """A step whose ``params`` is a list yields an error mentioning "params" and "dict"."""
    # "params" is a list here instead of a mapping.
    bad_step = {"id": "extract", "service": "extract_csv", "params": []}

    errors, _warnings = validate_pipeline(_pipeline([bad_step]), MINIMAL_REGISTRY)

    # Lower-case each message lazily, then require both keywords in the same message.
    lowered_messages = (message.lower() for message in errors)
    assert any("params" in text and "dict" in text for text in lowered_messages)
| 203 | + |
def test_depends_on_must_be_list(self):
    """A string-valued ``depends_on`` yields an error mentioning "depends_on" and "list"."""
    extract_step = {
        "id": "extract",
        "service": "extract_csv",
        "params": {"file_path": "/data/f.csv"},
    }
    # "depends_on" is a bare string rather than a list of step ids.
    clean_step = {"id": "clean", "service": "clean_nan", "depends_on": "extract"}

    errors, _warnings = validate_pipeline(
        _pipeline([extract_step, clean_step]), MINIMAL_REGISTRY
    )

    # Lower-case each message lazily, then require both keywords in the same message.
    lowered_messages = (message.lower() for message in errors)
    assert any("depends_on" in text and "list" in text for text in lowered_messages)
| 211 | + |
def test_non_join_multiple_depends_on_invalid(self):
    """A ``clean_nan`` step with two ``depends_on`` entries yields a "multiple depends_on" error."""
    first_extract = {
        "id": "extract1",
        "service": "extract_csv",
        "params": {"file_path": "/data/a.csv"},
    }
    second_extract = {
        "id": "extract2",
        "service": "extract_csv",
        "params": {"file_path": "/data/b.csv"},
    }
    # The clean_nan step lists both extract steps as upstream dependencies.
    clean_step = {
        "id": "clean",
        "service": "clean_nan",
        "depends_on": ["extract1", "extract2"],
    }

    errors, _warnings = validate_pipeline(
        _pipeline([first_extract, second_extract, clean_step]), MINIMAL_REGISTRY
    )

    # Case-insensitive match on the error text.
    assert any("multiple depends_on" in message.lower() for message in errors)
| 220 | + |
def test_join_requires_two_depends_on(self):
    """A ``join_datasets`` step with one dependency yields an error naming the service and "exactly 2"."""
    extract_step = {
        "id": "extract1",
        "service": "extract_csv",
        "params": {"file_path": "/data/a.csv"},
    }
    # The join step lists a single upstream step.
    join_step = {"id": "join", "service": "join_datasets", "depends_on": ["extract1"]}

    errors, _warnings = validate_pipeline(
        _pipeline([extract_step, join_step]), MINIMAL_REGISTRY
    )

    # Lower-case each message lazily, then require both fragments in the same message.
    lowered_messages = (message.lower() for message in errors)
    assert any(
        "join_datasets" in text and "exactly 2" in text for text in lowered_messages
    )
| 228 | + |
def test_join_accepts_two_depends_on(self):
    """A ``join_datasets`` step with two dependencies produces an empty error list."""
    first_extract = {
        "id": "extract1",
        "service": "extract_csv",
        "params": {"file_path": "/data/a.csv"},
    }
    second_extract = {
        "id": "extract2",
        "service": "extract_csv",
        "params": {"file_path": "/data/b.csv"},
    }
    # NOTE(review): this step passes no "params", while the registry entry for
    # join_datasets marks "dataset_name" as required -- confirm the validator
    # does not report a missing-parameter error for it.
    join_step = {
        "id": "join",
        "service": "join_datasets",
        "depends_on": ["extract1", "extract2"],
    }

    errors, _warnings = validate_pipeline(
        _pipeline([first_extract, second_extract, join_step]), MINIMAL_REGISTRY
    )

    assert errors == []
| 237 | + |
174 | 238 |
|
175 | 239 | # ── Return type contract ────────────────────────────────────────────────────── |
176 | 240 |
|
|
0 commit comments