Skip to content

Commit eaba366

Browse files
committed
feat: add validate workflow step type with JSON schema validation
Add a new 'validate' step type for workflows that performs JSON schema validation with a custom rule engine:

- Full JSON schema validation (type, enum, const, minLength, maxLength, pattern, minimum, maximum, exclusiveMinimum, exclusiveMaximum, multipleOf, minItems, maxItems, uniqueItems, required, properties, additionalProperties, nested items)
- Custom rule engine for expression-based validation predicates
- Detailed error reporting with JSON paths and rule identifiers
- Configurable fail_on_error behavior
- Step configuration validation
- Registration in STEP_REGISTRY as 'validate' type

Also includes:

- Comprehensive pytest test suite (test_validate_step.py)
- Bug fix: handle json.JSONDecodeError in RunState.load() for corrupted state.json and inputs.json files

Signed-off-by: Srikanth Patchava <spatchava@meta.com>
1 parent f6abe06 commit eaba366

4 files changed

Lines changed: 654 additions & 2 deletions

File tree

src/specify_cli/workflows/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def _register_builtin_steps() -> None:
5151
from .steps.prompt import PromptStep
5252
from .steps.shell import ShellStep
5353
from .steps.switch import SwitchStep
54+
from .steps.validate import ValidateStep
5455
from .steps.while_loop import WhileStep
5556

5657
_register_step(CommandStep())
@@ -62,6 +63,7 @@ def _register_builtin_steps() -> None:
6263
_register_step(PromptStep())
6364
_register_step(ShellStep())
6465
_register_step(SwitchStep())
66+
_register_step(ValidateStep())
6567
_register_step(WhileStep())
6668

6769

src/specify_cli/workflows/engine.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,11 @@ def load(cls, run_id: str, project_root: Path) -> RunState:
289289
raise FileNotFoundError(msg)
290290

291291
with open(state_path, encoding="utf-8") as f:
292-
state_data = json.load(f)
292+
try:
293+
state_data = json.load(f)
294+
except json.JSONDecodeError as exc:
295+
msg = f"Corrupted run state file: {state_path}: {exc}"
296+
raise ValueError(msg) from exc
293297

294298
state = cls(
295299
run_id=state_data["run_id"],
@@ -306,7 +310,11 @@ def load(cls, run_id: str, project_root: Path) -> RunState:
306310
inputs_path = runs_dir / "inputs.json"
307311
if inputs_path.exists():
308312
with open(inputs_path, encoding="utf-8") as f:
309-
inputs_data = json.load(f)
313+
try:
314+
inputs_data = json.load(f)
315+
except json.JSONDecodeError as exc:
316+
msg = f"Corrupted inputs file: {inputs_path}: {exc}"
317+
raise ValueError(msg) from exc
310318
state.inputs = inputs_data.get("inputs", {})
311319

312320
return state
Lines changed: 347 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,347 @@
1+
"""Validate step — JSON schema validation with custom rules.
2+
3+
Validates step context data (inputs, step outputs, or arbitrary JSON)
4+
against a JSON-Schema-like rule set, then aggregates errors into a
5+
detailed report stored in ``output``.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import re
11+
from typing import Any
12+
13+
from specify_cli.workflows.base import StepBase, StepContext, StepResult, StepStatus
14+
from specify_cli.workflows.expressions import evaluate_expression
15+
16+
17+
# ── Schema types ────────────────────────────────────────────────────
18+
19+
# Type names accepted as the value of a schema's ``type`` keyword; the schema
# validator reports any other name as an "Unknown schema type" error.
_VALID_TYPES = {"string", "integer", "number", "boolean", "array", "object", "null"}
20+
21+
22+
def _python_type_name(value: Any) -> str:
23+
"""Map a Python value to its JSON-schema type name."""
24+
if value is None:
25+
return "null"
26+
if isinstance(value, bool):
27+
return "boolean"
28+
if isinstance(value, int):
29+
return "integer"
30+
if isinstance(value, float):
31+
return "number"
32+
if isinstance(value, str):
33+
return "string"
34+
if isinstance(value, list):
35+
return "array"
36+
if isinstance(value, dict):
37+
return "object"
38+
return type(value).__name__
39+
40+
41+
# ── Schema validator ────────────────────────────────────────────────
42+
43+
class ValidationError:
    """One validation failure: where it happened, what went wrong, which rule.

    Attributes are ``path`` (location of the offending value), ``message``
    (human-readable description) and ``rule`` (identifier of the violated
    rule; empty string when not given).
    """

    __slots__ = ("path", "message", "rule")

    def __init__(self, path: str, message: str, rule: str = "") -> None:
        self.path, self.message, self.rule = path, message, rule

    def to_dict(self) -> dict[str, str]:
        """Return the error as a plain dict; ``rule`` is included only when non-empty."""
        payload: dict[str, str] = {"path": self.path, "message": self.message}
        if not self.rule:
            return payload
        payload["rule"] = self.rule
        return payload

    def __repr__(self) -> str:
        # The rule identifier is intentionally not part of the repr.
        return f"ValidationError(path={self.path!r}, message={self.message!r})"
61+
62+
63+
class SchemaValidator:
    """Validate a value against a JSON-schema-like definition.

    Supported keywords: ``type`` (one type name or a list of names),
    ``enum``, ``const``, ``minLength``, ``maxLength``, ``pattern``,
    ``minimum``, ``maximum``, ``exclusiveMinimum``, ``exclusiveMaximum``,
    ``multipleOf``, ``minItems``, ``maxItems``, ``uniqueItems``, ``items``,
    ``required``, ``properties`` and ``additionalProperties`` (``False`` or
    a sub-schema applied to properties not listed in ``properties``).

    Violations are collected as ``ValidationError`` objects rather than
    raised. Malformed keyword values (an uncompilable ``pattern``, a zero
    ``multipleOf``, a sub-schema that is not a mapping, ...) are reported
    the same way instead of aborting the validation run.
    """

    def __init__(self) -> None:
        # Errors gathered by the most recent ``validate`` call.
        self.errors: list[ValidationError] = []

    def validate(self, value: Any, schema: dict[str, Any], path: str = "$") -> list[ValidationError]:
        """Validate *value* against *schema* and return all errors.

        Args:
            value: The data to check.
            schema: The schema mapping describing the expected shape.
            path: Path prefix used in reported errors (``"$"`` is the root).

        Returns:
            A new list with every error found; empty when *value* is valid.
        """
        self.errors = []
        self._validate_node(value, schema, path)
        return list(self.errors)

    def _add_error(self, path: str, message: str, rule: str = "") -> None:
        """Record one violation at *path*."""
        self.errors.append(ValidationError(path, message, rule))

    @classmethod
    def _canonical(cls, value: Any) -> Any:
        """Return a hashable key that is equal exactly for JSON-equal values.

        Python's ``==`` treats ``True`` and ``1`` as equal and ``repr``
        distinguishes ``1`` from ``1.0`` and differently-ordered dicts, so
        neither matches JSON equality. Booleans get their own tag, numbers
        compare by numeric value, arrays by position and objects by their
        key/value pairs regardless of order.
        """
        if isinstance(value, bool):
            return ("boolean", value)
        if isinstance(value, (int, float)):
            return ("number", value)
        if isinstance(value, (list, tuple)):
            return ("array", tuple(cls._canonical(item) for item in value))
        if isinstance(value, dict):
            return (
                "object",
                frozenset((key, cls._canonical(item)) for key, item in value.items()),
            )
        try:
            hash(value)
        except TypeError:
            # Unhashable, non-JSON value: fall back to its textual form.
            return ("repr", repr(value))
        return ("value", value)

    @staticmethod
    def _is_multiple_of(value: Any, divisor: Any) -> bool:
        """Return True when *value* is an integer multiple of non-zero *divisor*.

        Integer pairs use exact modulo. When a float is involved the
        quotient is compared with its nearest integer under a small relative
        tolerance, because binary floating point makes e.g. ``0.3 % 0.1``
        non-zero. Infinite or NaN quotients are never multiples.
        """
        if isinstance(value, int) and isinstance(divisor, int):
            return value % divisor == 0
        try:
            quotient = value / divisor
            nearest = round(quotient)
        except (OverflowError, ValueError):
            return False
        return abs(quotient - nearest) <= 1e-9 * max(1.0, abs(quotient))

    def _validate_node(self, value: Any, schema: Any, path: str) -> None:
        """Validate *value* against one schema node, recursing into children."""
        # A boolean ``True`` schema accepts everything; anything else that is
        # not a mapping cannot be interpreted and is reported, not crashed on.
        if schema is True:
            return
        if not isinstance(schema, dict):
            self._add_error(
                path,
                f"Invalid schema: expected an object but got {schema!r}",
                "schema",
            )
            return

        if not self._check_type(value, schema, path):
            return  # skip further checks if type is wrong

        self._check_enum_const(value, schema, path)
        if isinstance(value, str):
            self._check_string(value, schema, path)
        # ``bool`` is an ``int`` subclass but must not get numeric checks.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            self._check_number(value, schema, path)
        if isinstance(value, list):
            self._check_array(value, schema, path)
        if isinstance(value, dict):
            self._check_object(value, schema, path)

    def _check_type(self, value: Any, schema: dict[str, Any], path: str) -> bool:
        """Apply the ``type`` keyword; return False when an error was recorded."""
        expected_type = schema.get("type")
        if expected_type is None:
            return True
        # ``type`` may be a single name or a list of alternatives.
        if isinstance(expected_type, (list, tuple)):
            allowed = list(expected_type)
        else:
            allowed = [expected_type]
        for name in allowed:
            if not isinstance(name, str) or name not in _VALID_TYPES:
                self._add_error(path, f"Unknown schema type: {name!r}", "type")
                return False
        actual = _python_type_name(value)
        # Allow integer where number is expected
        if actual == "integer" and "integer" not in allowed and "number" in allowed:
            actual = "number"
        if actual not in allowed:
            self._add_error(
                path,
                f"Expected type {expected_type!r} but got {actual!r}",
                "type",
            )
            return False
        return True

    def _check_enum_const(self, value: Any, schema: dict[str, Any], path: str) -> None:
        """Apply ``enum`` and ``const`` using JSON equality."""
        if "enum" not in schema and "const" not in schema:
            return
        wanted = self._canonical(value)
        if "enum" in schema:
            options = schema["enum"]
            # A scalar ``enum`` is treated as a single allowed value.
            candidates = options if isinstance(options, (list, tuple)) else [options]
            if all(self._canonical(option) != wanted for option in candidates):
                self._add_error(
                    path,
                    f"Value {value!r} is not one of {schema['enum']!r}",
                    "enum",
                )
        if "const" in schema and self._canonical(schema["const"]) != wanted:
            self._add_error(
                path,
                f"Value must be {schema['const']!r}",
                "const",
            )

    def _check_string(self, value: str, schema: dict[str, Any], path: str) -> None:
        """Apply ``minLength``, ``maxLength`` and ``pattern``."""
        if "minLength" in schema and len(value) < schema["minLength"]:
            self._add_error(
                path,
                f"String length {len(value)} is less than minimum {schema['minLength']}",
                "minLength",
            )
        if "maxLength" in schema and len(value) > schema["maxLength"]:
            self._add_error(
                path,
                f"String length {len(value)} exceeds maximum {schema['maxLength']}",
                "maxLength",
            )
        if "pattern" in schema:
            pattern = schema["pattern"]
            try:
                matched = re.search(pattern, value) is not None
            except (re.error, TypeError) as exc:
                # An uncompilable or non-string pattern is a schema mistake;
                # report it instead of letting the exception escape.
                self._add_error(path, f"Invalid pattern {pattern!r}: {exc}", "pattern")
            else:
                if not matched:
                    self._add_error(
                        path,
                        f"String does not match pattern {schema['pattern']!r}",
                        "pattern",
                    )

    def _check_number(self, value: Any, schema: dict[str, Any], path: str) -> None:
        """Apply the numeric range keywords and ``multipleOf``."""
        if "minimum" in schema and value < schema["minimum"]:
            self._add_error(
                path,
                f"Value {value} is less than minimum {schema['minimum']}",
                "minimum",
            )
        if "maximum" in schema and value > schema["maximum"]:
            self._add_error(
                path,
                f"Value {value} exceeds maximum {schema['maximum']}",
                "maximum",
            )
        if "exclusiveMinimum" in schema and value <= schema["exclusiveMinimum"]:
            self._add_error(
                path,
                f"Value {value} must be > {schema['exclusiveMinimum']}",
                "exclusiveMinimum",
            )
        if "exclusiveMaximum" in schema and value >= schema["exclusiveMaximum"]:
            self._add_error(
                path,
                f"Value {value} must be < {schema['exclusiveMaximum']}",
                "exclusiveMaximum",
            )
        if "multipleOf" in schema:
            divisor = schema["multipleOf"]
            divisor_ok = (
                isinstance(divisor, (int, float))
                and not isinstance(divisor, bool)
                and divisor != 0
            )
            if not divisor_ok:
                # Guards against ZeroDivisionError / TypeError from bad schemas.
                self._add_error(
                    path,
                    f"Invalid multipleOf divisor: {divisor!r}",
                    "multipleOf",
                )
            elif not self._is_multiple_of(value, divisor):
                self._add_error(
                    path,
                    f"Value {value} is not a multiple of {schema['multipleOf']}",
                    "multipleOf",
                )

    def _check_array(self, value: list[Any], schema: dict[str, Any], path: str) -> None:
        """Apply ``minItems``, ``maxItems``, ``uniqueItems`` and ``items``."""
        if "minItems" in schema and len(value) < schema["minItems"]:
            self._add_error(
                path,
                f"Array length {len(value)} is less than minimum {schema['minItems']}",
                "minItems",
            )
        if "maxItems" in schema and len(value) > schema["maxItems"]:
            self._add_error(
                path,
                f"Array length {len(value)} exceeds maximum {schema['maxItems']}",
                "maxItems",
            )
        if schema.get("uniqueItems"):
            distinct = {self._canonical(item) for item in value}
            if len(distinct) != len(value):
                self._add_error(path, "Array items are not unique", "uniqueItems")
        items_schema = schema.get("items")
        if items_schema:
            for i, item in enumerate(value):
                self._validate_node(item, items_schema, f"{path}[{i}]")

    def _check_object(self, value: dict[Any, Any], schema: dict[str, Any], path: str) -> None:
        """Apply ``required``, ``properties`` and ``additionalProperties``."""
        required_keys = schema.get("required") or []
        if isinstance(required_keys, str):
            # A bare string would otherwise be iterated character by character.
            required_keys = [required_keys]
        if not isinstance(required_keys, (list, tuple)):
            self._add_error(path, "'required' must be a list of property names", "required")
            required_keys = []
        for rk in required_keys:
            if rk not in value:
                self._add_error(
                    f"{path}.{rk}",
                    f"Missing required property {rk!r}",
                    "required",
                )

        properties = schema.get("properties") or {}
        if not isinstance(properties, dict):
            self._add_error(path, "'properties' must be an object", "properties")
            properties = {}
        for prop_name, prop_schema in properties.items():
            if prop_name in value:
                self._validate_node(
                    value[prop_name], prop_schema, f"{path}.{prop_name}"
                )

        additional = schema.get("additionalProperties")
        if additional is False:
            for k in value:
                if k not in properties:
                    self._add_error(
                        f"{path}.{k}",
                        f"Additional property {k!r} is not allowed",
                        "additionalProperties",
                    )
        elif isinstance(additional, dict):
            # A sub-schema constrains every property not named in ``properties``.
            for k, extra in value.items():
                if k not in properties:
                    self._validate_node(extra, additional, f"{path}.{k}")
218+
219+
220+
# ── Custom rule engine ──────────────────────────────────────────────
221+
222+
class CustomRuleEngine:
    """Evaluate custom validation rules expressed as simple predicates."""

    @staticmethod
    def evaluate_rules(
        rules: list[dict[str, Any]], data: Any, context: StepContext,
    ) -> list[ValidationError]:
        """Evaluate a list of custom rules and return one error per failed rule.

        Each rule is a dict with:
        - ``expr``: a ``{{ }}``-style expression that should resolve truthy.
          Rules with a missing or empty ``expr`` are skipped.
        - ``message``: error message if the rule fails.
        - ``path`` (optional): JSON path for the error (default ``"$"``).
        - ``severity`` (optional): accepted in the rule, but not currently
          acted on — every failed rule is reported as an error.

        Args:
            rules: The rule definitions, evaluated in order.
            data: The validated data. Currently unused: expressions are
                resolved against *context* only.
            context: Step context handed to ``evaluate_expression``.

        Returns:
            The errors for all rules that did not resolve truthy. A rule
            whose expression raises counts as failed, and the exception text
            is appended to its message so the cause stays visible. A rule
            entry that is not a dict is reported as an error as well.
        """
        errors: list[ValidationError] = []
        for index, rule in enumerate(rules):
            if not isinstance(rule, dict):
                # Without this guard ``rule.get`` raises AttributeError.
                errors.append(
                    ValidationError(
                        path="$",
                        message=(
                            f"Custom rule #{index} must be a mapping, "
                            f"got {type(rule).__name__}"
                        ),
                        rule="custom",
                    )
                )
                continue

            expr = rule.get("expr", "")
            if not expr:
                continue

            failure_detail = ""
            try:
                result = evaluate_expression(expr, context)
            except Exception as exc:  # noqa: BLE001 - a broken rule must not abort the run
                result = None
                failure_detail = f" (expression error: {exc})"

            if not result:
                message = rule.get("message", f"Custom rule failed: {expr}")
                if failure_detail:
                    message = f"{message}{failure_detail}"
                errors.append(
                    ValidationError(
                        path=rule.get("path", "$"),
                        message=message,
                        rule="custom",
                    )
                )
        return errors
256+
257+
258+
# ── Step implementation ─────────────────────────────────────────────
259+
260+
class ValidateStep(StepBase):
    """Workflow step that validates data against a JSON schema.

    YAML configuration::

        - id: check-inputs
          type: validate
          target: "{{ inputs }}"          # expression resolving to data
          schema:
            type: object
            required: [name, version]
            properties:
              name: { type: string, minLength: 1 }
              version: { type: string, pattern: "^\\\\d+\\\\.\\\\d+\\\\.\\\\d+$" }
          custom_rules:
            - expr: "{{ inputs.name != inputs.version }}"
              message: "name and version must differ"
          fail_on_error: true             # default true

    The step output is a dict with ``valid`` (bool), ``errors`` (list of
    error dicts) and ``error_count`` (int).
    """

    type_key = "validate"

    @staticmethod
    def _empty_failure(error: str) -> StepResult:
        """Build a FAILED result carrying *error* and an empty error report."""
        return StepResult(
            status=StepStatus.FAILED,
            error=error,
            output={"valid": False, "errors": [], "error_count": 0},
        )

    def execute(self, config: dict[str, Any], context: StepContext) -> StepResult:
        """Resolve the target, run schema and custom-rule checks, report.

        Args:
            config: Step configuration (``target``, ``schema``,
                ``custom_rules``, ``fail_on_error``).
            context: Step context used to resolve expressions.

        Returns:
            A COMPLETED result when the data is valid or ``fail_on_error``
            is false. A FAILED result when validation errors exist and
            ``fail_on_error`` is true, when the target expression cannot be
            resolved, or when the schema / rules configuration is malformed.
        """
        # ── Resolve the target data ──
        target_expr = config.get("target", "{{ inputs }}")
        if isinstance(target_expr, str) and "{{" in target_expr:
            try:
                target_data = evaluate_expression(target_expr, context)
            except Exception as exc:
                return self._empty_failure(
                    f"Failed to resolve target expression: {exc}"
                )
        else:
            # A literal (non-expression) target is validated as-is.
            target_data = target_expr

        schema = config.get("schema")
        custom_rules = config.get("custom_rules", [])

        # Falsy values keep their original meaning ("nothing to check");
        # truthy values of the wrong shape would crash further down.
        if schema and not isinstance(schema, dict):
            return self._empty_failure(
                "Invalid validation configuration: 'schema' must be a dict."
            )
        if custom_rules and not isinstance(custom_rules, list):
            return self._empty_failure(
                "Invalid validation configuration: 'custom_rules' must be a list."
            )

        all_errors: list[ValidationError] = []
        try:
            # ── JSON schema validation ──
            if schema:
                validator = SchemaValidator()
                all_errors.extend(validator.validate(target_data, schema))

            # ── Custom rules ──
            if custom_rules:
                all_errors.extend(
                    CustomRuleEngine.evaluate_rules(custom_rules, target_data, context)
                )
        except (re.error, TypeError, ValueError, ZeroDivisionError, AttributeError) as exc:
            # A malformed schema or rule (bad regex, non-numeric bound, ...)
            # is reported like a target-resolution failure instead of
            # escaping as an unhandled exception.
            return self._empty_failure(f"Invalid validation configuration: {exc}")

        # ── Build result ──
        error_dicts = [e.to_dict() for e in all_errors]
        is_valid = not all_errors
        fail_on_error = config.get("fail_on_error", True)

        output = {
            "valid": is_valid,
            "errors": error_dicts,
            "error_count": len(error_dicts),
        }

        if not is_valid and fail_on_error:
            # Keep the one-line summary short: first five errors only.
            summary = "; ".join(f"{e.path}: {e.message}" for e in all_errors[:5])
            if len(all_errors) > 5:
                summary += f" ... and {len(all_errors) - 5} more"
            return StepResult(
                status=StepStatus.FAILED,
                error=f"Validation failed ({len(all_errors)} errors): {summary}",
                output=output,
            )

        return StepResult(status=StepStatus.COMPLETED, output=output)

    def validate(self, config: dict[str, Any]) -> list[str]:
        """Check the step configuration and return a list of problems.

        Extends the base-class checks with: ``schema`` must be a dict,
        ``custom_rules`` must be a list, and every entry of ``custom_rules``
        must be a dict.
        """
        errors = super().validate(config)
        schema = config.get("schema")
        if schema is not None and not isinstance(schema, dict):
            errors.append(
                f"Validate step {config.get('id', '?')!r}: 'schema' must be a dict."
            )
        custom_rules = config.get("custom_rules")
        if custom_rules is not None and not isinstance(custom_rules, list):
            errors.append(
                f"Validate step {config.get('id', '?')!r}: 'custom_rules' must be a list."
            )
        elif custom_rules is not None:
            for index, rule in enumerate(custom_rules):
                if not isinstance(rule, dict):
                    errors.append(
                        f"Validate step {config.get('id', '?')!r}: "
                        f"'custom_rules[{index}]' must be a dict."
                    )
        return errors

0 commit comments

Comments
 (0)