Skip to content

Commit 9d087f9

Browse files
committed
feat(wassirman): add validation IR generation target
New `--format wassirman` option for `overture-codegen generate` that emits YAML validation IR from Pydantic schema models. The pipeline walks expanded FeatureSpec trees and emits one rule per field constraint: not_null, numeric bounds (gte/lte/between), length, enum/literal membership, geometry type, pattern, and uniqueness. Model-level constraints (require_any_of, radio_group, require_if, forbid_if) produce multi-column or conditional rules. list_columns tracks array nesting for element-level checks. Parent optionality propagates as `when: not_null` guards. Structural fields (theme, type, bbox, ext_*) are skipped. With --output-dir, writes one YAML file per feature type. Without it, emits a single envelope to stdout. Golden snapshot tests cover all 16 discovered feature types, verified against the reference validator output.
1 parent e3b6f3d commit 9d087f9

27 files changed

Lines changed: 6664 additions & 2 deletions

packages/overture-schema-codegen/src/overture/schema/codegen/cli.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@
2222
entry_point_module,
2323
)
2424
from .markdown.pipeline import generate_markdown_pages
25+
from .wassirman.ir import ValidationIR
26+
from .wassirman.pipeline import generate_validation_ir
2527

2628
log = logging.getLogger(__name__)
2729

2830
__all__ = ["cli"]
2931

30-
_OUTPUT_FORMATS = ("markdown",)
32+
_OUTPUT_FORMATS = ("markdown", "wassirman")
3133

3234
_FEATURE_FRONTMATTER = "---\nsidebar_position: 1\n---\n\n"
3335

@@ -120,7 +122,26 @@ def generate(
120122
)
121123
)
122124

123-
_generate_markdown(feature_specs, schema_root, output_dir)
125+
if output_format == "markdown":
126+
_generate_markdown(feature_specs, schema_root, output_dir)
127+
elif output_format == "wassirman":
128+
_generate_wassirman(feature_specs, output_dir)
129+
130+
131+
def _generate_wassirman(
    feature_specs: list[FeatureSpec],
    output_dir: Path | None,
) -> None:
    """Render validation IR for the given feature specs as YAML.

    Parameters
    ----------
    feature_specs
        Feature specs handed to ``generate_validation_ir``.
    output_dir
        Directory that receives one ``<dataset name>.yaml`` file per
        dataset. When falsy (e.g. ``None``), a single YAML document
        holding every dataset is echoed via ``click.echo`` instead.
    """
    validation_ir = generate_validation_ir(feature_specs)

    # No output directory: emit the complete envelope in one go.
    if not output_dir:
        click.echo(validation_ir.to_yaml())
        return

    for entry in validation_ir.datasets:
        target = output_dir / f"{entry.name}.yaml"
        # Create the destination directory on demand; already existing
        # directories are tolerated.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Each file carries its own envelope wrapping exactly one dataset.
        target.write_text(ValidationIR(datasets=[entry]).to_yaml())
124145

125146

126147
def _generate_markdown(

packages/overture-schema-codegen/src/overture/schema/codegen/extraction/type_registry.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"PRIMITIVE_TYPES",
1010
"get_type_mapping",
1111
"is_semantic_newtype",
12+
"is_storage_primitive_source",
1213
"resolve_type_name",
1314
]
1415

@@ -82,6 +83,27 @@ def get_type_mapping(type_name: str) -> TypeMapping | None:
8283
return PRIMITIVE_TYPES.get(type_name)
8384

8485

86+
def is_storage_primitive_source(source_name: str | None) -> bool:
87+
"""Whether a ConstraintSource name refers to a registered storage primitive.
88+
89+
Used by validation renderers to filter out storage-level constraints
90+
(e.g., int32 range) in favor of domain-level constraints.
91+
92+
Parameters
93+
----------
94+
source_name
95+
The NewType or primitive name to check, or None.
96+
97+
Returns
98+
-------
99+
bool
100+
True if source_name is a key in PRIMITIVE_TYPES.
101+
"""
102+
if source_name is None:
103+
return False
104+
return source_name in PRIMITIVE_TYPES
105+
106+
85107
def resolve_type_name(type_info: TypeInfo, target: str) -> str:
86108
"""Resolve a TypeInfo to the base type string for a given target.
87109

packages/overture-schema-codegen/src/overture/schema/codegen/wassirman/__init__.py

Whitespace-only changes.
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Validation IR data types for YAML serialization."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass
6+
7+
import yaml
8+
9+
__all__ = ["ConditionIR", "DatasetIR", "RuleIR", "ValidationIR"]
10+
11+
12+
@dataclass(frozen=True, slots=True)
13+
class ConditionIR:
14+
"""Guard predicate for conditional rules."""
15+
16+
column: str
17+
check: str
18+
value: object | None = None
19+
20+
def to_dict(self) -> dict[str, object]:
21+
"""Serialize to dict, omitting None fields."""
22+
d: dict[str, object] = {"column": self.column, "check": self.check}
23+
if self.value is not None:
24+
d["value"] = self.value
25+
return d
26+
27+
28+
@dataclass(frozen=True, slots=True)
29+
class RuleIR:
30+
"""Single validation rule."""
31+
32+
name: str
33+
check: str
34+
severity: str
35+
column: str | None = None
36+
columns: list[str] | None = None
37+
value: object | None = None
38+
list_columns: list[str] | None = None
39+
when: ConditionIR | None = None
40+
41+
def to_dict(self) -> dict[str, object]:
42+
"""Serialize to dict, omitting None fields."""
43+
d: dict[str, object] = {"name": self.name}
44+
if self.column is not None:
45+
d["column"] = self.column
46+
if self.columns is not None:
47+
d["columns"] = self.columns
48+
d["check"] = self.check
49+
if self.value is not None:
50+
d["value"] = self.value
51+
if self.list_columns is not None:
52+
d["list_columns"] = self.list_columns
53+
if self.when is not None:
54+
d["when"] = self.when.to_dict()
55+
d["severity"] = self.severity
56+
return d
57+
58+
59+
@dataclass(frozen=True, slots=True)
60+
class DatasetIR:
61+
"""Validation rules for one feature type."""
62+
63+
name: str
64+
source_model: str
65+
id_column: str
66+
rules: list[RuleIR]
67+
68+
def to_dict(self) -> dict[str, object]:
69+
"""Serialize to dict."""
70+
return {
71+
"name": self.name,
72+
"source_model": self.source_model,
73+
"id_column": self.id_column,
74+
"rules": [r.to_dict() for r in self.rules],
75+
}
76+
77+
78+
@dataclass(frozen=True, slots=True)
79+
class ValidationIR:
80+
"""Full validation IR envelope."""
81+
82+
datasets: list[DatasetIR]
83+
version: str = "1"
84+
85+
def to_yaml(self) -> str:
86+
"""Serialize to YAML string."""
87+
data = {
88+
"version": self.version,
89+
"datasets": [ds.to_dict() for ds in self.datasets],
90+
}
91+
return yaml.dump(data, default_flow_style=False, sort_keys=False)
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""Validation IR generation pipeline."""
2+
3+
from __future__ import annotations
4+
5+
from collections.abc import Sequence
6+
7+
from ..extraction.model_extraction import expand_model_tree
8+
from ..extraction.specs import FeatureSpec, ModelSpec
9+
from ..extraction.type_analyzer import TypeKind
10+
from .ir import DatasetIR, ValidationIR
11+
from .walker import walk_feature
12+
13+
__all__ = ["generate_validation_ir"]
14+
15+
16+
def _dataset_name(spec: FeatureSpec) -> str:
    """Pick the dataset name for a feature spec.

    A field called ``type`` whose type is a Literal with exactly one value
    supplies the name; when no such field exists, the lower-cased spec
    name is used.
    """
    for candidate in spec.fields:
        # Only a field literally named "type" is of interest.
        if candidate.name != "type":
            continue
        info = candidate.type_info
        if info.kind == TypeKind.LITERAL:
            literals = info.literal_values
            # Empty or multi-valued Literals do not identify a dataset.
            if literals and len(literals) == 1:
                return str(literals[0])
    return spec.name.lower()
24+
25+
26+
def _source_model_fqn(spec: FeatureSpec) -> str:
    """Return the dotted ``module.qualname`` of the spec's source model.

    Falls back to ``spec.name`` when the spec has no source type.
    """
    model = spec.source_type
    if model is not None:
        return ".".join((model.__module__, model.__qualname__))
    return spec.name
32+
33+
34+
def generate_validation_ir(
    feature_specs: Sequence[FeatureSpec],
) -> ValidationIR:
    """Build the validation IR envelope for a set of feature specs.

    Parameters
    ----------
    feature_specs
        Extracted feature specs to convert to validation IR.

    Returns
    -------
    ValidationIR
        Envelope holding one dataset per feature spec, in input order.
    """
    # Expand every model tree before any rule is produced; all expansions
    # share one cache and the return value of expand_model_tree is unused.
    shared_cache: dict[type, ModelSpec] = {}
    for feature in feature_specs:
        expand_model_tree(feature, shared_cache)

    def _to_dataset(feature: FeatureSpec) -> DatasetIR:
        # The dataset name is also passed to the walker.
        dataset_name = _dataset_name(feature)
        feature_rules = walk_feature(feature, dataset_name)
        return DatasetIR(
            name=dataset_name,
            source_model=_source_model_fqn(feature),
            id_column="id",
            rules=feature_rules,
        )

    return ValidationIR(
        datasets=[_to_dataset(feature) for feature in feature_specs],
    )

0 commit comments

Comments
 (0)