Skip to content

Commit f2aea81

Browse files
sethfitz authored and vcschapp committed
feat(codegen): rewrite example pipeline with model instances
Replace dict-walking flatten machinery with Pydantic model-instance traversal. validate_example returns a BaseModel instance; flatten_model_instance walks it via isinstance checks to produce dot-notation key-value pairs, eliminating the need for external schema information (collect_dict_paths). augment_missing_fields adds cross-arm union fields as None. Remove "null" sentinel convention from TOML examples. Pydantic fills None defaults for omitted fields, making the _denull pipeline stage unnecessary. Fix BBox dict validation (missing return in __get_pydantic_core_schema__), BBox flattening via __slots__ property detection, datetime isoformat rendering, and non-string value truncation for Geometry objects. Signed-off-by: Seth Fitzsimmons <sethfitz@amazon.com>
1 parent 52a9a95 commit f2aea81

13 files changed

Lines changed: 496 additions & 703 deletions

File tree

packages/overture-schema-addresses-theme/pyproject.toml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,8 @@ testpaths = ["tests"]
4343
id = "416ab01c-d836-4c4f-aedc-2f30941ce94d"
4444
geometry = "POINT (-176.5637854 -43.9471955)"
4545
country = "NZ"
46-
postcode = "null"
4746
street = "Tikitiki Hill Road"
4847
number = "54"
49-
unit = "null"
50-
postal_city = "null"
5148
version = 1
5249
theme = "addresses"
5350
type = "address"
@@ -67,7 +64,3 @@ value = "Chatham Island"
6764
[[examples.Address.sources]]
6865
property = ""
6966
dataset = "OpenAddresses/LINZ"
70-
record_id = "null"
71-
update_time = "null"
72-
confidence = "null"
73-
between = "null"

packages/overture-schema-base-theme/pyproject.toml

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -60,25 +60,16 @@ ymax = -75.64299774169922
6060
property = ""
6161
dataset = "ETOPO/GLOBathy"
6262
record_id = "2024-12-09T00:00:00.000Z"
63-
update_time = "null"
64-
confidence = "null"
65-
between = "null"
6663

6764
[examples.Bathymetry.cartography]
68-
prominence = "null"
69-
min_zoom = "null"
70-
max_zoom = "null"
7165
sort_key = 12
7266

7367
[[examples.Infrastructure]]
7468
id = "e9e3d506-89c0-3473-8cee-5e5ac6596d6c"
7569
geometry = "POINT (-179.9999994 -82.42408)"
7670
version = 0
77-
level = "null"
7871
subtype = "pedestrian"
7972
class = "information"
80-
height = "null"
81-
surface = "null"
8273
wikidata = "Q800558"
8374
theme = "base"
8475
type = "infrastructure"
@@ -94,13 +85,9 @@ property = ""
9485
dataset = "OpenStreetMap"
9586
record_id = "n7674174803@2"
9687
update_time = "2023-04-07T17:37:48.000Z"
97-
confidence = "null"
98-
between = "null"
9988

10089
[examples.Infrastructure.names]
10190
primary = "1306 km to South Pole"
102-
common = "null"
103-
rules = "null"
10491

10592
[examples.Infrastructure.source_tags]
10693
description = "1036 km to South Pole."
@@ -114,12 +101,9 @@ wikipedia = "en:South Pole Traverse"
114101
id = "70fc3596-a987-3fea-820c-c016c0a2f0da"
115102
geometry = "POINT (-178.7 -85.45)"
116103
version = 0
117-
level = "null"
118104
subtype = "physical"
119105
class = "cliff"
120-
surface = "null"
121106
wikidata = "Q5282342"
122-
elevation = "null"
123107
theme = "base"
124108
type = "land"
125109

@@ -134,13 +118,9 @@ property = ""
134118
dataset = "OpenStreetMap"
135119
record_id = "n11693475112@1"
136120
update_time = "2024-03-05T09:23:39.000Z"
137-
confidence = "null"
138-
between = "null"
139121

140122
[examples.Land.names]
141123
primary = "Dismal Buttress"
142-
common = "null"
143-
rules = "null"
144124

145125
[examples.Land.source_tags]
146126
natural = "cliff"
@@ -164,13 +144,9 @@ ymax = 65.96218872070312
164144
[[examples.LandCover.sources]]
165145
property = ""
166146
dataset = "ESA WorldCover"
167-
record_id = "null"
168147
update_time = "2024-11-07T00:00:00.000Z"
169-
confidence = "null"
170-
between = "null"
171148

172149
[examples.LandCover.cartography]
173-
prominence = "null"
174150
min_zoom = 8
175151
max_zoom = 15
176152
sort_key = 3
@@ -179,12 +155,8 @@ sort_key = 3
179155
id = "1e1f6095-5bd2-3fdb-a422-41351b848e9d"
180156
geometry = "POLYGON ((-176.5623454 -43.9567812, -176.5627644 -43.9561272, -176.5626898 -43.9557432, -176.5624297 -43.9553592, -176.562679 -43.9551603, -176.5629058 -43.9552064, -176.5631441 -43.9551769, -176.5632428 -43.9550676, -176.5633066 -43.9548702, -176.5634402 -43.9548071, -176.5639052 -43.9546682, -176.5642479 -43.9544118, -176.5647302 -43.9542142, -176.5651547 -43.954277, -176.5658293 -43.9545243, -176.5659454 -43.9543521, -176.566934 -43.9547987, -176.5669179 -43.955018, -176.5682465 -43.9553205, -176.5671004 -43.9579593, -176.5662034 -43.9600044, -176.5655366 -43.9597247, -176.5646109 -43.9595326, -176.564467 -43.9592563, -176.5639885 -43.9589226, -176.5637013 -43.9586925, -176.563223 -43.9586237, -176.5623454 -43.9567812))"
181157
version = 0
182-
level = "null"
183158
subtype = "golf"
184159
class = "golf_course"
185-
surface = "null"
186-
wikidata = "null"
187-
elevation = "null"
188160
theme = "base"
189161
type = "land_use"
190162

@@ -199,13 +171,9 @@ property = ""
199171
dataset = "OpenStreetMap"
200172
record_id = "w56117029@3"
201173
update_time = "2010-04-24T22:35:13.000Z"
202-
confidence = "null"
203-
between = "null"
204174

205175
[examples.LandUse.names]
206176
primary = "Chatham Islands Golf Club"
207-
common = "null"
208-
rules = "null"
209177

210178
[examples.LandUse.source_tags]
211179
"LINZ:source_version" = "V16"
@@ -217,12 +185,9 @@ source_ref = "http://www.linz.govt.nz/topography/topo-maps/"
217185
id = "6bbb5fe5-bf26-3efa-b120-0a7079b60840"
218186
geometry = "POINT (-177.031799 -84.934793)"
219187
version = 0
220-
level = "null"
221188
subtype = "physical"
222189
class = "cape"
223190
wikidata = "Q33140589"
224-
is_salt = "null"
225-
is_intermittent = "null"
226191
theme = "base"
227192
type = "water"
228193

@@ -237,13 +202,9 @@ property = ""
237202
dataset = "OpenStreetMap"
238203
record_id = "n11109190647@2"
239204
update_time = "2024-02-11T05:52:05.000Z"
240-
confidence = "null"
241-
between = "null"
242205

243206
[examples.Water.names]
244207
primary = "Thanksgiving Point"
245-
common = "null"
246-
rules = "null"
247208

248209
[examples.Water.source_tags]
249210
natural = "cape"

packages/overture-schema-buildings-theme/pyproject.toml

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -42,25 +42,8 @@ packages = ["src/overture"]
4242
id = "148f35b1-7bc1-4180-9280-10d39b13883b"
4343
geometry = "POLYGON ((-176.6435004 -43.9938042, -176.6435738 -43.9937107, -176.6437726 -43.9937913, -176.6436992 -43.9938849, -176.6435004 -43.9938042))"
4444
version = 1
45-
level = "null"
46-
subtype = "null"
47-
class = "null"
48-
height = "null"
49-
names = "null"
5045
has_parts = false
5146
is_underground = false
52-
num_floors = "null"
53-
num_floors_underground = "null"
54-
min_height = "null"
55-
min_floor = "null"
56-
facade_color = "null"
57-
facade_material = "null"
58-
roof_material = "null"
59-
roof_shape = "null"
60-
roof_direction = "null"
61-
roof_orientation = "null"
62-
roof_color = "null"
63-
roof_height = "null"
6447
theme = "buildings"
6548
type = "building"
6649

@@ -75,29 +58,13 @@ property = ""
7558
dataset = "OpenStreetMap"
7659
record_id = "w519166507@1"
7760
update_time = "2017-08-27T21:39:50.000Z"
78-
confidence = "null"
79-
between = "null"
8061

8162
[[examples.BuildingPart]]
8263
id = "19412d64-51ac-3d6a-ac2f-8a8c8b91bb60"
8364
geometry = "POLYGON ((-73.2462509 -39.8108937, -73.2462755 -39.8109047, -73.246291 -39.8109182, -73.2463022 -39.8109382, -73.2463039 -39.810959, -73.2462962 -39.81098, -73.2462796 -39.8109977, -73.2462674 -39.8110052, -73.2462281 -39.8110153, -73.2461998 -39.811013, -73.2461743 -39.8110034, -73.2461566 -39.8109898, -73.246144 -39.8109702, -73.2461418 -39.8109427, -73.2461511 -39.8109221, -73.2461669 -39.8109066, -73.2461908 -39.8108947, -73.2462184 -39.8108898, -73.2462509 -39.8108937))"
8465
version = 0
8566
level = 3
86-
height = "null"
87-
names = "null"
8867
is_underground = false
89-
num_floors = "null"
90-
num_floors_underground = "null"
91-
min_height = "null"
92-
min_floor = "null"
93-
facade_color = "null"
94-
facade_material = "null"
95-
roof_material = "null"
96-
roof_shape = "null"
97-
roof_direction = "null"
98-
roof_orientation = "null"
99-
roof_color = "null"
100-
roof_height = "null"
10168
building_id = "bd663bd4-1844-4d7d-a400-114de051cf49"
10269
theme = "buildings"
10370
type = "building_part"
@@ -113,5 +80,3 @@ property = ""
11380
dataset = "OpenStreetMap"
11481
record_id = "w223076787@2"
11582
update_time = "2014-10-31T22:55:36.000Z"
116-
confidence = "null"
117-
between = "null"

packages/overture-schema-codegen/docs/design.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,10 @@ syntax. Extraction and the type registry carry no presentation logic.
213213
### Type registry
214214

215215
`extraction/type_registry.py` maps type names to per-target string representations via
216-
`TypeMapping`. `format_type_string()` wraps the resolved name with list/optional
217-
qualifiers. `is_semantic_newtype()` distinguishes NewTypes that deserve their own
218-
identity (like `FeatureVersion` wrapping `int32`) from pass-through aliases to
219-
registered primitives.
216+
`TypeMapping`. `resolve_type_name()` looks up the registry and returns the display
217+
string for a given target. `is_semantic_newtype()` distinguishes NewTypes that deserve
218+
their own identity (like `FeatureVersion` wrapping `int32`) from pass-through aliases
219+
to registered primitives.
220220

221221
### Markdown renderer
222222

@@ -240,11 +240,12 @@ Loads example data from theme `pyproject.toml` files, validates against Pydantic
240240
and flattens to dot-notation rows for display in feature pages. Also provides a starting
241241
point for generated test data.
242242

243-
`collect_dict_paths` walks the `FieldSpec` tree to identify dict-typed fields (like
244-
`tags: dict[str, str]`), returning their dot-paths as a `frozenset`. `flatten_example`
245-
checks this set before recursing into dicts -- paths in the set are kept as leaf values
246-
rather than being split into dot-notation rows. The pipeline computes `dict_paths` from
247-
`spec.fields` and threads it through `load_examples`.
243+
`validate_example` returns a Pydantic model instance. `flatten_model_instance` walks the
244+
instance recursively using `isinstance(value, BaseModel)` to distinguish model fields
245+
(recurse with dot notation) from dict fields (keep as leaf values). This eliminates the
246+
need for external schema information -- the model instance itself encodes the type
247+
structure. `augment_missing_fields` appends `(name, None)` entries for union cross-arm
248+
fields absent from the concrete variant instance.
248249

249250
## Extension Points
250251

packages/overture-schema-codegen/docs/walkthrough.md

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -594,36 +594,36 @@ schema.
594594
`resolve_pyproject_path` walks up from a model's module file to find `pyproject.toml`.
595595
`load_examples_from_toml` reads the `[examples.ModelName]` TOML section.
596596

597-
Validation requires three preprocessing steps that handle TOML's limitations and
598-
flat-schema conventions.
599-
600-
TOML has no null literal, so examples use the string `"null"` as a stand-in. `_denull`
601-
replaces these recursively, walking nested dicts and lists.
597+
Validation requires two preprocessing steps that handle flat-schema conventions.
602598

603599
Literal fields (like `theme="buildings"`) are omitted from examples since they carry
604600
constant values. `_inject_literal_fields` adds them back before validation by scanning
605601
`model_fields` for single-value `Literal` annotations via `single_literal_value`.
606602

607-
Discriminated union examples from flat parquet schemas include null fields from
603+
Discriminated union examples from flat Parquet schemas include null fields from
608604
non-selected variant arms. `_strip_null_unknown_fields` removes null-valued fields not
609605
in the common base's field set, so the selected arm's validator accepts the data without
610606
choking on fields that belong to sibling variants.
611607

612-
`collect_dict_paths` walks the `FieldSpec` tree to identify dict-typed fields (like
613-
`tags: dict[str, str]`), returning their dot-paths as a `frozenset`. Schema-notation
614-
paths use empty brackets (`items[].tags`) while runtime paths carry indices
615-
(`items[0].tags`); `_normalize_path` strips indices before membership checks.
608+
`validate_example` returns a Pydantic model instance. `flatten_model_instance` walks the
609+
instance recursively using `isinstance(value, BaseModel)` to distinguish model fields
610+
(recurse with dot notation) from dict fields (keep as leaf values). Lists of models
611+
use bracket notation (`sources[0].dataset`), nested lists use double-index notation
612+
(`hierarchies[0][1].name`). The model instance itself encodes the type structure,
613+
eliminating the need for external schema information.
614+
615+
For discriminated unions, the concrete variant instance lacks fields from other arms.
616+
`augment_missing_fields` compares base field names against the union's merged field list
617+
and appends `(name, None)` for absent fields, matching the flat Parquet schema where all
618+
variant columns exist.
616619

617-
`flatten_example` converts nested dicts to dot-notation. Nested dicts become
618-
`parent.child`, lists of dicts become `parent[0].child`. Dicts at paths in `dict_paths`
619-
are kept as leaf values -- a `tags` field typed as `dict[str, str]` renders as a whole
620-
map rather than being split into `tags.color`, `tags.size`. `order_example_rows` sorts by
621-
field position in the documentation's field order using a stable sort, so sub-fields
622-
maintain their original relative order.
620+
`order_example_rows` sorts by field position in the documentation's field order using a
621+
stable sort, so sub-fields maintain their original relative order.
623622

624623
`load_examples` orchestrates the full flow: find the pyproject.toml, load the TOML
625-
section, validate each example, flatten, and order. Invalid examples log a warning and
626-
skip rather than failing the pipeline.
624+
section, validate each example, flatten via `flatten_model_instance`, augment missing
625+
fields, and order. Invalid examples log a warning and skip rather than failing the
626+
pipeline.
627627

628628
## 16. Orchestration and CLI
629629

@@ -739,9 +739,9 @@ sources appear on the source NewType's page instead.
739739

740740
The example loader finds `pyproject.toml` in the transportation theme package, reads
741741
`[examples.Segment]`, validates each example against the union alias (injecting literal
742-
fields, stripping null fields from non-selected arms), computes `dict_paths` from
743-
`spec.fields` to identify dict-typed fields, flattens to dot-notation (keeping dict-typed
744-
fields as leaf values), and orders by field position.
742+
fields, stripping null fields from non-selected arms), flattens the model instance to
743+
dot-notation via `flatten_model_instance`, augments missing cross-arm fields, and orders
744+
by field position.
745745

746746
The Jinja2 template assembles the field table, optional constraints section, examples,
747747
and "Used By" partial into markdown.

0 commit comments

Comments
 (0)