Skip to content

Commit 15c5915

Browse files
committed
Add schema validation for shapes
1 parent a103aff commit 15c5915

4 files changed

Lines changed: 31 additions & 2 deletions

File tree

workflow/Snakefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ with open(workflow.source_path("internal/settings.yaml"), "r") as f:
1717
internal = yaml.safe_load(f)
1818

1919
# Python files that are imported from other scripts and need to be included when accessing the module
20-
workflow.source_path("scripts/_script_utils.py")
2120
workflow.source_path("scripts/_geo.py")
21+
workflow.source_path("scripts/_schemas.py")
22+
workflow.source_path("scripts/_script_utils.py")
2223

2324

2425
wildcard_constraints:

workflow/envs/default.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ dependencies:
77
- click=8.2.1
88
- geopandas=1.1.0
99
- pandas=2.3.0
10+
- pandera-geopandas=0.24.0
1011
- pyarrow=19.0.1
1112
- xarray=2025.6.1
1213
- netcdf4=1.7.2
@@ -21,4 +22,4 @@ dependencies:
2122
- pyproj=3.7.1
2223
- utm=0.7.0
2324
- glom=24.11.0
24-
- dask=2025.7.0
25+
- dask=2025.7.0

workflow/scripts/_schemas.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Schemas for tabular data used in the workflow."""
2+
3+
from pandera.pandas import DataFrameModel, Field, check
4+
from pandera.typing.geopandas import GeoSeries
5+
from pandera.typing.pandas import Series
6+
from shapely.geometry import Point
7+
8+
9+
# Pandera schema for the shapes table: declares the columns shape_id,
# country_id, shape_class and geometry, plus an element-wise geometry check.
class Shapes(DataFrameModel):
10+
# Pandera model options: coerce=True asks pandera to cast columns to the
# declared dtypes; strict=False lets columns not declared here pass through.
class Config:
11+
coerce = True
12+
strict = False
13+
14+
# Values must be unique across rows (Field(unique=True)).
shape_id: Series[str] = Field(unique=True)
15+
"Unique ID for this shape."
16+
country_id: Series[str]
17+
"ISO alpha-3 code."
18+
# Only the two listed classes are accepted.
shape_class: Series[str] = Field(isin=["land", "maritime"])
19+
"Shape classifier"
20+
# NOTE(review): annotated with shapely Point, while the attribute string below
# says "Shape polygon." -- confirm which geometry dtype is intended here.
geometry: GeoSeries[Point] = Field()
21+
"Shape polygon."
22+
23+
# Element-wise check on the geometry column: a value passes only when it is
# not None, is not empty and is valid (so it covers validity, not just emptiness).
@check("geometry", element_wise=True)
24+
def geom_not_empty(cls, geom):
25+
return (geom is not None) and (not geom.is_empty) and geom.is_valid

workflow/scripts/breakup_shape.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import click
66
import geopandas as gpd
7+
from _schemas import Shapes
78

89

910
@click.command()
@@ -25,6 +26,7 @@ def breakup_shape(shapes_path, split_by, output_path):
2526
output_path = Path(output_path)
2627
output_path.mkdir(parents=True, exist_ok=True)
2728
shapes = gpd.read_parquet(shapes_path)
29+
shapes = Shapes.validate(shapes)
2830

2931
# Print rows where geometry is empty
3032
if shapes.geometry.is_empty.any():

0 commit comments

Comments
 (0)