Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion src/geozarr_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def create_parser() -> argparse.ArgumentParser:
validate_parser.add_argument(
"--conventions",
nargs="+",
choices=["spatial", "proj", "multiscales"],
choices=["spatial", "proj", "multiscales", "geoemb"],
help="Conventions to validate (auto-detected if not specified)",
)
validate_parser.add_argument(
Expand Down Expand Up @@ -233,6 +233,26 @@ def info_command(args: argparse.Namespace) -> int:
print(f" - {asset}")
print()

if "geoemb" in conventions:
print("Geoembeddings:")
print(f" Type: {attrs.get('geoemb:type')}")
print(f" Dimensions: {attrs.get('geoemb:dimensions')}")
print(f" Model: {attrs.get('geoemb:model')}")
source_data = attrs.get("geoemb:source_data", [])
print(f" Source data: {len(source_data)} reference(s)")
print(f" Data type: {attrs.get('geoemb:data_type')}")
if attrs.get("geoemb:gsd"):
print(f" GSD: {attrs['geoemb:gsd']}m")
if attrs.get("geoemb:spatial_layout"):
print(f" Spatial layout: {attrs['geoemb:spatial_layout']}")
if attrs.get("geoemb:chip_layout"):
cl = attrs["geoemb:chip_layout"]
print(f" Chip layout: {cl.get('layout_type')} {cl.get('chip_size')}")
if attrs.get("geoemb:quantization"):
q = attrs["geoemb:quantization"]
print(f" Quantization: {q.get('method')} (from {q.get('original_dtype')})")
print()

if args.verbose:
print("Members:")
for name, item in group.items():
Expand Down
16 changes: 16 additions & 0 deletions src/geozarr_toolkit/conventions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@
"""

from geozarr_toolkit.conventions.common import ZarrConventionMetadata
from geozarr_toolkit.conventions.geoemb import (
GEOEMB_SCHEMA_URL,
GEOEMB_SPEC_URL,
GEOEMB_UUID,
ChipLayout,
Geoemb,
GeoembConventionMetadata,
Quantization,
)
from geozarr_toolkit.conventions.multiscales import (
MULTISCALES_SCHEMA_URL,
MULTISCALES_SPEC_URL,
Expand Down Expand Up @@ -61,6 +70,9 @@
)

__all__ = [
"GEOEMB_SCHEMA_URL",
"GEOEMB_SPEC_URL",
"GEOEMB_UUID",
"MULTISCALES_SCHEMA_URL",
"MULTISCALES_SPEC_URL",
"MULTISCALES_UUID",
Expand All @@ -70,12 +82,16 @@
"SPATIAL_SCHEMA_URL",
"SPATIAL_SPEC_URL",
"SPATIAL_UUID",
"ChipLayout",
"Geoemb",
"GeoembConventionMetadata",
"GeoProj",
"Multiscales",
"MultiscalesAttrs",
"MultiscalesConventionMetadata",
"Proj",
"ProjConventionMetadata",
"Quantization",
"ScaleLevel",
"Spatial",
"SpatialConventionMetadata",
Expand Down
211 changes: 211 additions & 0 deletions src/geozarr_toolkit/conventions/geoemb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""
Models for the Geoembeddings Zarr Convention.

This convention defines metadata for geospatial embedding groups stored in
Zarr format, including encoder model provenance, source data references,
and processing parameters.

Specification: https://github.com/geo-embeddings/embeddings-zarr-convention
"""

from __future__ import annotations

from typing import Annotated, Final, Literal

from pydantic import BaseModel, Field, model_validator

from geozarr_toolkit.conventions.common import ZarrConventionMetadata, is_none

# Identifier of the geoemb: convention. The Literal type pins the exact string
# so the constant can serve as the default of a Literal-typed field.
GEOEMB_UUID: Final[Literal["61c12cc5-0e28-4056-999a-480cf3fb7e4c"]] = (
"61c12cc5-0e28-4056-999a-480cf3fb7e4c"
)
# Location of the convention's JSON schema.
# NOTE(review): this points at blob/main of the convention repository rather
# than a versioned artifact — confirm the canonical URL once releases exist.
GEOEMB_SCHEMA_URL: Final[str] = (
"https://github.com/geo-embeddings/embeddings-zarr-convention/blob/main/schema.json"
)
# Location of the human-readable specification (the repository README).
# NOTE(review): same caveat as the schema URL — unversioned link, to be confirmed.
GEOEMB_SPEC_URL: Final[str] = (
"https://github.com/geo-embeddings/embeddings-zarr-convention/blob/main/README.md"
)
Comment on lines +23 to +27
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not actually sure what to do about these links (they don't feel like the right thing to use). The README of the convention references v1 URLs, but those don't actually exist yet, so those links return 404.

https://github.com/geo-embeddings/embeddings-zarr-convention



class GeoembConventionMetadata(ZarrConventionMetadata):
    """Describe the ``geoemb:`` convention as an entry of ``zarr_conventions``.

    Every field has a default, so the entry can be created without
    arguments. ``uuid`` and ``name`` are typed as literals and therefore only
    accept the values shown here.
    """

    uuid: Literal["61c12cc5-0e28-4056-999a-480cf3fb7e4c"] = GEOEMB_UUID
    name: Literal["geoemb:"] = "geoemb:"
    schema_url: str = GEOEMB_SCHEMA_URL
    spec_url: str = GEOEMB_SPEC_URL
    description: str = (
        "Geoembeddings convention for geospatial embedding arrays "
        "with model provenance"
    )


class ChipLayout(BaseModel):
    """
    How source imagery was cut into chips (patches) for chip-type embeddings.

    Unknown keys are rejected (``extra="forbid"``). Optional fields left at
    ``None`` are declared with ``exclude_if=is_none``.

    Attributes
    ----------
    layout_type : str
        Either "regular_grid" or "irregular".
    chip_size : list[int]
        Exactly two positive integers, [height, width], in pixels.
    stride : list[int] | None
        Step between chips as [y, x]. Documented to default to chip_size
        when not specified.
    grid_id : str | None
        Identifier for a predefined grid system.
    grid_definition : str | None
        URL to grid definition document.
    """

    model_config = {"extra": "forbid"}

    layout_type: Literal["regular_grid", "irregular"]
    chip_size: list[int] = Field(min_length=2, max_length=2)
    stride: list[int] | None = Field(default=None, exclude_if=is_none)
    grid_id: str | None = Field(default=None, exclude_if=is_none)
    grid_definition: str | None = Field(default=None, exclude_if=is_none)

    @model_validator(mode="after")
    def validate_chip_size_positive(self) -> ChipLayout:
        """Reject any chip_size entry smaller than 1."""
        every_entry_positive = all(extent >= 1 for extent in self.chip_size)
        if not every_entry_positive:
            raise ValueError("chip_size values must be positive integers")
        return self

    @model_validator(mode="after")
    def validate_stride_length(self) -> ChipLayout:
        """Require a provided stride to hold exactly two entries."""
        step = self.stride
        if step is None:
            # Nothing to check when no stride was supplied.
            return self
        if len(step) != 2:
            raise ValueError("stride must have exactly 2 elements [y, x]")
        return self


class ScaleScalar(BaseModel):
    """
    Single scale factor (plus optional offset) for linear dequantization.

    Original values are recovered as ``quantized * scale + offset``.
    ``offset`` falls back to 0.0 when omitted; unknown keys are rejected.
    """

    model_config = {"extra": "forbid"}

    type: Literal["scalar"]
    scale: float
    offset: float = 0.0


class ScaleArray(BaseModel):
    """
    Scale factors that vary per pixel and live in a separate Zarr array.

    Original values are recovered element-wise:
    ``value[..., y, x] = quantized[..., y, x] * array[..., y, x]``.
    Non-finite entries (NaN, +inf) in the scale array mark no-data pixels.
    Unknown keys are rejected.
    """

    model_config = {"extra": "forbid"}

    type: Literal["array"]
    array_name: str
    nodata: float | str | None = Field(default=None, exclude_if=is_none)


# Union of the two scale encodings; the "type" field is the discriminator that
# selects between ScaleScalar ("scalar") and ScaleArray ("array").
Scale = Annotated[ScaleScalar | ScaleArray, Field(discriminator="type")]


class Quantization(BaseModel):
    """
    How stored embeddings were quantized/compressed.

    Only ``method`` and ``original_dtype`` are mandatory. Unknown keys are
    rejected (``extra="forbid"``).

    Attributes
    ----------
    method : str
        Name of the quantization method, e.g. "linear", "per_pixel_scale",
        "product_quantization" or "binary".
    original_dtype : str
        Data type before quantization (e.g., "float32").
    quantized_dtype : str | None
        Data type after quantization (e.g., "int8").
    scale : ScaleScalar | ScaleArray | None
        Parameters needed to dequantize, chosen by their "type" field.
    link : str | None
        URL to quantization codebook or lookup table.
    """

    model_config = {"extra": "forbid"}

    method: str
    original_dtype: str
    quantized_dtype: str | None = Field(default=None, exclude_if=is_none)
    scale: Scale | None = Field(default=None, exclude_if=is_none)
    link: str | None = Field(default=None, exclude_if=is_none)


class Geoemb(BaseModel):
    """
    Attributes of the Geoembeddings convention as stored on a Zarr group.

    Each field is aliased to its ``geoemb:``-prefixed attribute key. The
    model can be populated by either field name or alias and serializes by
    alias. Extra keys are allowed.

    Attributes
    ----------
    type : str
        "pixel" for per-pixel embeddings, "chip" for image patch embeddings.
        Required.
    dimensions : int
        Length of the embedding vector; at least 1. Required.
    model : str
        URL reference to the encoder model that produced the embeddings.
        Required.
    source_data : list[str]
        URL references to the source datasets; at least one item. Required.
    data_type : str
        Data type of stored embeddings (e.g., "float32", "int8"). Required.
    gsd : float | None
        Ground sample distance in meters.
    chip_layout : ChipLayout | None
        Chip layout configuration; mandatory when ``type`` is "chip".
    quantization : Quantization | None
        Compression/quantization details.
    spatial_layout : str | None
        Spatial organization scheme: "utm_zones" or "global".
    build_version : str | None
        Version of the software that built this store.
    benchmark : list[str] | None
        URLs to benchmark evaluation results.
    """

    model_config = {
        "extra": "allow",
        "populate_by_name": True,
        "serialize_by_alias": True,
    }

    # Required attributes.
    type: Literal["pixel", "chip"] = Field(alias="geoemb:type")
    dimensions: int = Field(alias="geoemb:dimensions", ge=1)
    model: str = Field(alias="geoemb:model")
    source_data: list[str] = Field(alias="geoemb:source_data", min_length=1)
    data_type: str = Field(alias="geoemb:data_type")

    # Optional attributes.
    gsd: float | None = Field(default=None, alias="geoemb:gsd", exclude_if=is_none)
    chip_layout: ChipLayout | None = Field(
        default=None, alias="geoemb:chip_layout", exclude_if=is_none
    )
    quantization: Quantization | None = Field(
        default=None, alias="geoemb:quantization", exclude_if=is_none
    )
    spatial_layout: Literal["utm_zones", "global"] | None = Field(
        default=None, alias="geoemb:spatial_layout", exclude_if=is_none
    )
    build_version: str | None = Field(
        default=None, alias="geoemb:build_version", exclude_if=is_none
    )
    benchmark: list[str] | None = Field(
        default=None, alias="geoemb:benchmark", exclude_if=is_none
    )

    @model_validator(mode="after")
    def validate_chip_layout_required(self) -> Geoemb:
        """Require chip_layout whenever the embedding type is 'chip'."""
        if self.type != "chip" or self.chip_layout is not None:
            # Pixel embeddings, or chip embeddings that carry a layout.
            return self
        raise ValueError(
            "geoemb:chip_layout is required when geoemb:type is 'chip'"
        )
2 changes: 2 additions & 0 deletions src/geozarr_toolkit/helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from geozarr_toolkit.helpers.validation import (
detect_conventions,
validate_attrs,
validate_geoemb,
validate_group,
validate_multiscales,
validate_multiscales_structure,
Expand All @@ -37,6 +38,7 @@
"from_geotransform",
"from_rioxarray",
"validate_attrs",
"validate_geoemb",
"validate_group",
"validate_multiscales",
"validate_multiscales_structure",
Expand Down
Loading
Loading