diff --git a/.github/workflows/generate.yml b/.github/workflows/generate.yml
new file mode 100644
index 0000000..afb8517
--- /dev/null
+++ b/.github/workflows/generate.yml
@@ -0,0 +1,49 @@
+# Build the specification site with Metanorma and publish it from main.
+name: generate
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    paths-ignore:
+      - .gitlab-ci.yml
+      - .github/workflows/create_swg_report.yml
+  workflow_dispatch:
+
+jobs:
+  generate-html:
+    name: Generate site with Metanorma container
+    runs-on: ubuntu-latest
+    # The deploy step pushes to the gh-pages branch with GITHUB_TOKEN.
+    permissions:
+      contents: write
+    container:
+      image: metanorma/metanorma:latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          token: ${{ github.token }}
+          submodules: true # ❓ Set to 'false' if submodules are not used
+
+      # Optional: cache Fontist fonts (used by Metanorma for PDF generation)
+      - uses: actions/cache@v3
+        with:
+          path: /root/.fontist
+          key: fontist-${{ runner.os }}
+          restore-keys: fontist-${{ runner.os }}
+
+      - name: Generate site
+        env:
+          METANORMA_DEBUG: "1"
+        run: |
+          metanorma site generate --agree-to-terms
+
+      # Publish only from main; pull_request runs (including forks, whose
+      # token is read-only) build the site without deploying it.
+      - name: Deploy to GitHub Pages
+        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./_site
diff --git a/metanorma.yml b/metanorma.yml
new file mode 100644
index 0000000..f4ba3fc
--- /dev/null
+++ b/metanorma.yml
@@ -0,0 +1,12 @@
+---
+metanorma:
+  source:
+    plantuml:
+      enabled: true
+      format: svg
+    files:
+      - standard/template/geozarr-spec.adoc
+
+  collection:
+    organization: "OGC"
+    name: "OGC GeoZarr Specification"
\ No newline at end of file
diff --git a/standard/template/document.adoc b/standard/template/geozarr-spec.adoc
similarity index 75%
rename from standard/template/document.adoc
rename to standard/template/geozarr-spec.adoc
index 6d3e876..5e4c5d5 100644
--- a/standard/template/document.adoc
+++ b/standard/template/geozarr-spec.adoc
@@ -10,18 +10,20 @@
 :received-date: 2029-03-30
 :issued-date: 2029-03-30
 :published-date: 2029-03-30
-:fullname: Editor One
-:fullname_2: Editor Two
-:docsubtype: Interface
+:fullname: Christophe Noël +:fullname_2: Brianna Pagán +:docsubtype: Format :keywords: ogcdoc, OGC document, API, openapi, html :submitting-organizations: Organization One; Organization Two :mn-document-class: ogc :mn-output-extensions: xml,html,doc,pdf :local-cache-only: +:plantuml-format: svg +:plantuml-opts: debug :data-uri-image: -:pdf-uri: ./document.pdf -:xml-uri: ./document.xml -:doc-uri: ./document.doc +:pdf-uri: ./geozarr-spec.pdf +:xml-uri: ./geozarr-spec.xml +:doc-uri: ./geozarr-spec.doc :edition: 1.0 //// @@ -41,9 +43,13 @@ include::sections/clause_5_conventions.adoc[] include::sections/clause_6_informative_text.adoc[] -include::sections/clause_7_normative_text.adoc[] +include::sections/clause_7_unified_data_model.adoc[] -include::sections/clause_8_media_types.adoc[] +include::sections/clause_8_conformance.adoc[] + +include::sections/clause_9_zarr_encoding.adoc[] + +include::sections/clause_10_geotiff_encoding.adoc[] //// add or remove annexes after "A" as necessary diff --git a/standard/template/sections/clause_0_front_material.adoc b/standard/template/sections/clause_0_front_material.adoc index 407f411..b9f7975 100644 --- a/standard/template/sections/clause_0_front_material.adoc +++ b/standard/template/sections/clause_0_front_material.adoc @@ -1,77 +1,32 @@ .Preface -[NOTE] -==== -Insert Preface Text here. Give OGC specific commentary: describe the technical content, reason for document, history of the document and precursors, and plans for future work. -==== +The GeoZarr Unified Data Model and Encoding Standard defines a layered, standards-based framework for representing and encoding geospatial and scientific datasets in the Zarr format. It integrates foundational specifications such as the Unidata Common Data Model (CDM), the CF Conventions, and selected OGC and community standards to enable semantic, structural, and operational interoperability across Earth observation platforms and geospatial ecosystems. 
-//// -*OGC Declaration* -//// +This Standard introduces a unified model that harmonises metadata structures, array-based data representations, coordinate referencing, and multiscale tiling semantics. It provides a coherent framework that facilitates encoding into Zarr v2 and v3, supporting scalable, cloud-native workflows. -Attention is drawn to the possibility that some of the elements of this document may be the subject of patent rights. The Open Geospatial Consortium shall not be held responsible for identifying any or all such patent rights. +The purpose of this document is to provide implementation guidance and normative structure for consistent, interoperable adoption of GeoZarr across tools, platforms, and services. This work extends prior standardisation efforts within the OGC, including OGC API – Tiles, the Tile Matrix Set Standard, and EO metadata conventions, and anticipates integration with catalogue systems such as STAC. -Recipients of this document are requested to submit, with their comments, notification of any relevant patent claims or other intellectual property rights of which they may be aware that might be infringed by any implementation of the standard set forth in this document, and to provide supporting documentation. - -//// -NOTE: Uncomment ISO section if necessary - -*ISO Declaration* - -ISO (the International Organization for Standardization) is a worldwide federation of national standards bodies (ISO member bodies). The work of preparing International Standards is normally carried out through ISO technical committees. Each member body interested in a subject for which a technical committee has been established has the right to be represented on that committee. International organizations, governmental and non-governmental, in liaison with ISO, also take part in the work. ISO collaborates closely with the International Electrotechnical Commission (IEC) on all matters of electrotechnical standardization. 
- -International Standards are drafted in accordance with the rules given in the ISO/IEC Directives, Part 2. - -The main task of technical committees is to prepare International Standards. Draft International Standards adopted by the technical committees are circulated to the member bodies for voting. Publication as an International Standard requires approval by at least 75 % of the member bodies casting a vote. - -Attention is drawn to the possibility that some of the elements of this document may be the subject of patent rights. ISO shall not be held responsible for identifying any or all such patent rights. -//// +This Standard has been developed in collaboration with contributors from Earth observation, climate science, geospatial analysis, and cloud-native geodata infrastructure communities. Future work may extend this model to additional storage formats, API services, and semantic layers. [abstract] == Abstract - - - - -== Preface - -[NOTE] -==== -Insert Preface Text here. Give OGC specific commentary: describe the technical content, reason for document, history of the document and precursors, and plans for future work. > -Attention is drawn to the possibility that some of the elements of this document may be the subject of patent rights. The Open Geospatial Consortium shall not be held responsible for identifying any or all such patent rights. - -Recipients of this document are requested to submit, with their comments, notification of any relevant patent claims or other intellectual property rights of which they may be aware that might be infringed by any implementation of the standard set forth in this document, and to provide supporting documentation. -==== - -== Security considerations - -//If no security considerations have been made for this Standard, use the following text. - -No security considerations have been made for this Standard. 
- -//// -If security considerations have been made for this Standard, follow the examples found in IANA or IETF documents. Please see the following example. -“VRRP is designed for a range of internetworking environments that may employ different security policies. The protocol includes several authentication methods ranging from no authentication, simple clear text passwords, and strong authentication using IP Authentication with MD5 HMAC. The details on each approach including possible attacks and recommended environments follows. -Independent of any authentication type VRRP includes a mechanism (setting TTL=255, checking on receipt) that protects against VRRP packets being injected from another remote network. This limits most vulnerabilities to local attacks. -NOTE: The security measures discussed in the following sections only provide various kinds of authentication. No confidentiality is provided at all. This should be explicitly described as outside the scope....” -//// +The GeoZarr Unified Data Model and Encoding Standard specifies a conceptual and implementation framework for representing multidimensional, geospatial datasets using the Zarr format. This Standard builds upon the Unidata Common Data Model (CDM) and the Climate and Forecast (CF) Conventions, and introduces interoperable constructs for tiling, georeferencing, and metadata integration. +The model defines core elements—dimensions, coordinate variables, data variables, attributes—and optional extensions for multi-resolution overviews, affine geotransforms, and STAC metadata. Encoding guidance is provided for Zarr Version 2 and Zarr Version 3, including chunking, group hierarchy, and metadata conventions. +GeoZarr aims to bridge scientific and geospatial communities by enabling round-trip transformations with formats such as NetCDF and GeoTIFF, and supporting compatibility with tools in the scientific Python and geospatial ecosystems. 
This Standard enables scalable, standards-compliant, and semantically rich data structures for cloud-native Earth observation applications. == Submitters All questions regarding this submission should be directed to the editor or the submitters: +.Table of submitters +[%unnumbered] |=== -|*Name* |*Affiliation* -| | -|=== - -== Contributors - -//This clause is optional. - -Additional contributors to this Standard include the following: - -Individual name(s), Organization +| *Name* | *Affiliation* +|Christophe Noël _(editor)_ | Spacebel +|Brianna Pagán _(editor)_ | DevSeed +|Ryan Abernathey| EarthMover +| TBD | TBD +|=== \ No newline at end of file diff --git a/standard/template/sections/clause_10_geotiff_encoding.adoc b/standard/template/sections/clause_10_geotiff_encoding.adoc new file mode 100644 index 0000000..94d88dc --- /dev/null +++ b/standard/template/sections/clause_10_geotiff_encoding.adoc @@ -0,0 +1,3 @@ +== Unified Data Model Encoding for GeoTIFF + +TIP: This is a very preliminary draft. The content is primarily for demonstrating the purpose of the proposed sections. diff --git a/standard/template/sections/clause_1_scope.adoc b/standard/template/sections/clause_1_scope.adoc index d21cf4b..93a5d91 100644 --- a/standard/template/sections/clause_1_scope.adoc +++ b/standard/template/sections/clause_1_scope.adoc @@ -1,5 +1,7 @@ == Scope -[NOTE] -==== -Insert Scope text here. Give the subject of the document and the aspects of that scope covered by the document. -==== + +The GeoZarr Unified Data Model and Encoding Standard defines a conceptual and implementation framework for representing and encoding geospatial and scientific datasets using the Zarr format. The scope of this Standard includes the definition of a format-agnostic unified data model, the specification of its encoding into Zarr Version 2 and Version 3, and the establishment of extension points to support interoperability with external metadata and tiling standards. 
+ +This Standard addresses the needs of Earth observation, environmental monitoring, and geospatial analysis applications that require efficient, scalable access to multidimensional datasets. It enables the harmonisation of existing data models, such as the Unidata Common Data Model (CDM) and the Climate and Forecast (CF) Conventions, with operational encoding formats suitable for cloud-native storage and analysis. + +Typical use cases include the storage, transformation, discovery, and processing of raster and gridded data, data cubes with temporal or vertical dimensions, and catalogue-enabled datasets integrated with metadata standards such as STAC and OGC Tile Matrix Sets. diff --git a/standard/template/sections/clause_2_conformance.adoc b/standard/template/sections/clause_2_conformance.adoc index 82f3cff..52e1a7c 100644 --- a/standard/template/sections/clause_2_conformance.adoc +++ b/standard/template/sections/clause_2_conformance.adoc @@ -1,16 +1,49 @@ == Conformance -This standard defines XXXX. -Requirements for N standardization target types are considered: +The GeoZarr Unified Data Model is structured around a modular set of requirements classes. These classes define the conformance criteria for datasets and implementations adopting the GeoZarr specification. Each class provides a distinct set of structural or semantic expectations, facilitating interoperability across a broad spectrum of geospatial and scientific use cases. -* AAAA -* BBBB +The *Core* requirements class defines the minimal compliance necessary to claim conformance with the GeoZarr Unified Data Model. It is intentionally open and permissive, supporting incremental adoption and broad compatibility with existing Zarr tools and data models based on the Unidata Common Data Model (CDM). -Conformance with this standard shall be checked using all the relevant tests specified in Annex A (normative) of this document. 
The framework, concepts, and methodology for testing, and the criteria to be achieved to claim conformance are specified in the OGC Compliance Testing Policies and Procedures and the OGC Compliance Testing web site. +Additional requirements classes are defined to support enhanced functionality, semantic richness, and interoperability with established geospatial conventions and systems. These include extensions for time series, coordinate systems, affine transformations, and multiscale tiling. -In order to conform to this OGC® interface standard, a software implementation shall choose to implement: +.Requirements Classes Overview +[cols="30,40,30", options="header"] +|=== +|Requirements Class | Description | Identifier -* Any one of the conformance levels specified in Annex A (normative). -* Any one of the Distributed Computing Platform profiles specified in Annexes TBD through TBD (normative). +|Core Model +|Specifies minimum conformance for encoding multidimensional datasets in Zarr using CDM-aligned constructs. Includes dimensions, variables, attributes, and groups. +|`http://www.opengis.net/spec/geozarr/1.0/conf/core` -All requirements-classes and conformance-classes described in this document are owned by the standard(s) identified. +|Time Series Support +|Defines conventions for temporal dimensions and time coordinate variables to support time-aware arrays. +|`http://www.opengis.net/spec/geozarr/1.0/conf/time` + +|Coordinate Reference Systems +|Specifies use of CF-compliant CRS metadata, including `grid_mapping`, `standard_name`, and EPSG codes. +|`http://www.opengis.net/spec/geozarr/1.0/conf/crs` + +|GeoTransform Metadata +|Enables affine spatial referencing via GDAL-compatible `GeoTransform` metadata and optional interpolation hints. +|`http://www.opengis.net/spec/geozarr/1.0/conf/geotransform` + +|Multiscale Overviews +|Specifies multiscale tiled layout using zoom levels and Tile Matrix Sets as per OGC API – Tiles. 
+|`http://www.opengis.net/spec/geozarr/1.0/conf/overviews` + +|STAC Metadata Integration +|Allows embedding or referencing of STAC Collection/Item metadata for discovery and indexing. +|`http://www.opengis.net/spec/geozarr/1.0/conf/stac` + +|Projection Coordinates +|Supports encoding of data in projected coordinate systems and association with spatial reference metadata. +|`http://www.opengis.net/spec/geozarr/1.0/conf/projected` + +|Spectral Bands +|Defines conventions for encoding multi-band imagery, including band identifiers, wavelengths, and metadata attributes. +|`http://www.opengis.net/spec/geozarr/1.0/conf/bands` +|=== + +Each requirements class is independently defined. Implementations may declare conformance with any subset of classes appropriate to their use case. All classes build upon the Core model. + +Associated conformance tests for each class are detailed in Annex A. diff --git a/standard/template/sections/clause_3_references.adoc b/standard/template/sections/clause_3_references.adoc index dfb9212..b5f9d31 100644 --- a/standard/template/sections/clause_3_references.adoc +++ b/standard/template/sections/clause_3_references.adoc @@ -5,49 +5,32 @@ The following normative documents contain provisions that, through reference in [NOTE] ==== -Insert References here. If there are no references, leave this section empty. - -References are to follow the Springer LNCS style, with the exception that optional information may be appended to references: DOIs are added after the date and web resource references may include an access date at the end of the reference in parentheses. See examples from Springer and OGC below. +References follow the Springer LNCS citation style. DOIs and persistent URLs are provided where applicable. ==== -* [[[Smith81,Identification of Common Molecular Subsequences]]], -_Identification of Common Molecular Subsequences_. -Smith, T.F., Waterman, M.S., J. Mol. Biol. 
147, 195–197 (1981) - -* [[[May06,ZIB Structure Prediction Pipeline]]], -_ZIB Structure Prediction Pipeline: Composing a Complex Biological Workflow through Web Services_. -May, P., Ehrlich, H.C., Steinke, T. In: Nagel, W.E., Walter, -W.V., Lehner, W. (eds.) Euro-Par 2006. LNCS, vol. 4128, pp. 1148–1158. Springer, -Heidelberg (2006) - -* [[[Grid,The Grid]]], _The Grid: Blueprint for a New Computing Infrastructure._, -Foster, I., Kesselman, C.. Morgan Kaufmann, San Francisco (1999). - -* [[[Czajkowski01,Grid Information Services for Distributed Resource Sharing]]], -_Grid Information Services for Distributed Resource Sharing._ -Czajkowski, K., Fitzgerald, S., Foster, I., Kesselman, C. In: 10th IEEE International Symposium on High -Performance Distributed Computing, pp. 181–184. IEEE Press, New York (2001) - -* [[[Foster02,The Physiology of the Grid]]], -_The Physiology of the Grid: an Open Grid Services Architecture for Distributed Systems Integration._ -Foster, I., Kesselman, C., Nick, J., Tuecke, S. Technical report, Global Grid Forum (2002) - -* [[[NCBI,NCBI]]], _National Center for Biotechnology Information_, http://www.ncbi.nlm.nih.gov - -* [[[ISO19101-1,ISO 19101-1:2014]]], Geographic information -- Reference model -- Part 1: Fundamentals +* [[[ZarrV2, Zarr Specification v2]]], +Miles, A., et al.: _Zarr Specification Version 2_. Zarr Developers. https://zarr.readthedocs.io/en/stable/spec/v2.html -* [[[ISO19115-1,ISO 19115-1:2014]]], _Geographic information -- Metadata -- Part 1: Fundamentals_ +* [[[ZarrV3, Zarr Specification v3]]], +Zarr Community: _Zarr Specification Version 3_. https://zarr-specs.readthedocs.io/en/latest/v3.0 -* [[[ISO19157,ISO 19157:2013]]], _Geographic information -- Data quality_ +* [[[CDM, Unidata Common Data Model]]], +Unidata: _The Common Data Model_. 
https://docs.unidata.ucar.edu/netcdf-java/5.0/userguide/common_data_model_overview.html -* [[[ISO19139,ISO 19139:2007]]], _Geographic information -- Metadata -- XML schema implementation_ +* [[[NetCDFClassic, NetCDF Classic Format]]], +Rew, R., Davis, G.: _NetCDF: An Interface for Scientific Data Access_. IEEE Computer Graphics and Applications, 10(4), 76–82 (1990). https://doi.org/10.1109/38.56302 -* [[[ISO19115-3,ISO 19115-3]]], _Geographic information -- Metadata -- Part 3: XML schemas_ (2016) +* [[[CFConventions, CF Metadata Conventions]]], +CF Community: _Climate and Forecast (CF) Metadata Conventions, Version 1.10_. https://cfconventions.org/ -* [[[OGC15-097,OGC 15-097]]], _OGC Geospatial User Feedback Standard: Conceptual Model_ (2016) +* [[[GDAL, GDAL – Geospatial Data Abstraction Library]]], +GDAL Developers: _GDAL/OGR Version 3.8 Documentation_. Open Source Geospatial Foundation. https://gdal.org -* [[[OGC12-019,OGC 12-019]]], _OGC City Geography Markup Language (CityGML) Encoding Standard_ (2012) +* [[[OGCTMS, OGC Two Dimensional Tile Matrix Set]]], +Open Geospatial Consortium: _OGC Two Dimensional Tile Matrix Set and Tile Pyramid_ (OGC 17-083r2). https://docs.ogc.org/is/17-083r2/17-083r2.html -* [[[OGC14-005r3,OGC 14-005r3]]], _OGC IndoorGML_ (2014) +* [[[STAC, SpatioTemporal Asset Catalog (STAC) Specification]]], +STAC Community: _STAC Specification v1.0.0_. https://stacspec.org/en/ -* [[[OGC06121r9,OGC 06-121r9]]], _OGC Web Service Common Implementation Specification_, April 7, 2010. http://portal.opengeospatial.org/files/?artifact_id=38867 \ No newline at end of file +* [[[OGCCTPP, OGC Compliance Testing Policies and Procedures]]], +Open Geospatial Consortium: _OGC Compliance Testing Policies and Procedures_, OGC 08-134r10. 
https://portal.ogc.org/files/?artifact_id=55184 diff --git a/standard/template/sections/clause_4_terms_and_definitions.adoc b/standard/template/sections/clause_4_terms_and_definitions.adoc index f6411d3..007f320 100644 --- a/standard/template/sections/clause_4_terms_and_definitions.adoc +++ b/standard/template/sections/clause_4_terms_and_definitions.adoc @@ -1,17 +1,64 @@ -== Terms and definitions +== Terms, definitions and abbreviated terms -This document uses the terms defined in Sub-clause 5.3 of <>, which is based on the ISO/IEC Directives, Part 2, Rules for the structure and drafting of International Standards. In particular, the word "`shall`" (not "`must`") is the verb form used to indicate a requirement to be strictly followed to conform to this standard. +=== Terms and definitions -For the purposes of this document, the following additional terms and definitions apply. +==== array -=== example term +A multidimensional, regularly spaced collection of values (e.g., raster data or gridded measurements), typically indexed by dimensions such as time, latitude, longitude, or spectral band. -term used for exemplary purposes +==== chunk -[.source] -<> +A sub-array representing a partition of a larger array, used to optimise data access and storage. In Zarr, data is stored and accessed as a collection of independently compressed chunks. -NOTE: An example note. +==== coordinate variable -[example] -Here's an example of an example term. +A one-dimensional array whose values define the coordinate system for a dimension of one or more data variables. Typical examples include latitude, longitude, time, or vertical levels. + +==== data variable + +An array containing the primary geospatial or scientific measurements of interest (e.g., temperature, reflectance). Data variables are defined over one or more dimensions and associated with attributes. + +==== dimension + +An index axis along which arrays are organised. 
Dimensions provide a naming and ordering scheme for accessing data in multidimensional arrays (e.g., `time`, `x`, `y`, `band`). + +==== group + +A container for datasets, variables, dimensions, and metadata in Zarr. Groups may be nested to represent a logical hierarchy (e.g., for resolutions or collections). + +==== metadata + +Structured information describing the content, context, and semantics of datasets, variables, and attributes. GeoZarr metadata includes CF attributes, geotransform definitions, and links to STAC metadata where applicable. + +==== multiscale dataset + +A dataset that includes multiple representations of the same data variable at varying spatial resolutions. Each resolution level is associated with a tile matrix from an OGC Tile Matrix Set. + +==== tile matrix set + +A spatial tiling scheme defined by a hierarchy of zoom levels and consistent grid parameters (e.g., scale, CRS). Tile Matrix Sets enable spatial indexing and tiling of gridded data. + +==== transform + +An affine transformation used to convert between grid coordinates and geospatial coordinates, typically defined using the GDAL GeoTransform convention. + +==== unified data model (UDM) + +A conceptual model that defines how to structure geospatial data in Zarr using CDM-based constructs, including support for coordinate referencing, metadata integration, and multiscale representations. 
+ +=== Abbreviated Terms + +API:: Application Programming Interface +CDM:: Common Data Model +CF:: Climate and Forecast Conventions +CRS:: Coordinate Reference System +EPSG:: European Petroleum Survey Group +GDAL:: Geospatial Data Abstraction Library +GeoTIFF:: Georeferenced Tagged Image File Format +JSON:: JavaScript Object Notation +OGC:: Open Geospatial Consortium +STAC:: SpatioTemporal Asset Catalog +UDM:: Unified Data Model +URI:: Uniform Resource Identifier +URL:: Uniform Resource Locator +Zarr:: Zipped Array Storage format diff --git a/standard/template/sections/clause_5_conventions.adoc b/standard/template/sections/clause_5_conventions.adoc index 6a26680..79640e5 100644 --- a/standard/template/sections/clause_5_conventions.adoc +++ b/standard/template/sections/clause_5_conventions.adoc @@ -1,10 +1,34 @@ == Conventions -This sections provides details and examples for any conventions used in the document. Examples of conventions are symbols, abbreviations, use of XML schema, or special notes regarding how to read the document. +This section describes the conventions used throughout this Standard, including identifiers, metadata schemas, and referencing mechanisms relevant to the GeoZarr Unified Data Model. === Identifiers -The normative provisions in this standard are denoted by the URI -`http://www.opengis.net/spec/{standard}/{m.n}` +The normative provisions in this Standard are denoted by the base URI: + +`http://www.opengis.net/spec/geozarr/1.0` + +All requirements, recommendations, permissions, and conformance tests that appear in this document are assigned relative URIs anchored to this base. + +For example: + +`http://www.opengis.net/spec/geozarr/1.0/conf/core` — refers to the Core Requirements Class of the GeoZarr Unified Data Model. + +=== Data Encoding + +This Standard specifies the encoding of geospatial data in the Zarr format. 
Zarr is a chunked, compressed, binary format for n-dimensional arrays, with support for both Version 2 and Version 3 encodings. + +The specification makes extensive use of: + +- `zarr.json` metadata documents (Zarr v3) +- `.zgroup`, `.zattrs`, `.zarray` metadata files (Zarr v2) +- JSON-compatible structures for metadata, attributes, and conformance declarations + +=== Schemas + +Metadata schemas referenced in this Standard are represented using JSON-compatible objects and may be defined formally using JSON Schema. Metadata structures for tile matrix sets, STAC properties, or CF metadata may be embedded inline or referenced externally via URI. + +=== URI Usage + +URIs used in this Standard shall comply with IETF RFC 3986 (Uniform Resource Identifier (URI): Generic Syntax). Reserved characters included in a URI shall be percent-encoded. Dataset identifiers, metadata links, and STAC references should use persistent and canonical forms to support reproducibility and catalogue integration. -All requirements and conformance tests that appear in this document are denoted by partial URIs which are relative to this base. diff --git a/standard/template/sections/clause_6_informative_text.adoc b/standard/template/sections/clause_6_informative_text.adoc index 719cad4..591f4f1 100644 --- a/standard/template/sections/clause_6_informative_text.adoc +++ b/standard/template/sections/clause_6_informative_text.adoc @@ -1,10 +1,30 @@ -[obligation=informative] -== Clauses not containing normative material +[[overview]] +== Overview -Paragraph +The GeoZarr Unified Data Model and Encoding Standard defines a conceptual and implementation framework for representing multidimensional geospatial data using the Zarr format. Developed under the guidance of the OGC GeoZarr Standards Working Group (SWG), the Standard establishes conventions for encoding scientific and Earth observation datasets in a way that promotes scalability, interoperability, and compatibility with cloud-native infrastructure. 
-=== Clauses not containing normative material sub-clause 1 +GeoZarr is built on widely adopted community standards, including the Unidata Common Data Model (CDM) and Climate and Forecast (CF) Conventions. It introduces additional extensions and structural constructs to support multi-resolution tiling, geospatial referencing, and catalogue-enabled metadata integration (e.g., STAC). -Paragraph +This Standard provides both: + +* **Core requirements**, which define minimal compliance to represent array-based datasets using CDM constructs in Zarr, supporting open and permissive adoption across use cases. +* **Modular extension classes**, which define additional capabilities such as time series support, affine geotransform referencing, multi-resolution overviews, and projection coordinates, in line with OGC and community practices. + +These modular components enable GeoZarr to serve a wide range of applications—from basic EO data storage to high-performance, cloud-native visualisation and analytics workflows. + +=== Encodings + +GeoZarr supports encoding in both Zarr Version 2 and Zarr Version 3. Each version defines how arrays, groups, and metadata are stored within a directory-based structure. All metadata is encoded in JSON-compatible formats, ensuring both human readability and machine interoperability. + +Encoding guidelines include: + +* Hierarchical grouping of datasets via Zarr groups. +* Dimension indexing and binding via dimension metadata. +* Attribute-based metadata compliant with CF conventions. +* Multi-resolution overviews aligned with OGC Tile Matrix Sets. +* Optional integration of STAC metadata for discovery and cataloguing. + +JSON is the primary format for metadata, attributes, and structural declarations. Implementations are encouraged to support standardised naming conventions, EPSG code references, and structured metadata to facilitate search, validation, and transformation across platforms. 
+ +GeoZarr does not prescribe a single interface for data access. Instead, it enables **serverless and cloud-native** data access strategies by aligning its model with chunked, parallelisable storage patterns that are optimised for use in object stores and analytical environments. -=== Clauses not containing normative material sub-clause 2 diff --git a/standard/template/sections/clause_7_annex_mapping.adoc.adoc b/standard/template/sections/clause_7_annex_mapping.adoc.adoc deleted file mode 100644 index 8a075cb..0000000 --- a/standard/template/sections/clause_7_annex_mapping.adoc.adoc +++ /dev/null @@ -1,432 +0,0 @@ -== GeoZarr format requirements - -TIP: This is a very preliminary draft. The content is primarily for demonstrating the purpose of the proposed sections. The main focus should be on the table of contents. - -A GeoZarr data format is a regular Zarr hierarchy that represents geospatial coverages such as granules, scenes series, mosaics or any spatio temporal asset (as per STAC specification). - -=== Overview and Definitions - -The OGC Abstract Topic 6 [OGC 07-011] standard defines all geographic object as a feature, with coverage being a special type for any digital geospatial information representing space and time varying phenomena. The ISO 19123 standard provides a formal definition for coverages: a "feature acting as a function that returns values from its range for any direct position within its spatiotemporal domain". - -GeoZarr, like many array-oriented geospatial data formats (e.g., NetCDF, GeoTIFF), primarily supports *Rectified Grid Coverages*. Rectified Grid Coverage is a type of grid coverage that aligns with a coordinate reference system, ensuring that each cell or grid point corresponds precisely to a specific geographic location. 
- -The base terminology in the scope of this specification includes the following terms: - -- A *GeoZarr Instance* (or GeoZarr) is a hierarchy of objects including attributes and arrays describing geospatial information. -- A *GeoZarr Data Variable* is the arrray and attributes that define the values of a n-D coverage (i.e. a Rectified Grid Coverage) -- A *GeoZarr Coordinate Variable* is the array (might be empty) and attributes that define a coordinate dimension of a n-D coverage -- A *GeoZarr Auxiliary Variable* is the array (might be empty) and attributes that define other type of information -- A *GeoZarr Dataset* is a self-describing collection of n-D coverages defined by a set of values, coordinates and attributes (similar to a NetCDF dataset). - -Unlike some popular geospatial data formats, GeoZarr is not limited to 2D rasters and extends to multiple dimensions, including time, altitude, wavelength, and others. Additionally, the order of these dimensions is not fixed, allowing for optimizations in data analysis. - -A Dataset (referenced as an asset in a STAC item) facilitates the discovery and handling of the coverages by clients, such as web maps, and supports advanced capabilities such as pyramiding. However,GeoZarr also flexible and adaptable enough to accommodate other types of information: the specification aim to ensure that a transformation to GeoZarr from a source format can be reverted back to the original format. - -=== Underlying GeoZarr Requirements - -The requirement class GeoZarr Core is mandatory for all GeoZarr instances and must be specified at the root level with the `conformsTo` attribute. - -Some requirement classes are optional and define specific type of assets to facilitate standard interpretation by clients, such as a the requirement class Dataset. These optional requirement classes must be specified at the appropriate level within the hierarchy, using the conformsTo attribute to indicate adherence to the respective requirement class. 
- -TIP: maybe list possible requirements classes and purpose - -==== Requirement Class GeoZarr Core - -The base requirement class is designed to be flexible, facilitating conversion from any data source and support most source formats.. - -[[req_geozarr-core]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/geozarr-core {set:cellbgcolor:#FFFFFF} -|Target type | Zarr Encoder -|Dependency | TBD -|=== - -===== Structure - -A GeoZarr instance must consist of any Zarr tree structure with a root that is always a Zarr group, include the attribute conformsTo set to the GeoZarr requirement class in the root group, and allow any number of hierarchical levels within the groups. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr/structure -| A {set:cellbgcolor:#EEEEEE} | A GeoZarr instance (or GeoZarr) must consist of a Zarr tree structure. -| B {set:cellbgcolor:#EEEEEE} | The GeoZarr root must always be a Zarr group. -| C {set:cellbgcolor:#EEEEEE} | The root group must include the attribute `conformsTo` set to the GeoZarr requirement class. -| D {set:cellbgcolor:#EEEEEE} | The GeoZarr instance may have any number of levels within the hierarchy. -|=== - -===== Variables - -A variable represents an array of values of the same type in a Zarr array. A variable can describe coverage values, their coordinates, or any auxiliary data (i.e., additional information to the coverage). These different types of variables are defined further in the specification. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr/variable -| A {set:cellbgcolor:#EEEEEE} | A variable represents an array of values of the same type and is stored in Zarr arrays. -|=== - - -===== Attributes - -GeoZarr attributes are used to store ancillary data or metadata at any level of the hierarchy. 
This information can pertain to variables, coordinates, spatio-temporal assets, or any other relevant purpose. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr/attributes -| A {set:cellbgcolor:#EEEEEE} | GeoZarr attributes are used to store ancillary data or metadata at any level of the hierarchy. -| B {set:cellbgcolor:#EEEEEE} | GeoZarr attributes must be encoded as Zarr attributes. -|=== - -==== Requirement Class GeoZarr Dataset - -[[req_geozarr-dataset]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/geozarr-dataset {set:cellbgcolor:#FFFFFF} -|Target type | Zarr Encoder -|Dependency | TBD -|=== - -===== Notion of Dataset - -GeoZarr defines flexible conventions to encode content from various source formats, including geospatial arrays (raster), non-geospatial arrays, and hierarchical structures. - -GeoZarr emphasizes Rectified Grid Coverages, which can be easily discovered, interpreted, and displayed on a map. A coverage is spatial data organized in a regular grid, where each cell holds a value representing a specific geographic area. Examples include 2D Rasters, Raster Time Series, and Geo-Datacubes (with dimensions like time, light spectrum, altitude, etc.). - -A GeoZarr Dataset, consists of a collection of coverage defined by data variables, their common coordinates, and some attributes which together form a self describing dataset and represent a geospatial phenomenon in the data hierarchy. GeoZarr defines the structure and necessary metadata for understanding this dataset, such as an index of available variables, the projection used, and the coordinates describing the dimensions of these variables. - -The purpose of a GeoZarr Dataset is to maximize compatibility and facilitate the seamless mapping of diverse source formats into a standardized, easily interpretable structure. 
- -**Figure 1: GeoZarr Dataset Abstract Representation** - -```mermaid -classDiagram - class Dataset { - +attributes - } - class DataVariable { - +values - +attributes - } - class CoordinateVariable { - +coordinates - +attributes - } - class AuxiliaryVariable { - +data - +attributes - } - - Dataset --> "1..*" DataVariable : includes - Dataset --> "1..*" CoordinateVariable : includes - Dataset --> "0..*" AuxiliaryVariable : includes - CoordinateVariable --> DataVariable : coordinates -``` - -===== Dataset Structure - -A GeoZarr may include Dataset Groups which consists in n-D variables observed by a sensor (temperature, humidity, elevation). These variables are defined by geospatial coordinates and optional extra dimensions (time, altitude, etc.). - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/group -| A {set:cellbgcolor:#EEEEEE} | A Dataset must be represented by a Zarr group. -| B {set:cellbgcolor:#EEEEEE} | The Zarr group must include the attribute `conformsTo` set to the Dataset requirement class. -| C {set:cellbgcolor:#EEEEEE} | Coordinates, attributes, and any additional information must be represented in the Zarr group or children Zarr objects (see furhter equirements) -|=== - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | Each coordinate variable must include the Climate and Forecast (CF) standard name in the `standard_name` attribute of the Zarr array. -|=== - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/data-variable-coordinates -| A {set:cellbgcolor:#EEEEEE} | Data Variables (coverages) in a dataset should share a common set of coordinates and coordinate reference system. 
-|=== - - -**Hierarchy of Zarr Elements** - -```mermaid -classDiagram - class ZarrGroup { - +attrs (attributes) - } - class ZarrArray { - +attrs (attributes) - } - - ZarrGroup <|-- Dataset : maps to - ZarrArray <|-- Coordinate : maps to - ZarrArray <|-- DataVariable : maps to - - class Dataset { - } - class Coordinate { - } - class DataVariable { - } - - Dataset --> ZarrGroup - ZarrGroup --> "1..*" ZarrArray : contains - Coordinate --> ZarrArray - DataVariable --> ZarrArray -``` - -Below is a representation of a Zarr structure for an abstract Dataset with a single data variable. - -``` -GeoZarr_Dataset/ -├── .zgroup -├── attrs.json -├── data_variable/ -│ ├── .zarray -│ ├── attrs.json -│ └── data (chunks) -├── latitude/ -│ ├── .zarray -│ ├── attrs.json -│ └── data (chunks) -├── longitude/ -│ ├── .zarray -│ ├── attrs.json -│ └── data (chunks) -└── time/ - ├── .zarray - ├── attrs.json - └── data (chunks) -``` - -INFO: a coordinate is not necessary a list of positions (labelled coordinates) but might be encoded in different ways further defined. - -NOTE: We may require or recommend that a Dataset is restricted to a single data variable or to variable with consistent coordinates (otherwise the group is a mess). We might specify also an attribute for a index of variables. - - -===== Data Variables - -TIP: Defines the requirements for the variables in a dataset (how to specify dimensions and relationship with the coordinates sibling) - -A Data Variable holds the data values of the observed geospatial phenomena. A variable has a name, type,any dimension, attributes and values. - -TBD: can/should a data variable have dimensions which are not coordinates - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/data-variable -| A {set:cellbgcolor:#EEEEEE} | Each data variable (values of a rectified grid coverage) must be stored as a child Zarr array within the dataset group. 
-| B {set:cellbgcolor:#EEEEEE} | The child Zarr array must include the attribute `_ARRAY_DIMENSIONS` which lists the dimension names. -| C {set:cellbgcolor:#EEEEEE} | For each dimension listed in `_ARRAY_DIMENSIONS`, there must be a corresponding coordinate variable in the dataset group. -|=== - -Each data variable must: -- Be stored as a child Zarr array within the dataset group. -- Include the attribute `_ARRAY_DIMENSIONS` listing the dimension names. -- Have a corresponding coordinate variable for each dimension listed in `_ARRAY_DIMENSIONS` within the dataset group. - - -===== Coordinates - -TIP: Defines the requirement for the data coordinates and reference to the requirement classes for the different encoding of data coordinate. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | Each coordinate variable (representing the positions of one dimension of a data variable) must be represented in a child Zarr array within the dataset group. -| B {set:cellbgcolor:#EEEEEE} | The Zarr array variables must be named with the same name as the dimension of the data variable they represent. -|=== - -Each coordinate variable must: -- Be represented in a child Zarr array within the dataset group. -- Be named with the same name as the dimension of the data variable it represents. - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | Each coordinate variable must include the Climate and Forecast (CF) standard name in the `standard_name` attribute of the Zarr array. -|=== - -Each coordinate variable should: -- Include the Climate and Forecast (CF) standard name in the `standard_name` attribute of the Zarr array. 
- - -=== Coordinates - - -==== Coordinate Types - -TIP: Defines what are the requirement in GeoZarr related to latitude, longitude, time, etc. metadata such as does it impose to use CF standard names for qualifying the coordinate (or another convention from GDAL) - -==== Geospatial Coordinate Encodings - -There are multiple types of encoding for coordinates, each serving different purposes and applications in geospatial data processing. Some common examples include: - -* Geospatial Control Points (labeled Coordinates) : each data point or grid cell is explicitly assigned a coordinate value, which can be used to directly map and reference spatial data. -* Affine Transforms (Coordinate Origin and Step): this involves defining a starting point (origin) and a regular interval (step) between points. This method is commonly used in grid-based data where the position of each cell is calculated based on its distance from the origin. - -Proposed encoding: -- 2D array (the nominal encoding applied by xarray) -- origin/offset: -- COARDS : - -===== Requirements Class Geospatial_Control_Points - -Geospatial Control Points (GCPs), also known as Labeled Coordinates, are specific geographic locations with known coordinates. These points serve as reference markers to accurately align and georeference spatial data in mapping and GIS applications, ensuring that the data corresponds correctly to real-world locations. - -[[req_geozarr-coordinate-labelled]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-labelled {set:cellbgcolor:#FFFFFF} -|Target type | Dataset Coordinate -|Dependency | TBD -|=== - - -===== Requirements Class CoordinateOriginOffset - -TIP: It is not supported yet in the model, but this seems relevant to be added. 
- -[[req_geozarr-coordinate-oo]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-oo {set:cellbgcolor:#FFFFFF} -|Target type | Dataset Coordinate -|Dependency | TBD -|=== - -To accurately represent the spatial dimensions of the dataset, each coordinate type origin offset must be defined in a child Zarr array within the dataset. This array must contain the triplet of values: origin, offset, and end, to describe the coordinate's range and intervals. Additionally, the coordinate variable must include a CF standard name in the `standard_name` attribute, specifically for latitude or longitude. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | A coordinate type origin offset should be represented in a child Zarr array of the dataset. -| B {set:cellbgcolor:#EEEEEE} | The coordinate variable must define in the array the triplet of values: origin, offset, end. -| C {set:cellbgcolor:#EEEEEE} | The coordinate variable must provide a standard name (CF) for latitude or longitude in the `standard_name` attribute. -|=== - - To enhance clarity and interoperability, it is recommended that each coordinate variable link to the `grid_mapping` variable, which describes the CRS applicable to this coordinate. - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | The coordinate variable should link to the `grid_mapping` variable defined to describe the CRS that applies to this coordinate. -|=== - -The coordinate variable should: -- Link to the `grid_mapping` variable defined to describe the CRS that applies to this coordinate. 
- - -===== Requirements Class CoordinateVector - -TIP: please add the definition - -[[req_geozarr-coordinate-vector]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-vector {set:cellbgcolor:#FFFFFF} -|Target type | TBD -|Dependency | TBD -|=== - - -==== Coordinates Reference System Encodings - -TIP: any consideration with projections and affine transformations ? - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/data-variable-coordinates -| A {set:cellbgcolor:#EEEEEE} | The coordinate reference system (CRS) must be indicated for each data variable (coverage). -| B {set:cellbgcolor:#EEEEEE} | The CRS should be represented in a child Zarr array of the dataset (auxiliary variable). -| C {set:cellbgcolor:#EEEEEE} | The CRS variable name should be referenced in the data variable (coverage) in the `grid_mapping` attribute. -| D {set:cellbgcolor:#EEEEEE} | The CRS should be described in the attributes of the CRS variable using CF conventions properties. -|=== - -Each data variable (coverage) must: -- Indicate the coordinate reference system used. -- Reference the CRS variable name in the `grid_mapping` attribute. - -The CRS should: -- Be represented in a child Zarr array of the dataset (auxiliary variable). -- Be described in the attributes of the CRS variable using CF conventions properties. - -While it is recommended that all coverages in a dataset share the same set of coordinates and coordinate reference system to ensure consistency and ease of use, explicitly indicating the coordinate reference system for each data variable is necessary to avoid any ambiguity and to support interoperability when integrating data from diverse sources. - -TBD explain the grid_mapping and required properties - - -=== Tiling and Pyramiding - -TIP: equivalent to GeoTiff (https://docs.ogc.org/is/21-026/21-026.html). 
GeoZarr should specify if and how tiling might be applied for three-dimensional and higher-dimensional data (for example, order of dimensions might be critical) - -==== Requirements Class Tiling - -[[req_geozarr-tiling]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/tiling {set:cellbgcolor:#FFFFFF} -|Target type | Dataset -|Dependency | TBD -|=== - - -Tiling is a strategy for optimising chunking in GeoZarr. With tiling, access to a specific area or two-dimensional bounding box is much quicker, as the relevant data is stored closer together in the file, reducing the number of bytes that need to be read compared to the strips approach. - -==== Requirements Class Pyramiding - -Pyramiding is useful when the client wants to quickly render an image of the entire area or a large portion of the area represented in the file. Instead of downloading every pixel, the software can request a smaller, pre-created, lower-resolution version. - -[[req_geozarr-coordinate-pyramiding]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-piramidiing {set:cellbgcolor:#FFFFFF} -|Target type | Dataset -|Dependency | TBD -|=== - - -==== Requirements Class Map Rendering - -TIP: in addition to traditional 2D formats, some conventions might be needed to faciltiate the rendering of time series or N-D arrays on map tools. For example, how the bands / layers of the array are referenced, etc. - - -==== Requirement - -=== Referencing in STAC - -TIP: might be useful to describe or provide extension for referencing GeoZarr assets (e.g. dataset) in STAC Items. - -== Annex B: Mappings with other formats - -TIP: Provides the mappings for information purpose to show how source formats can preserve information from any data source. 
- -To maximize compatibility with various source formats, GeoZarr preserves as much metadata and structure as possible from these formats. - -NOTE: In particular, if relevant information which cannot be encoded in GeoZarr is identified, the specification might be extended. - -=== Mappings with CF - - -=== Mappings with GeoTiff - -To map a GeoTIFF to the GeoZarr structure, we need to carefully translate the data arrays, coordinate variables, and metadata (such as the CRS) into the appropriate GeoZarr elements. - -GeoZarr is structured as a single GeoZarr dataset at the root, encapsulating all necessary components to represent the geospatial data and metadata effectively. - -- GeoTIFF Data Array to GeoZarr Data Variable: In the case of a single band, the data variable represents a 2D raster with latitude and longitude dimensions. If there are multiple bands, they might be mapped to positions within a band dimension, with coordinates providing the wavelength and standard names indicating the units of measure for those coordinates. -- GeoTIFF Coordinates to GeoZarr Coordinate Variables: Latitude and longitude coordinates are extracted and stored as GeoZarr Coordinate Variables. -- GeoTIFF Metadata to GeoZarr Attributes: Metadata from the GeoTIFF (such as CRS and transform) are stored in the attributes of the GeoZarr Data Varaible. The CRS is translated to an auxiliary variable, referenced from the GeoZarr Data Variable in the grid_mapping attribute. -- GeoZarr Dataset Group for Organizing: All the data variables and coordinate variables are organized within a GeoZarr Dataset Group, ensuring a coherent structure. This group is the root of the GeoZarr hierarchy, making it a self-contained and self-describing dataset. 
- - -=== Mappings with GDAL entities - diff --git a/standard/template/sections/clause_7_format_requirements.adoc b/standard/template/sections/clause_7_format_requirements.adoc deleted file mode 100644 index 4ce82f7..0000000 --- a/standard/template/sections/clause_7_format_requirements.adoc +++ /dev/null @@ -1,23 +0,0 @@ -== GeoZarr format requirements - -TIP: This is a very preliminary draft. The content is primarily for demonstrating the purpose of the proposed sections. The main focus should be on the table of contents. - -include::clause_7a_format_overview.adoc[] - -=== Underlying GeoZarr Requirements - -The requirement class GeoZarr Core is mandatory for all GeoZarr instances and must be specified at the root level with the `conformsTo` attribute. - -Some requirement classes are optional and define specific type of assets to facilitate standard interpretation by clients, such as a the requirement class Dataset. These optional requirement classes must be specified at the appropriate level within the hierarchy, using the conformsTo attribute to indicate adherence to the respective requirement class. - -TIP: maybe list possible requirements classes and purpose - -include::clause_7b_format_core.adoc[] - -include::clause_7c_format_coordinates.adoc[] - -include::clause_7d_format_pyramiding.adoc[] - -include::clause_7e_format_dataset_types.adoc[] - -include::clause_7_annex_mapping.adoc.adoc[] \ No newline at end of file diff --git a/standard/template/sections/clause_7_unified_data_model.adoc b/standard/template/sections/clause_7_unified_data_model.adoc new file mode 100644 index 0000000..8af7598 --- /dev/null +++ b/standard/template/sections/clause_7_unified_data_model.adoc @@ -0,0 +1,354 @@ +[obligation=informative] + +== Unified Data Model + +=== Scope and Purpose + +This Standard defines a unified data model (UDM) that provides a conceptual framework for representing geospatial and scientific data in Zarr. 
The purpose of this model is to support standards-based interoperability across Earth observation systems and analytical environments, while preserving compatibility with existing data models and software ecosystems. + +The unified data model incorporates and extends the following established specifications and community standards: + +- **Unidata Common Data Model (CDM)** – Provides the foundational resource structure for scientific datasets, encompassing dimensions, coordinate systems, variables, and associated metadata elements. +- **CF (Climate and Forecast) Conventions** – Defines a widely adopted metadata profile for describing spatiotemporal semantics in CDM-based datasets. +- **Selected constructs from related Standards and practices**, including: + * The **OGC Tile Matrix Set Standard**, which enables multi-resolution representations of gridded data. + * **GDAL geotransform metadata**, used to express affine transformations and interpolation characteristics. + * **SpatioTemporal Asset Catalog (STAC)** metadata elements for resource discovery and cataloguing (Collection and Item constructs). + +The unified model is format-agnostic and describes the abstract structure of resources independently of the physical encoding. It does not redefine the semantics of the CDM or CF conventions, but introduces integration and extension points required to support tiled multiscale data, geospatial referencing, and metadata for discovery. + +This clause specifies the logical composition of the unified model, the external standards it leverages, and the conformance points that facilitate harmonised implementation within the GeoZarr framework. + +=== Foundational Model and Standards Reuse + +The unified data model described in this Standard is derived from established community specifications to maximise interoperability and to enable the reuse of mature tools and practices. 
The model is grounded in the Unidata Common Data Model (CDM) and the Climate and Forecast (CF) Conventions, which together provide a robust framework for representing scientific and geospatial datasets. + +==== Common Data Model (CDM) + +The CDM defines a generalised schema for representing array-based scientific datasets. The following constructs are reused directly within the unified model: + +- **Dimensions** – Integer-valued, named axes that define the extents of data variables. +- **Coordinate Variables** – Variables that supply coordinate values along dimensions, establishing spatial or temporal context. +- **Data Variables** – Multidimensional arrays representing observed or simulated phenomena, associated with dimensions and coordinate variables. +- **Attributes** – Key-value metadata elements used to describe variables and datasets semantically. +- **Groups** – Optional hierarchical containers enabling logical organisation of resources and metadata. + +The unified data model adopts these CDM components without modification, but excludes CDM user-defined types. Semantic interpretation remains consistent with the original CDM specification. GeoZarr structures are mapped to CDM constructs to ensure compatibility and clarity. + +==== CF Conventions + +The CF Conventions specify standardised metadata attributes and practices to describe spatiotemporal context within CDM-compliant datasets. These conventions support consistent interpretation of: + +- Coordinate systems +- Grid mappings +- Physical units +- Standard variable naming + +The unified data model supports CF-compliant metadata, including attributes such as `standard_name`, `units`, and `grid_mapping`. The unified data model does not prescribe CF compliance but enables it through permissive design. Partial adoption of CF attributes is supported, and non-compliant datasets may selectively adopt CF metadata as needed. 
+ +==== Standards-Based Extensions + +To support additional capabilities, the model defines optional extension points referencing external OGC and community standards: + +- **OGC Tile Matrix Set** – Facilitates the definition of multiscale grid hierarchies for raster overviews. +- **GDAL Geotransform** – Enables geospatial referencing through affine transformations and optional interpolation specifications. +- **STAC Metadata (Collection and Item)** – Provides linkage to SpatioTemporal Asset Catalogs for resource discovery and indexing. + +These extensions are integrated in a modular fashion and do not alter the core semantics of the CDM or CF structures. Implementations may selectively adopt these extensions based on their application requirements. + +=== Model Extension Points + +The unified data model specifies a series of optional, standards-aligned extension points to support functionality beyond the base CDM and CF constructs. These extensions enhance applicability to Earth observation and spatial analysis use cases without imposing additional mandatory requirements. + +Each extension is defined as an independent module. Implementation of any given extension does not necessitate support for others. + +==== Multi-Resolution Overviews (OGC Tile Matrix Set) + +Support for multi-resolution imagery is enabled via integration with the OGC Tile Matrix Set Standard: + +- Tile matrix sets define spatial tiling schemes with consistent resolutions and coordinate reference systems across zoom levels. +- Overviews may be represented as separate Zarr arrays or groups, each aligned to a specific tile matrix level. +- Metadata includes identifiers for tile matrices, spatial resolution, and spatial alignment. + +This approach aligns with the OGC API – Tiles and enables efficient access to large gridded datasets. 
+ +==== GeoTransform Metadata (GDAL Interpolation and Affine Transform) + +Geospatial referencing can be further refined through the inclusion of metadata consistent with GDAL conventions: + +- Affine transformation is specified via the `GeoTransform` attribute or equivalent structures. +- Interpolation methods may be declared to indicate sampling behaviour or sub-pixel alignment strategies. + +This extension augments CF grid mappings by providing precise control over grid placement and coordinate transformations. + +==== STAC Collection and Item Integration + +To enable discovery of resources within the hierarchical structure of the data model, this Standard supports the inclusion of STAC metadata elements at appropriate locations within the group hierarchy. + +A STAC extension consists of embedding or referencing STAC Collection and Item metadata within the data model: + +* Each dataset resource MAY reference a corresponding STAC `Collection` or `Item` using an identifier or embedded object. +* STAC properties such as `datetime`, `bbox`, and `eo:bands` MAY be included in the metadata to enable spatial, temporal, and spectral filtering. +* The structure is compatible with external STAC APIs and metadata harvesting systems. + +STAC integration is non-intrusive and modular. It does not impose changes on the internal organisation of datasets and MAY be adopted incrementally by implementations requiring catalogue-based discovery capabilities. + + +==== Modularity and Interoperability + +Each extension point is specified independently. Implementations may advertise support for one or more extensions by declaring conformance to corresponding extension modules. This modularity facilitates incremental adoption, promotes reuse, and enhances interoperability across varied implementation environments. + + +=== Unified Model Structure + +This clause defines the structural organisation of datasets conforming to the unified data model (UDM). 
It consolidates the foundational elements and optional extensions into a coherent architecture suitable for Zarr encoding, while remaining format-agnostic. The model establishes a modular and extensible framework that supports structured representation of multidimensional, geospatially-referenced resources. + +The model represents datasets as abstract compositions of dimensions, coordinate variables, data variables, and associated metadata. This abstraction ensures that applications and services can reason about the content and semantics of a dataset without reliance on storage layout or specific serialisation. + +==== Dataset Structure + +A dataset conforming to the Unified Data Model (UDM) is structured as a hierarchy rooted at a top-level dataset entity. This design enables modularity and facilitates the representation of complex, multi-resolution, or thematically partitioned data collections. + +Each dataset node comprises the following core components, aligned with the Unidata Common Data Model (CDM) and Climate and Forecast (CF) Conventions: + +- **Dimensions** – Named, integer-valued axes defining the extent of data variables. Examples include `time`, `x`, `y`, and `band`. +- **Coordinate Variables** – Arrays that supply coordinate values along dimensions, providing spatial, temporal, or contextual referencing. These may be scalar or higher-dimensional, depending on the referencing scheme. +- **Data Variables** – Multidimensional arrays representing physical measurements or derived products. Defined over one or more dimensions, these variables are associated with coordinate variables and annotated with metadata. +- **Attributes** – Key-value pairs attached to variables or dataset components. Attributes convey semantic information such as units, standard names, and geospatial metadata. + +The hierarchy is implemented through **groups**, which function as containers for variables, dimensions, and metadata. 
+Groups may define local context while inheriting attributes from parent nodes. This supports the logical subdivision of datasets by theme, resolution, or processing stage, and enhances the clarity and reusability of complex geospatial structures. + +The diagram below represents the structural layer of the unified data model, derived from the Unidata Common Data Model, which serves as the foundational framework for supporting all overlying model layers. + +//image::udm-core.png[] + +//ifdef::never-shown[] +//Note: Hide until plantuml is supported +.Structural layer of the unified data model (CDM object model) +[plantuml, cdm_model, svg, opts="debug"] +.... +@startuml CDM_DAL_Object_Model + +class Dataset { + + String location + + open() + + close() +} + +class Group { + + String name + + List subgroups + + List variables + + List dimensions + + List attributes +} + +class Dimension { + + String name + + int length + + boolean isUnlimited + + boolean isShared +} + +class Variable { + + String name + + DataType dataType + + List shape + + List attributes + + read() +} + +class DataType { + + String name + <<enumeration>> +} + +class Attribute { + + String name + + String type + + List values +} + +Dataset --> Group : rootGroup +Group --> Group : contains > +Group --> Variable : contains > +Group --> Dimension : defines > +Group --> Attribute : has > +Variable --> Dimension : uses > +Variable --> DataType : has > +Variable --> Attribute : has > +@enduml +.... +//endif::never-shown[] + +Note that, conceptually, any node within this hierarchy may be treated as a self-contained dataset. + +==== Coordinate Referencing + +Coordinate systems are defined using: + +- **CF Conventions** – Including attributes such as `standard_name`, `units`, `axis`, and `grid_mapping` to express spatiotemporal semantics and coordinate system properties. 
+- **Affine Transformation Extensions** – Optional support for georeferencing via affine transforms and interpolation metadata (e.g., as defined in GDAL practices), providing enhanced flexibility for irregular grids and grid-aligned imagery. + +The model accommodates both standard CF-compatible definitions and extended referencing mechanisms to support use cases that span scientific analysis and geospatial mapping. + +==== Metadata Integration + +Metadata may be declared at various levels within the model structure: + +- **Global Metadata** – Attributes describing the dataset as a whole, including elements such as `title`, `summary`, and `license`. +- **Variable Metadata** – Attributes associated with individual data or coordinate variables, conveying descriptive or semantic information. +- **Extension Metadata** – Structured metadata linked to optional model extensions (e.g., multiscale tiling, catalogue references, geotransform properties). + +All metadata follows harmonised naming and semantics consistent with the CDM and CF standards, enabling machine and human interpretability while supporting metadata exchange across diverse systems. + +==== Overviews + +The *Overviews* construct defines a formal, interoperable abstraction for multiscale gridded data. It ensures structural consistency across zoom levels and provides a semantic model for integration with tiled representations such as GeoTIFF overviews, OGC API – Tiles, and STAC Tiled Assets. + +===== Purpose + +The *Overviews* construct provides a general mechanism for associating a single logical data variable with a collection of resampled representations, referred to as *zoom levels*. Each zoom level holds a reduced-resolution version of the original variable, with progressively decreasing spatial resolution from the base (highest detail) to the coarsest level. 
+ +Overviews enable: + +- Fast access to summary representations for visualisation +- Progressive transmission and downsampling +- Multi-resolution analytics and adaptive processing + +===== Conceptual Structure + +An *Overviews* construct is defined as a *hierarchical set of multiscale representations* of one or more data variables. It comprises the following components: + +[horizontal] +*Base Variable*:: The original, highest-resolution variable to which the overview hierarchy is anchored. It is defined using the standard `DataVariable` structure in the model. +*Overview Levels*:: A sequence of variables representing the same logical quantity as the base variable, but sampled at coarser spatial resolutions. +*Zoom Level Identifier*:: A unique identifier associated with each level, ordered from finest (e.g. `"0"`) to coarsest resolution (e.g. `"N"`). +*Tile Grid Definition*:: A mapping that associates each zoom level with a spatial tiling layout, defined in alignment with a `TileMatrixSet`. +*Spatial Alignment*:: Each overview variable MUST be spatially aligned with the base variable using a consistent coordinate reference system and compatible axis orientation. +*Resampling Method*:: A declared method indicating the technique used to derive coarser levels from the base variable (e.g. `nearest`, `average`, `cubic`). + +===== Model Components + +The *Overviews* construct is represented in the unified data model using the following logical elements: + +[cols="1,3"] +|=== +|Element |Definition + +|`OverviewSet` | A logical grouping of variables at multiple zoom levels associated with a single base variable. + +|`OverviewLevel` | A single resampled variable at a specific resolution, identified by a zoom level string. + +|`TileMatrixSetRef` | A reference to the tile grid specification applied across all overview levels. May refer to a well-known identifier, a URI, or an inline object. + +|`TileMatrixLimits` | (Optional) Constraints on the tile coverage per zoom level. 
+ +|`resampling_method` | A string indicating the uniform method used to downsample data across all levels. +|=== + +All overview levels MUST preserve: + +- The data variable’s semantic identity (`standard_name`, `units`, etc.) +- The coordinate reference system +- The axis order and dimension semantics + +Only the resolution and extent (through tiling and shape) may differ across levels. + +===== Relationship to Tile Matrix Set + +The *Overviews* construct is structurally aligned with the OGC Tile Matrix Set concept. Each zoom level is mapped to a `TileMatrix`, and the chunk layout for the corresponding data variable SHALL match the tile grid’s `tileWidth` and `tileHeight`. + +The `OverviewSet` MAY constrain tile matrix limits using `TileMatrixSetLimits`, which restrict tile indices to actual data coverage, consistent with the spatial extent of the overview variable. + +===== Usage Context + +The *Overviews* construct is applicable to any gridded data variable with at least two spatial dimensions. It is primarily designed for: + +- Raster imagery (e.g. reflectance, temperature) +- Data cubes with spatial slices (e.g. time-series of spatial grids) +- Multi-band products with consistent spatial structure across levels + +The structure may be extended for N-dimensional datasets in future revisions, provided that two spatial axes can be unambiguously identified. + +=== Conformance and Extensibility + +The GeoZarr data model is designed with an open conformance approach to support a wide range of use cases and implementation contexts. Its core model is permissive, allowing partial implementations, while optional extensions and compliance profiles can define stricter requirements for interoperability. + +==== Core Conformance + +- Datasets conforming to the core model must: +* Represent data using CDM-compatible constructs (dimensions, variables, attributes). +* Follow attribute conventions where applicable. 
+* Be parsable as valid Zarr with structured metadata following this specification. + +- CF compliance is not mandatory but is recommended for semantic interoperability. + +==== Extension Conformance + +- Implementations may optionally support one or more extension modules: +* Multi-resolution overviews (Tile Matrix Set) +* GeoTransform metadata (GDAL) +* STAC metadata integration + +- Each extension defines its own requirement class with validation rules and expected metadata structures. + +- Tools may advertise which extensions they support and validate datasets accordingly. + +==== Conformance Classes + +- Conformance Classes may be defined to specify required components and extensions for specific application domains (e.g., visualisation clients, EO archives, catalogue indexing). +- Conformance Classes enable selective validation without constraining the general model. + +==== Extensibility Principles + +- All extensions must preserve compatibility with the core model and avoid redefining existing CDM or CF semantics. +- New extensions should be documented with clear identifiers, schemas, and conformance criteria. +- The model encourages interoperability by allowing tools to interpret unknown extensions without failure. + +This extensibility framework supports both minimum-viable use and high-fidelity metadata integration, enabling incremental adoption across the geospatial and scientific data communities. + +=== Interoperability Considerations + +Interoperability is a core objective of the GeoZarr unified data model. The model is designed to bridge diverse Earth observation and scientific data ecosystems by enabling structural and semantic compatibility with established formats and standards, while providing a forward-looking foundation for scalable, cloud-native workflows. + +This section outlines the principles and mechanisms supporting interoperability across formats, tools, and communities. 
+ +==== Format Mapping and Alignment + +The data model is explicitly aligned with foundational standards including the Unidata Common Data Model (CDM), the CF Conventions, and established practices in formats such as NetCDF and GeoTIFF. Where applicable, GeoZarr datasets may be derived from or transformed into these formats using consistent mappings. + +- *NetCDF (classic and enhanced models)*: +* GeoZarr shares a common conceptual structure with NetCDF via CDM. +* Variables, dimensions, coordinate systems, and attributes follow directly mappable patterns. +* Metadata expressed in CF conventions in NetCDF can be preserved in GeoZarr without loss of fidelity. + +- *GeoTIFF*: +* Raster-based datasets in GeoZarr can map to GeoTIFF by interpreting spatial referencing (via CF or GeoTransform) and band structures. +* Overviews aligned to OGC Tile Matrix Sets may correspond to TIFF image pyramids. +* Projection metadata and resolution information can be mapped via standard tags. + +These mappings facilitate round-trip transformations and enable toolchains that consume or produce multiple formats without reengineering semantic models. + +==== Semantic Interoperability + +Semantic interoperability is supported through adherence to CF conventions, use of standardised attribute names (e.g., `standard_name`, `units`), and alignment with metadata vocabularies used in other ecosystems (e.g., STAC, EPSG codes, ISO 19115 keywords). + +The model does not prescribe specific vocabularies beyond CF but encourages reuse and recognition of widely accepted descriptors to promote cross-domain understanding. + +==== Metadata and Discovery Integration + +STAC compatibility enables integration with catalogue services for discovery and indexing. Datasets can expose STAC-compliant metadata alongside core metadata, supporting federated search and filtering via STAC APIs. + +This approach enables seamless integration into modern data catalogues and platforms that support EO discovery standards. 
+ +==== Tool and Ecosystem Support + +The unified data model facilitates interoperability with tools and libraries across the following domains: + +- *Scientific computing*: NetCDF-based libraries (e.g., xarray, netCDF4), Zarr-compatible clients. +- *Geospatial processing*: GDAL, rasterio, QGIS (via Zarr driver extensions or translations). +- *Cloud-native infrastructure*: support for parallel access, chunked storage, and hierarchical grouping compatible with object storage. + +Tooling support is expected to grow via standard-conformant implementations, easing adoption across domains and infrastructures. + diff --git a/standard/template/sections/clause_7a_format_overview.adoc b/standard/template/sections/clause_7a_format_overview.adoc deleted file mode 100644 index ee9393c..0000000 --- a/standard/template/sections/clause_7a_format_overview.adoc +++ /dev/null @@ -1,19 +0,0 @@ -A GeoZarr data format is a regular Zarr hierarchy that represents geospatial coverages such as granules, scenes series, mosaics or any spatio temporal asset (as per STAC specification). - -=== Overview and Definitions - -The OGC Abstract Topic 6 [OGC 07-011] standard defines all geographic object as a feature, with coverage being a special type for any digital geospatial information representing space and time varying phenomena. The ISO 19123 standard provides a formal definition for coverages: a "feature acting as a function that returns values from its range for any direct position within its spatiotemporal domain". - -GeoZarr, like many array-oriented geospatial data formats (e.g., NetCDF, GeoTIFF), primarily supports *Rectified Grid Coverages*. Rectified Grid Coverage is a type of grid coverage that aligns with a coordinate reference system, ensuring that each cell or grid point corresponds precisely to a specific geographic location. 
- -The base terminology in the scope of this specification includes the following terms: - -- A *GeoZarr Instance* (or GeoZarr) is a hierarchy of objects including attributes and arrays describing geospatial information. -- A *GeoZarr Data Variable* is the arrray and attributes that define the values of a n-D coverage (i.e. a Rectified Grid Coverage) -- A *GeoZarr Coordinate Variable* is the array (might be empty) and attributes that define a coordinate dimension of a n-D coverage -- A *GeoZarr Auxiliary Variable* is the array (might be empty) and attributes that define other type of information -- A *GeoZarr Dataset* is a self-describing collection of n-D coverages defined by a set of values, coordinates and attributes (similar to a NetCDF dataset). - -Unlike some popular geospatial data formats, GeoZarr is not limited to 2D rasters and extends to multiple dimensions, including time, altitude, wavelength, and others. Additionally, the order of these dimensions is not fixed, allowing for optimizations in data analysis. - -A Dataset (referenced as an asset in a STAC item) facilitates the discovery and handling of the coverages by clients, such as web maps, and supports advanced capabilities such as pyramiding. However,GeoZarr also flexible and adaptable enough to accommodate other types of information: the specification aim to ensure that a transformation to GeoZarr from a source format can be reverted back to the original format. diff --git a/standard/template/sections/clause_7b_format_core.adoc b/standard/template/sections/clause_7b_format_core.adoc deleted file mode 100644 index 09b35d5..0000000 --- a/standard/template/sections/clause_7b_format_core.adoc +++ /dev/null @@ -1,223 +0,0 @@ -==== Requirement Class GeoZarr Core - -The base requirement class is designed to be flexible, facilitating conversion from any data source and support most source formats.. 
- -[[req_geozarr-core]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/geozarr-core {set:cellbgcolor:#FFFFFF} -|Target type | Zarr Encoder -|Dependency | TBD -|=== - -===== Structure - -A GeoZarr instance must consist of any Zarr tree structure with a root that is always a Zarr group, include the attribute conformsTo set to the GeoZarr requirement class in the root group, and allow any number of hierarchical levels within the groups. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr/structure -| A {set:cellbgcolor:#EEEEEE} | A GeoZarr instance (or GeoZarr) must consist of a Zarr tree structure. -| B {set:cellbgcolor:#EEEEEE} | The GeoZarr root must always be a Zarr group. -| C {set:cellbgcolor:#EEEEEE} | The root group must include the attribute `conformsTo` set to the GeoZarr requirement class. -| D {set:cellbgcolor:#EEEEEE} | The GeoZarr instance may have any number of levels within the hierarchy. -|=== - -===== Variables - -A variable represents an array of values of the same type in a Zarr array. A variable can describe coverage values, their coordinates, or any auxiliary data (i.e., additional information to the coverage). These different types of variables are defined further in the specification. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr/variable -| A {set:cellbgcolor:#EEEEEE} | A variable represents an array of values of the same type and is stored in Zarr arrays. -|=== - - -===== Attributes - -GeoZarr attributes are used to store ancillary data or metadata at any level of the hierarchy. This information can pertain to variables, coordinates, spatio-temporal assets, or any other relevant purpose. 
- -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr/attributes -| A {set:cellbgcolor:#EEEEEE} | GeoZarr attributes are used to store ancillary data or metadata at any level of the hierarchy. -| B {set:cellbgcolor:#EEEEEE} | GeoZarr attributes must be encoded as Zarr attributes. -|=== - -==== Requirement Class GeoZarr Dataset - -[[req_geozarr-dataset]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/geozarr-dataset {set:cellbgcolor:#FFFFFF} -|Target type | Zarr Encoder -|Dependency | TBD -|=== - -===== Notion of Dataset - -GeoZarr defines flexible conventions to encode content from various source formats, including geospatial arrays (raster), non-geospatial arrays, and hierarchical structures. - -GeoZarr emphasizes Rectified Grid Coverages, which can be easily discovered, interpreted, and displayed on a map. A coverage is spatial data organized in a regular grid, where each cell holds a value representing a specific geographic area. Examples include 2D Rasters, Raster Time Series, and Geo-Datacubes (with dimensions like time, light spectrum, altitude, etc.). - -A GeoZarr Dataset, consists of a collection of coverage defined by data variables, their common coordinates, and some attributes which together form a self describing dataset and represent a geospatial phenomenon in the data hierarchy. GeoZarr defines the structure and necessary metadata for understanding this dataset, such as an index of available variables, the projection used, and the coordinates describing the dimensions of these variables. - -The purpose of a GeoZarr Dataset is to maximize compatibility and facilitate the seamless mapping of diverse source formats into a standardized, easily interpretable structure. 
- -**Figure 1: GeoZarr Dataset Abstract Representation** - -```mermaid -classDiagram - class Dataset { - +attributes - } - class DataVariable { - +values - +attributes - } - class CoordinateVariable { - +coordinates - +attributes - } - class AuxiliaryVariable { - +data - +attributes - } - - Dataset --> "1..*" DataVariable : includes - Dataset --> "1..*" CoordinateVariable : includes - Dataset --> "0..*" AuxiliaryVariable : includes - CoordinateVariable --> DataVariable : coordinates -``` - -===== Dataset Structure - -A GeoZarr may include Dataset Groups which consists in n-D variables observed by a sensor (temperature, humidity, elevation). These variables are defined by geospatial coordinates and optional extra dimensions (time, altitude, etc.). - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/group -| A {set:cellbgcolor:#EEEEEE} | A Dataset must be represented by a Zarr group. -| B {set:cellbgcolor:#EEEEEE} | The Zarr group must include the attribute `conformsTo` set to the Dataset requirement class. -| C {set:cellbgcolor:#EEEEEE} | Coordinates, attributes, and any additional information must be represented in the Zarr group or children Zarr objects (see furhter equirements) -|=== - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | Each coordinate variable must include the Climate and Forecast (CF) standard name in the `standard_name` attribute of the Zarr array. -|=== - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/data-variable-coordinates -| A {set:cellbgcolor:#EEEEEE} | Data Variables (coverages) in a dataset should share a common set of coordinates and coordinate reference system. 
-|=== - - -**Hierarchy of Zarr Elements** - -```mermaid -classDiagram - class ZarrGroup { - +attrs (attributes) - } - class ZarrArray { - +attrs (attributes) - } - - ZarrGroup <|-- Dataset : maps to - ZarrArray <|-- Coordinate : maps to - ZarrArray <|-- DataVariable : maps to - - class Dataset { - } - class Coordinate { - } - class DataVariable { - } - - Dataset --> ZarrGroup - ZarrGroup --> "1..*" ZarrArray : contains - Coordinate --> ZarrArray - DataVariable --> ZarrArray -``` - -Below is a representation of a Zarr structure for an abstract Dataset with a single data variable. - -``` -GeoZarr_Dataset/ -├── .zgroup -├── attrs.json -├── data_variable/ -│ ├── .zarray -│ ├── attrs.json -│ └── data (chunks) -├── latitude/ -│ ├── .zarray -│ ├── attrs.json -│ └── data (chunks) -├── longitude/ -│ ├── .zarray -│ ├── attrs.json -│ └── data (chunks) -└── time/ - ├── .zarray - ├── attrs.json - └── data (chunks) -``` - -INFO: a coordinate is not necessary a list of positions (labelled coordinates) but might be encoded in different ways further defined. - -NOTE: We may require or recommend that a Dataset is restricted to a single data variable or to variable with consistent coordinates (otherwise the group is a mess). We might specify also an attribute for a index of variables. - - -===== Data Variables - -TIP: Defines the requirements for the variables in a dataset (how to specify dimensions and relationship with the coordinates sibling) - -A Data Variable holds the data values of the observed geospatial phenomena. A variable has a name, type,any dimension, attributes and values. - -TBD: can/should a data variable have dimensions which are not coordinates - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/data-variable -| A {set:cellbgcolor:#EEEEEE} | Each data variable (values of a rectified grid coverage) must be stored as a child Zarr array within the dataset group. 
-| B {set:cellbgcolor:#EEEEEE} | The child Zarr array must include the attribute `_ARRAY_DIMENSIONS` which lists the dimension names. -| C {set:cellbgcolor:#EEEEEE} | For each dimension listed in `_ARRAY_DIMENSIONS`, there must be a corresponding coordinate variable in the dataset group. -|=== - -Each data variable must: -- Be stored as a child Zarr array within the dataset group. -- Include the attribute `_ARRAY_DIMENSIONS` listing the dimension names. -- Have a corresponding coordinate variable for each dimension listed in `_ARRAY_DIMENSIONS` within the dataset group. - - -===== Coordinates - -TIP: Defines the requirement for the data coordinates and reference to the requirement classes for the different encoding of data coordinate. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | Each coordinate variable (representing the positions of one dimension of a data variable) must be represented in a child Zarr array within the dataset group. -| B {set:cellbgcolor:#EEEEEE} | The Zarr array variables must be named with the same name as the dimension of the data variable they represent. -|=== - -Each coordinate variable must: -- Be represented in a child Zarr array within the dataset group. -- Be named with the same name as the dimension of the data variable it represents. - -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | Each coordinate variable must include the Climate and Forecast (CF) standard name in the `standard_name` attribute of the Zarr array. -|=== - -Each coordinate variable should: -- Include the Climate and Forecast (CF) standard name in the `standard_name` attribute of the Zarr array. 
- diff --git a/standard/template/sections/clause_7c_format_coordinates.adoc b/standard/template/sections/clause_7c_format_coordinates.adoc deleted file mode 100644 index 945e8ad..0000000 --- a/standard/template/sections/clause_7c_format_coordinates.adoc +++ /dev/null @@ -1,111 +0,0 @@ - - - -=== Coordinates - - -==== Coordinate Types - -TIP: Defines what are the requirement in GeoZarr related to latitude, longitude, time, etc. metadata such as does it impose to use CF standard names for qualifying the coordinate (or another convention from GDAL) - -==== Geospatial Coordinate Encodings - -There are multiple types of encoding for coordinates, each serving different purposes and applications in geospatial data processing. Some common examples include: - -* Geospatial Control Points (labeled Coordinates) : each data point or grid cell is explicitly assigned a coordinate value, which can be used to directly map and reference spatial data. -* Affine Transforms (Coordinate Origin and Step): this involves defining a starting point (origin) and a regular interval (step) between points. This method is commonly used in grid-based data where the position of each cell is calculated based on its distance from the origin. - -Proposed encoding: - -- 2D array (the nominal encoding applied by xarray) -- origin/offset: -- COARDS : - -===== Requirements Class Geospatial_Control_Points - -Geospatial Control Points (GCPs), also known as Labeled Coordinates, are specific geographic locations with known coordinates. These points serve as reference markers to accurately align and georeference spatial data in mapping and GIS applications, ensuring that the data corresponds correctly to real-world locations. 
- -[[req_geozarr-coordinate-labelled]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-labelled {set:cellbgcolor:#FFFFFF} -|Target type | Dataset Coordinate -|Dependency | TBD -|=== - - -===== Requirements Class CoordinateOriginOffset - -TIP: It is not supported yet in the model, but this seems relevant to be added. - -[[req_geozarr-coordinate-oo]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-oo {set:cellbgcolor:#FFFFFF} -|Target type | Dataset Coordinate -|Dependency | TBD -|=== - -To accurately represent the spatial dimensions of the dataset, each coordinate type origin offset must be defined in a child Zarr array within the dataset. This array must contain the triplet of values: origin, offset, and end, to describe the coordinate's range and intervals. Additionally, the coordinate variable must include a CF standard name in the `standard_name` attribute, specifically for latitude or longitude. - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | A coordinate type origin offset should be represented in a child Zarr array of the dataset. -| B {set:cellbgcolor:#EEEEEE} | The coordinate variable must define in the array the triplet of values: origin, offset, end. -| C {set:cellbgcolor:#EEEEEE} | The coordinate variable must provide a standard name (CF) for latitude or longitude in the `standard_name` attribute. -|=== - - To enhance clarity and interoperability, it is recommended that each coordinate variable link to the `grid_mapping` variable, which describes the CRS applicable to this coordinate. 
- -[width="90%",cols="2,6"] -|=== -|*Recommendation {counter:rec-id}* {set:cellbgcolor:#CACCCE}|/rec/geozarr-dataset/coordinate-variable -| A {set:cellbgcolor:#EEEEEE} | The coordinate variable should link to the `grid_mapping` variable defined to describe the CRS that applies to this coordinate. -|=== - -The coordinate variable should: -- Link to the `grid_mapping` variable defined to describe the CRS that applies to this coordinate. - - -===== Requirements Class CoordinateVector - -TIP: please add the definition - -[[req_geozarr-coordinate-vector]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-vector {set:cellbgcolor:#FFFFFF} -|Target type | TBD -|Dependency | TBD -|=== - - -==== Coordinates Reference System Encodings - -TIP: any consideration with projections and affine transformations ? - -[width="90%",cols="2,6"] -|=== -|*Requirement {counter:req-id}* {set:cellbgcolor:#CACCCE}|/req/geozarr-dataset/data-variable-coordinates -| A {set:cellbgcolor:#EEEEEE} | The coordinate reference system (CRS) must be indicated for each data variable (coverage). -| B {set:cellbgcolor:#EEEEEE} | The CRS should be represented in a child Zarr array of the dataset (auxiliary variable). -| C {set:cellbgcolor:#EEEEEE} | The CRS variable name should be referenced in the data variable (coverage) in the `grid_mapping` attribute. -| D {set:cellbgcolor:#EEEEEE} | The CRS should be described in the attributes of the CRS variable using CF conventions properties. -|=== - -Each data variable (coverage) must: -- Indicate the coordinate reference system used. -- Reference the CRS variable name in the `grid_mapping` attribute. - -The CRS should: -- Be represented in a child Zarr array of the dataset (auxiliary variable). -- Be described in the attributes of the CRS variable using CF conventions properties. 
- -While it is recommended that all coverages in a dataset share the same set of coordinates and coordinate reference system to ensure consistency and ease of use, explicitly indicating the coordinate reference system for each data variable is necessary to avoid any ambiguity and to support interoperability when integrating data from diverse sources. - -TBD explain the grid_mapping and required properties - diff --git a/standard/template/sections/clause_7d_format_pyramiding.adoc b/standard/template/sections/clause_7d_format_pyramiding.adoc deleted file mode 100644 index 861cf7c..0000000 --- a/standard/template/sections/clause_7d_format_pyramiding.adoc +++ /dev/null @@ -1,199 +0,0 @@ - -=== Tiling and Pyramiding - -TIP: equivalent to GeoTiff (https://docs.ogc.org/is/21-026/21-026.html). GeoZarr should specify if and how tiling might be applied for three-dimensional and higher-dimensional data (for example, order of dimensions might be critical) - -==== Requirements Class Tiling - -[[req_geozarr-tiling]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/tiling {set:cellbgcolor:#FFFFFF} -|Target type | Dataset -|Dependency | TBD -|=== - - -A GeoZarr Dataset variable might include multiscales for a set of DataArray variables. Also known as "overviews", multiscales provide resampled copies of the original data at a coarser resolution. Multiscales of the original data thus always hold less detail. Common use cases for multiscales are fast rendering for visualization purposes and analyzing data at multiple resolutions. - -Tiling is a strategy for optimising chunking in GeoZarr. With tiling, access to a specific area or two-dimensional bounding box is much quicker, as the relevant data is stored closer together in the file, reducing the number of bytes that need to be read compared to the strips approach. 
- -==== Requirements Class Pyramiding - -Pyramiding is useful when the client wants to quickly render an image of the entire area or a large portion of the area represented in the file. Instead of downloading every pixel, the software can request a smaller, pre-created, lower-resolution version. - -[[req_geozarr-coordinate-pyramiding]] -[cols="1,4",width="90%"] -|=== -2+|*Requirements Class* {set:cellbgcolor:#CACCCE} -2+|http://www.opengis.net/spec/GeoZarr/1.0/req/coordinate-piramidiing {set:cellbgcolor:#FFFFFF} -|Target type | Dataset -|Dependency | TBD -|=== - - -==== Requirements Class Map Rendering - -TIP: in addition to traditional 2D formats, some conventions might be needed to faciltiate the rendering of time series or N-D arrays on map tools. For example, how the bands / layers of the array are referenced, etc. - -==== Draft Text - -===== Multiscales Encoding - - Multiscales MUST be encoded in children groups. Data at all scales MUST use the same coordinate reference system and must follow ONE common zoom level strategy. The zoom level strategy is modelled in close alignment to the [OGC Two Dimensional Tile Matrix Set](https://docs.ogc.org/is/17-083r4/17-083r4.html) version 2 and the [Tiled Asset STAC extension](https://github.com/stac-extensions/tiled-assets). Each zoom level is described by a Matrix defining the number, layout, origin and pixel size of included tiles. These tiles MUST correspond to the chunk layout along the two spatial dimensions listed in `_ARRAY_DIMENSIONS` of a given group. - -* Multiscale group name is the zoom level identifier (e.g. '0'). -* Multiscale group contains all DataArrays generated for this specific zoom level. -* Multiscale chunking is RECOMMENDED to be 256 pixels or 512 pixels for the two spatial dimensions listed in `_ARRAY_DIMENSIONS`. 
- -===== Multiscales Metadata - -If implemented, each DataArray MUST define the 'multiscales' metadata attribute which includes the following fields: -* `tile_matrix_set` -* `tile_matrix_set_limits` (optional) -* `resampling_method` - - -====== Tile Matrix Set -Tile Matrix Set can be: -* the name of a well know tile matrix set. Well known Tile Matrix Sets are listed [here](https://schemas.opengis.net/tms/2.0/json/examples/tilematrixset/). -* the URI of a JSON document describing the Tile Matrix Set following the OGC standard. -* a JSON object describing the Tile Matrix Set following the OGC standard (CamelCase!). - -Within the Tile Matrix Set -* the Tile Matrix identifier for each zoom level MUST be the relative path to the Zarr group which holds the DataArray variable -* zoom levels MUST be provided from lowest to highest resolutions -* the `supportedCRS` attribute of the Tile Matrix Set MUST match the crs information defined under **grid_mapping**. -* the tile layout for each Matrix MUST correspond to the chunk layout along the two spatial dimensions listed in `_ARRAY_DIMENSIONS` of the corresponding group. - - -====== Tile Matrix Set Limits -Tile Matrix Sets may describe a larger spatial extent and more resolutions than used in the given dataset. -In that case, users MAY specify [Tile Matrix Set Limits](https://docs.ogc.org/is/17-083r4/17-083r4.html#toc21) as described in the OGC standard to define the minimum and a maximum limits of the indices for each TileMatrix that contains actual data. However, the notation for tile matrix set does not the JSON encoding as described in the OGC standard but follows the STAC Tile Asset encoding for better readability. - -If used, Tile Matrix Set Limits -* MUST list all included zoom levels -* MAY list the min and max rows and columns for each zoom level. If omitted, it is assumed that the entire spatial extent is covered (resulting in higher chunk count of the DataArray). 
- -====== Resampling Method -Resampling Method specifies which resampling method is used for generating multiscales. It MUST be one of the following string values. Resampling method MUST be the same across all zoom levels: -* nearest -* bilinear -* cubic -* cubic_spline -* lanczos -* average -* mode -* gauss -* max -* min -* med -* q1 -* q3 -* sum -* rms - -===== Multiscale examples -=====# Using Well Known Name reference - -```diff -(mandatory items in red, optional items in green) -+{ -+ "multiscales": -- { -- "tile_matrix_set": "WebMercatorQuad", -- "resampling_method": "nearest", -- } -+} -``` -=====# Using a URI - -```diff -(mandatory items in red, optional items in green) -+{ -+ "multiscales": -- { -- "tile_matrix_set": "https://schemas.opengis.net/tms/2.0/json/examples/tilematrixset/WebMercatorQuad.json", -- "resampling_method": "nearest", -- } -+} -``` - -====== Using a JSON object - -```diff -(mandatory items in red, optional items in green) -+{ -+ "multiscales": -- { -- "tile_matrix_set": { -- "id": "WebMercatorQuad", -- "title": "Google Maps Compatible for the World", -- "uri": "http://www.opengis.net/def/tilematrixset/OGC/1.0/WebMercatorQuad", -- "crs": "http://www.opengis.net/def/crs/EPSG/0/3857", -- "orderedAxes": [ -- "X", -- "Y" -- ], -- "wellKnownScaleSet": "http://www.opengis.net/def/wkss/OGC/1.0/GoogleMapsCompatible", -- "tileMatrices": [ -- { -- "id": "0", -- "scaleDenominator": 559082264.028717, -- "cellSize": 156543.033928041, -- "pointOfOrigin": [ -- -20037508.3427892, -- 20037508.3427892 -- ], -- "tileWidth": 256, -- "tileHeight": 256, -- "matrixWidth": 1, -- "matrixHeight": 1 -- }, -- { -- "id": "1", -- "scaleDenominator": 279541132.014358, -- "cellSize": 78271.5169640204, -- "pointOfOrigin": [ -- -20037508.3427892, -- 20037508.3427892 -- ], -- "tileWidth": 256, -- "tileHeight": 256, -- "matrixWidth": 2, -- "matrixHeight": 2 -- }, -- } -- "resampling_method": "nearest", -- } -+} -``` -=====# Setting limits - -```diff -(mandatory items in 
red, optional items in green) -+{ -+ "multiscales": -- { -- "tile_matrix_set": "WebMercatorQuad", -+ "tile_matrix_limits: { -- "0": {}, -- "1": { -+ "min_tile_col": 0, -+ "max_tile_col": 0, -+ "min_tile_row": 0, -+ "max_tile_row": 0 -- }, -- "2": { -+ "min_tile_col": 1, -+ "max_tile_col": 1, -+ "min_tile_row": 2, -+ "max_tile_row": 2 -- } -- }, -- "resampling_method": "nearest", -- } -+} -``` - diff --git a/standard/template/sections/clause_7e_format_dataset_types.adoc b/standard/template/sections/clause_7e_format_dataset_types.adoc deleted file mode 100644 index 4026493..0000000 --- a/standard/template/sections/clause_7e_format_dataset_types.adoc +++ /dev/null @@ -1,45 +0,0 @@ -== Supported Dataset Types - -TIP: To be done - - -This section outlines the specific dataset types supported within this specification, along with additional requirements for each type. Each dataset type has requirements related to data format, metadata, and any unique processing needs. - -=== 1. 2D Raster RGB Data - -- **Description**: Two-dimensional raster images with RGB channels, primarily for visualisation. -- **Data Format**: Supported formats include `GeoTIFF` and `PNG`. -- **Resolution Requirements**: Minimum resolution of 10m per pixel. -- **Metadata Requirements**: -- RGB Channel Mapping. -- Spatial reference and bounding box. -- **Additional Processing**: -- Multiscale overview generation to support fast rendering at various zoom levels. - -=== 2. 2D Multispectral Data - -- **Description**: Multiband data that includes spectral information beyond RGB, useful for environmental and remote sensing applications. -- **Data Format**: Supported formats include `NetCDF` and `GeoTIFF` with multiple bands. -- **Band Information**: -- Supported bands, such as Blue, Green, Red, NIR. -- Wavelength range or specifications per band. -- **Metadata Requirements**: -- Spectral resolution and sensor-specific information. -- Spatial reference, bounding box, and temporal extent if time-indexed. 
-- **Additional Processing**: -- Generation of multiscale overviews. -- Band normalization or calibration to standardize across datasets. - -=== 3. 3D Time Series - -- **Description**: Multidimensional datasets incorporating spatial (X, Y) and temporal (Z) dimensions for tracking changes over time. -- **Data Format**: Supported formats include `Zarr` and `NetCDF`. -- **Temporal Resolution**: Required intervals, such as daily, weekly, or monthly. -- **Metadata Requirements**: -- Temporal indexing, ideally in ISO 8601 format. -- Spatial reference and bounding box. -- Data provenance, if applicable. -- **Additional Processing**: -- Support for multiscale pyramids to enable fast access and visualization over time. -- Aggregation or summarization of data for efficient handling. - diff --git a/standard/template/sections/clause_8_conformance.adoc b/standard/template/sections/clause_8_conformance.adoc new file mode 100644 index 0000000..602a9ad --- /dev/null +++ b/standard/template/sections/clause_8_conformance.adoc @@ -0,0 +1,32 @@ +[obligation=informative] + +== GeoZarr Conformance Classes + +Datasets can include many different types of data, including rasters and combinations of additional dimensions—such as time, height, or wavelength—and can use either a projected or geographic coordinate system. + +This Standard identifies conformance classes that offer clear, testable building blocks as a standardised approach for representing different data types when converting to the GeoZarr Unified Data Model (e.g. for encoding RGB bands from a GeoTIFF source). + +TIP: This is a very preliminary draft. The content is primarily for demonstrating the purpose of the proposed sections. 
+ + +//include::clause_8_conformance_raster.adoc[] + +//include::clause_8_conformance_projected_raster.adoc[] + +//include::clause_8_conformance_geographic_raster.adoc[] + +//include::clause_8_conformance_multiband.adoc[] + +//include::clause_8_conformance_spectral.adoc[] + +//include::clause_8_conformance_temporal.adoc[] + +//include::clause_8_conformance_vertical.adoc[] + +//include::clause_8_conformance_sar.adoc[] + +//include::clause_8_conformance_dem.adoc[] + +//include::clause_8_conformance_stac.adoc[] + + diff --git a/standard/template/sections/clause_8_media_types.adoc b/standard/template/sections/clause_8_media_types.adoc deleted file mode 100644 index 69dec9a..0000000 --- a/standard/template/sections/clause_8_media_types.adoc +++ /dev/null @@ -1,3 +0,0 @@ -== Media Types for any data encoding(s) - -A section describing the MIME-types to be used is mandatory for any standard involving data encodings. If no suitable MIME type exists in http://www.iana.org/assignments/media-types/index.html then this section may be used to define a new MIME type for registration with IANA. diff --git a/standard/template/sections/clause_9_zarr_encoding.adoc b/standard/template/sections/clause_9_zarr_encoding.adoc new file mode 100644 index 0000000..d62edec --- /dev/null +++ b/standard/template/sections/clause_9_zarr_encoding.adoc @@ -0,0 +1,16 @@ +== Unified Data Model Encoding for Zarr + +This clause defines the encoding of the unified data model into the Zarr format. The encoding supports both Zarr Version 2 and Zarr Version 3. + +TIP: This is a very preliminary draft. The content is primarily for demonstrating the purpose of the proposed sections. 
+ + +include::clause_9_zarr_encoding_core.adoc[] + +include::clause_9_zarr_encoding_overviews.adoc[] + +//include::clause_9_zarr_encoding_geotransform.adoc[] + +//include::clause_9_zarr_encoding_stac.adoc[] + + diff --git a/standard/template/sections/clause_9_zarr_encoding_core.adoc b/standard/template/sections/clause_9_zarr_encoding_core.adoc new file mode 100644 index 0000000..a2d6a2e --- /dev/null +++ b/standard/template/sections/clause_9_zarr_encoding_core.adoc @@ -0,0 +1,160 @@ + +=== Hierarchical Structure + +A dataset conforming to the unified data model is represented as a hierarchical structure of groups, variables (arrays), dimensions, and metadata. The dataset is rooted in a *top-level group*, which may contain: + +- Arrays representing coordinate or data variables +- Child groups for modular organisation, including logical sub-collections or resolution levels +- Metadata attributes at group and array levels + +Each group adheres to a consistent structure, allowing recursive composition. This reflects the CDM's use of *groups* and is supported by both Zarr v2 and v3 with differing implementations. + + +[cols="1,2,2"] +|=== +|Model Element |Zarr v2 Encoding |Zarr v3 Encoding + +|Root Dataset | Directory with `.zgroup` and `.zattrs` | Directory with `zarr.json`, with `node_type: group` + +|Child Group | Subdirectory with `.zgroup` and `.zattrs` | Subdirectory with `zarr.json`, with `node_type: group` + +|Array | Subdirectory with `.zarray` and `.zattrs` | Subdirectory with `zarr.json`, with `node_type: array` + +|Attributes | `.zattrs` file | `attributes` field in `zarr.json` +|=== + +Zarr v3 requires `zarr_format: 3` and stores all metadata (including user-defined attributes) in the `zarr.json` document. Each node includes a `node_type` field: either `"group"` or `"array"`. + +=== Dimensions + +Dimensions define the axes along which variables are indexed. 
+ +- In Zarr v2, dimensions are inferred from array shape and declared in `_ARRAY_DIMENSIONS` within `.zattrs`. +- In Zarr v3, dimensions are stored using the `dimension_names` field in `zarr.json`. + +Example for a 2D array with dimension names `["lat", "lon"]`: + +[source,json] +---- +{ + "zarr_format": 3, + "node_type": "array", + "shape": [180, 360], + "dimension_names": ["lat", "lon"], + ... +} +---- + +=== Coordinate Variables + +Coordinate variables (excluding GeoTransform Coordinates) define the geospatial or temporal context of data. They are represented as named arrays with metadata attributes. + +Coordinate variables are represented as named 1D arrays aligned with corresponding dimensions. + +[cols="1,2,2"] +|=== +|Feature |Zarr v2 |Zarr v3 + +|Storage | Zarr array with `.zarray`, `.zattrs` | Zarr array with `zarr.json` + +|Dimension Binding | `_ARRAY_DIMENSIONS` in `.zattrs` | `dimension_names` in `zarr.json` + +|CF Metadata | `standard_name`, `units`, `axis` in `.zattrs` | Under `attributes` in `zarr.json` +|=== + +Example `zarr.json` for a coordinate array: +[source,json] +---- +{ + "zarr_format": 3, + "node_type": "array", + "shape": [180], + "dimension_names": ["lat"], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [180] + } + }, + "attributes": { + "standard_name": "latitude", + "units": "degrees_north", + "axis": "Y" + } +} +---- + + +=== Data Variables + +Data variables represent measured or derived quantities. They are stored as multidimensional arrays with metadata attributes. + +[cols="1,2,2"] +|=== +|Feature |Zarr v2 |Zarr v3 + +|Storage | Multidimensional array with `.zarray` and `.zattrs` | Multidimensional array with `zarr.json`; v3 supports additional chunk storage formats + +|Dimension Association | `_ARRAY_DIMENSIONS` attribute in `.zattrs` | `dimension_names` field in `zarr.json` + +|CF Metadata | `standard_name`, `units`, `long_name`, `_FillValue`, etc. 
| Same attributes, under `attributes` in `zarr.json`; v3 may support typed attributes +|=== + +Example (Zarr v2 `.zattrs`): +[source,json] +---- +{ + "_ARRAY_DIMENSIONS": ["time", "lat", "lon"], + "standard_name": "air_temperature", + "units": "K", + "long_name": "Surface air temperature", + "_FillValue": -9999.0 +} +---- + +=== Global Metadata + +Metadata associated with the dataset as a whole is stored at the root group level. + + +[cols="1,2,2"] +|=== +|Field |Zarr v2 |Zarr v3 + +|Location | `.zattrs` file of the root group | `attributes` field in root `zarr.json` + +|Group Identification | `.zgroup` file | `node_type: group` in `zarr.json` + +|CF Conformance | `Conventions` attribute (e.g., `CF-1.10`) | Same, under `attributes` +|=== + +Example Zarr v3 root `zarr.json`: +[source,json] +---- +{ + "zarr_format": 3, + "node_type": "group", + "attributes": { + "title": "Example Dataset", + "summary": "Multidimensional Earth Observation data", + "institution": "Example Space Agency", + "Conventions": "CF-1.10" + } +} +---- + + +=== Variables Metadata + +All metadata attributes (for groups, coordinate variables and data variables) are recommended to conform to CF naming and typing conventions. Supported attributes include: + +- `standard_name`, `units`, `axis`, `grid_mapping` (CF) +- `_FillValue`, `scale_factor`, `add_offset` +- `long_name`, `missing_value` + +In all cases: + +- Attribute names are case-sensitive and encoded as UTF-8 strings +- Values shall conform to JSON-compatible types (string, number, boolean, array) + diff --git a/standard/template/sections/clause_9_zarr_encoding_overviews.adoc b/standard/template/sections/clause_9_zarr_encoding_overviews.adoc new file mode 100644 index 0000000..b20092e --- /dev/null +++ b/standard/template/sections/clause_9_zarr_encoding_overviews.adoc @@ -0,0 +1,101 @@ + +=== Encoding of Multiscale Overviews in Zarr + +This clause specifies how multiscale tiling (also known as overviews or pyramids) is encoded in Zarr-based datasets conforming to the unified data model. 
The encoding supports both Zarr Version 2 and Version 3 and is aligned with the OGC Two Dimensional Tile Matrix Set Standard. + +Multiscale datasets are composed of a set of Zarr groups representing multiple zoom levels. Each level stores coarser-resolution resampled versions of the original data variables. + +==== Hierarchical Layout + +Each zoom level SHALL be represented as a Zarr group, identified by the Tile Matrix identifier (e.g., `"0"`, `"1"`, `"2"`). These groups SHALL be organised hierarchically under a common multiscale root group. Each zoom-level group SHALL contain the complete set of variables (Zarr arrays) corresponding to that resolution. + +[cols="1,2,2"] +|=== +|Structure |Zarr v2 |Zarr v3 + +|Zoom level groups | Subdirectories with `.zgroup` and `.zattrs` | Subdirectories with `zarr.json`, `node_type: group` + +|Variables at each level | Zarr arrays (`.zarray`, `.zattrs`) in each group | Zarr arrays (`zarr.json`, `node_type: array`) in each group + +|Global metadata | `multiscales` defined in parent `.zattrs` | `multiscales` defined in parent group `zarr.json` under `attributes` +|=== + +Each multiscale group MUST define chunking (tiling) along the spatial dimensions (`X`, `Y`, or `lon`, `lat`). Recommended chunk sizes are 256×256 or 512×512. + +==== Metadata Encoding + +Multiscale metadata SHALL be defined using a `multiscales` attribute located in the parent group of the zoom levels. 
This attribute SHALL be a JSON object with the following members: + +- `tile_matrix_set` – Identifier, URI, or inline JSON object compliant with OGC TileMatrixSet v2 +- `resampling_method` – One of the standard string values (e.g., `"nearest"`, `"average"`) +- `tile_matrix_set_limits` – (optional) Zoom-level limits following the STAC Tiled Asset style + +===== Zarr v2 Encoding Example (`.zattrs`) +[source,json] +---- +{ + "multiscales": { + "tile_matrix_set": "WebMercatorQuad", + "resampling_method": "nearest" + } +} +---- + +===== Zarr v3 Encoding Example (`zarr.json`) +[source,json] +---- +{ + "zarr_format": 3, + "node_type": "group", + "attributes": { + "multiscales": { + "tile_matrix_set": "WebMercatorQuad", + "resampling_method": "nearest" + } + } +} +---- + +==== Tile Matrix Set Representation + +The `tile_matrix_set` member MAY take one of the following forms: + +- A string referring to a well-known identifier (e.g., `"WebMercatorQuad"`) +- A URI pointing to a JSON document describing the tile matrix set +- An inline JSON object (camelCase, OGC TMS 2.0 compatible) + +Zoom level identifiers in the tile matrix set MUST match the names of the child groups. The coordinate reference system declared in the `crs` member of the tile matrix set MUST match the one declared in the corresponding `grid_mapping` of the data variables. + +==== Chunk Layout Alignment + +At each zoom level, chunking SHALL match the tile layout defined by the TileMatrix: + +- Chunks MUST be aligned with the tile grid (1:1 mapping between chunks and tiles) +- Chunk sizes MUST match the `tileWidth` and `tileHeight` declared in the TileMatrix +- Spatial dimensions MUST be clearly identified using `dimension_names` (v3) or `_ARRAY_DIMENSIONS` (v2) + +==== Tile Matrix Set Limits + +The `tile_matrix_set_limits` object MAY define the extent of actual data coverage for each zoom level. This follows the style of the STAC tiled-assets extension rather than the full OGC JSON encoding. 
+ +Example: +[source,json] +---- +"tile_matrix_set_limits": { + "1": { + "min_tile_col": 0, + "max_tile_col": 1, + "min_tile_row": 0, + "max_tile_row": 1 + } +} +---- + +==== Resampling Method + +The `resampling_method` MUST indicate the method used for downsampling across zoom levels. The value MUST be one of: + +`nearest`, `average`, `bilinear`, `cubic`, `cubic_spline`, `lanczos`, `mode`, `max`, `min`, `med`, `sum`, `q1`, `q3`, `rms`, `gauss` + +The same method MUST apply across all levels. +