diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index d62ca31a7..cdff5f8bd 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -23,7 +23,7 @@ on: - ready_for_review - reopened paths: - - docs/src/rest.yaml + - docs/src/spec.yaml - java/** - .github/workflows/java.yml diff --git a/Makefile b/Makefile index 50e0cf08e..49f83b4ce 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ .PHONY: lint lint: - uv run openapi-spec-validator --errors all docs/src/rest.yaml + uv run openapi-spec-validator --errors all docs/src/spec.yaml .PHONY: clean-rust clean-rust: diff --git a/docs/Makefile b/docs/Makefile index fc1e8a0b3..74d7c4443 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -12,7 +12,7 @@ # Java model docs source and destination JAVA_DOCS_SRC := ../java/lance-namespace-apache-client/docs -MODELS_DEST := src/client/operations/models +MODELS_DEST := src/namespace/operations/models # API files to exclude (Java-specific, not data models) API_FILES := DataApi.md IndexApi.md MetadataApi.md NamespaceApi.md TableApi.md TagApi.md TransactionApi.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 23db8ee76..936bf27f6 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,5 +1,5 @@ -site_name: Lance Namespace -site_description: open specification on top of the storage-based Lance data format to standardize access to a collection of Lance tables +site_name: Lance Catalog & Namespace +site_description: Open specification for managing collections of Lance tables through catalog specs (Directory and REST) and a unified Namespace SDK site_url: https://lance.org/format/namespace/ docs_dir: src @@ -68,4 +68,3 @@ extra: extra_javascript: - https://unpkg.com/mermaid@10/dist/mermaid.min.js - diff --git a/docs/src/.pages b/docs/src/.pages index 91de9969c..b26b7883a 100644 --- a/docs/src/.pages +++ b/docs/src/.pages @@ -1,7 +1,4 @@ nav: - index.md - - Client Spec: client - - Directory Namespace: dir - - REST Namespace: rest - - Catalog 
Integrations: integrations - - Partitioning Spec: partitioning-spec.md + - Catalog Specs: catalog + - Namespace Client Spec: namespace diff --git a/docs/src/catalog/.pages b/docs/src/catalog/.pages new file mode 100644 index 000000000..68a6d8222 --- /dev/null +++ b/docs/src/catalog/.pages @@ -0,0 +1,5 @@ +title: Catalog Specs +nav: + - Overview: index.md + - Directory Catalog: dir + - REST Catalog: rest diff --git a/docs/src/dir/catalog-spec.md b/docs/src/catalog/dir/index.md similarity index 83% rename from docs/src/dir/catalog-spec.md rename to docs/src/catalog/dir/index.md index 15887409a..37c9a5693 100644 --- a/docs/src/dir/catalog-spec.md +++ b/docs/src/catalog/dir/index.md @@ -1,7 +1,10 @@ -# Lance Directory Namespace Catalog Spec +# Directory Catalog Format Specification -**Lance directory namespace** is a catalog that stores tables in a directory structure -on any local or remote storage system. It has gone through 2 major spec versions so far: +The **Lance Directory Catalog** is a storage-native catalog format that stores tables in a directory structure on any local or remote storage system. It requires no external metadata service — only a filesystem or object store. + +Machine learning workloads frequently operate on datasets stored in object storage and favor minimal operational dependencies, even in production environments. However, existing lakehouse formats typically require an external catalog service, while storage-only approaches lack the transactional guarantees required for reliable production use. The Directory Catalog addresses this gap by providing a catalog built directly on top of the Lance table format. + +The Directory Catalog has gone through 2 major spec versions: - **V1 (Directory Listing)**: A lightweight, simple 1-level namespace that discovers tables by scanning the directory. 
- **V2 (Manifest)**: A more advanced implementation backed by a manifest table (a Lance table) that supports nested namespaces and better performance at scale. @@ -13,11 +16,11 @@ This mode is ideal for getting started quickly with Lance tables. ### Directory Layout -A directory namespace maps to a directory on storage, called the **namespace directory**. -A Lance table corresponds to a subdirectory in the namespace directory that has the format `.lance`, +A directory catalog maps to a directory on storage, called the **catalog directory**. +A Lance table corresponds to a subdirectory in the catalog directory that has the format `.lance`, called a **table directory**. -Consider the following example namespace directory layout: +Consider the following example catalog directory layout: ``` . @@ -38,7 +41,7 @@ Consider the following example namespace directory layout: └── .lance-reserved # Marker: table4 is reserved but not created ``` -This describes a Lance directory namespace with the namespace directory at `/my/dir1/`. +This describes a Lance Directory Catalog with the catalog directory at `/my/dir1/`. It contains active tables `table1` and `table2` at table directories `/my/dir1/table1.lance` and `/my/dir1/table2.lance`. Table `table3` exists on storage but is deregistered (excluded from table listings). @@ -46,7 +49,7 @@ Table `table4` is reserved but not yet created with data. ### Table Existence -In V1, a table exists in a Lance directory namespace if a table directory of the specific name exists +In V1, a table exists in a Lance Directory Catalog if a table directory of the specific name exists and the table is not marked as deregistered. In object store terms, this means the prefix `.lance/` has at least one file in it and the file `.lance/.lance-deregistered` does not exist. @@ -65,9 +68,9 @@ is created inside the table directory. 
This causes the table to be excluded from table listings and to return "not found" for `DescribeTable` and `TableExists` operations, while preserving the table data for potential re-registration. -## V2: Manifest +## V2: Manifest -V2 uses a special `__manifest` table (a Lance table) stored in the namespace directory to track all tables +V2 uses a special `__manifest` table (a Lance table) stored in the catalog directory to track all tables and namespaces. This provides several advantages over V1: - **Nested namespaces**: Support for hierarchical namespace organization @@ -75,6 +78,8 @@ and namespaces. This provides several advantages over V1: - **Metadata support**: All operations can be supported, e.g. namespaces can have associated properties/metadata, tables can be renamed. - **Optimized directory path**: Hash-based directory naming prevents conflicts and maximizes throughput in object storage. +Because the catalog metadata is itself stored as a Lance table, the catalog inherits the transactional semantics, snapshot isolation, and schema evolution guarantees of the table format, while also benefiting from Lance's random-access-friendly file layout and table-level indexing capabilities. + ### Directory Layout ``` @@ -107,13 +112,13 @@ The `__manifest` table has the following schema: **Primary Key**: The `object_id` column is the [unenforced primary key](https://lance.org/format/table/#unenforced-primary-key) for the manifest table. Implementation of this spec must always enforce the primary key uniqueness using features like Lance merge insert with primary key deduplication. -**Schema Extensibility**: The `__manifest` table schema may include additional columns beyond those listed above. Extensions like [partitioned namespaces](../partitioning-spec.md) add columns for efficient filtering. Implementations should preserve unrecognized columns during updates. +**Schema Extensibility**: The `__manifest` table schema may include additional columns beyond those listed above. 
Implementations should preserve unrecognized columns during updates, since extensions may add columns for filtering or other metadata-driven behaviors. ### Root Namespace Properties In V2, the root namespace is implicit and does not have a row in the `__manifest` table. Instead, root namespace properties are stored in the `__manifest` Lance table's metadata map. Properties are stored as key-value pairs where the key is the property name and the value is a UTF-8 encoded byte array. -For example, a partitioned namespace stores its `partition_spec_v1`, `partition_spec_v2`, and `schema` properties in the `__manifest` table's metadata. +For example, implementations may store catalog-level properties in the `__manifest` table's metadata. ### Manifest Table Indexes @@ -145,7 +150,7 @@ In [compatibility mode](#compatibility-mode), root namespace tables use `.lance` directory - In V2, query the manifest table for the table location - - When both V1 and V2 are enabled (the default [Compatibility Mode](catalog-spec.md#compatibility-mode)), + - When both V1 and V2 are enabled (the default [Compatibility Mode](../../catalog/dir/index.md#compatibility-mode)), first check the manifest table, then fall back to checking the `.lance` directory 2. Open the Lance table using the Lance SDK 3. Read the table metadata and return: @@ -208,7 +208,7 @@ If a specific version is requested and does not exist, return error code `11` (T ### DeregisterTable -This operation deregisters a table from the namespace while preserving its data on storage. The table files remain at their storage location and can be re-registered later using RegisterTable. +This operation deregisters a table from the catalog while preserving its data on storage. The table files remain at their storage location and can be re-registered later using RegisterTable. In **V1**: @@ -231,7 +231,7 @@ In **V2**: 3. Keep the table files at the storage location 4. 
Return the table location and properties for reference -When **both V1 and V2 are enabled** (the default [Compatibility Mode](catalog-spec.md#compatibility-mode)), +When **both V1 and V2 are enabled** (the default [Compatibility Mode](../../catalog/dir/index.md#compatibility-mode)), first check the manifest table, then fall back to checking the `.lance` directory. If found in manifest, follow V2 behavior; otherwise follow V1 behavior. @@ -260,8 +260,8 @@ In **V2**: 3. Delete the table directory and all its contents from storage (failure here does not affect the success of the drop since the table is no longer reachable) -When **both V1 and V2 are enabled** (the default [Compatibility Mode](catalog-spec.md#compatibility-mode)), -first check the manifest table, then fall back to checking the `.lance` directory. +When **both V1 and V2 are enabled** (the default [Compatibility Mode](../../catalog/dir/index.md#compatibility-mode)), +first check the manifest table, then fall back to checking the `.lance` directory. If found in manifest, follow V2 behavior; otherwise follow V1 behavior. **Error Handling:** @@ -287,7 +287,7 @@ When **table version management is not enabled**: 5. Delete the staging manifest file 6. Return the created version info including the final manifest path -When **table version management is enabled** (V2 with `table_version_management=true` in `__manifest` metadata), the directory namespace acts as an external manifest store. The commit process follows these steps: +When **table version management is enabled** (V2 with `table_version_management=true` in `__manifest` metadata), the directory catalog acts as an external manifest store. The commit process follows these steps: 1. **Stage manifest in object storage**: The caller writes the new manifest to a staging path (e.g., `{table_location}/_versions/{version}.manifest-{uuid}`). This staged manifest is not yet visible to readers. 2. 
**Atomically commit to manifest table**: Merge-insert a new row into the `__manifest` table with: diff --git a/docs/src/rest/impl-spec.md b/docs/src/namespace/supported-catalogs/lance-rest.md similarity index 92% rename from docs/src/rest/impl-spec.md rename to docs/src/namespace/supported-catalogs/lance-rest.md index 650853da7..dba4eb6c8 100644 --- a/docs/src/rest/impl-spec.md +++ b/docs/src/namespace/supported-catalogs/lance-rest.md @@ -1,14 +1,14 @@ -# Lance REST Namespace Implementation Spec +# REST Catalog Implementation Spec -This document describes how the Lance REST Namespace implements the Lance Namespace client spec. +This document describes how the Lance REST Catalog implements the Lance Namespace Client operations. ## Background -The Lance REST Namespace is a catalog that provides access to Lance tables via a REST API. For details on the API design, endpoints, and data models, see the [REST Namespace Catalog Spec](catalog-spec.md). +The Lance REST Catalog provides access to Lance tables via a REST API. For details on the API design, endpoints, and data models, see the [REST Catalog API Specification](../../catalog/rest/index.md). -## Namespace Implementation Configuration Properties +## Implementation Configuration Properties -The Lance REST namespace implementation accepts the following configuration properties: +The Lance REST Catalog implementation accepts the following configuration properties: The **uri** property is required and specifies the URI endpoint for the REST API, for example `https://api.example.com/lance`. @@ -40,7 +40,7 @@ The **table location** is managed by the REST server and returned in the Describ ## Lance Table Identification -In a REST Namespace, the server is responsible for managing Lance tables. The client identifies tables by their string identifier and delegates all table operations to the server. +In a REST Catalog, the server is responsible for managing Lance tables. 
The client identifies tables by their string identifier and delegates all table operations to the server. The server implementation must ensure that: @@ -347,7 +347,7 @@ If the specified version does not exist, return HTTP `404 Not Found` with error ### DeregisterTable -Deregisters a table from the namespace while preserving its data on storage. The table metadata is removed from the namespace catalog but the table files remain at their storage location. +Deregisters a table from the catalog while preserving its data on storage. The table metadata is removed from the catalog but the table files remain at their storage location. **HTTP Request:** @@ -368,7 +368,7 @@ The implementation: 2. Extract the parent namespace from the identifier 3. Validate the parent namespace exists 4. Look up the table in the server's storage -5. Remove the table registration from the namespace catalog +5. Remove the table registration from the catalog 6. Return the table location and properties for reference **Response:** @@ -390,7 +390,7 @@ If the table does not exist, return HTTP `404 Not Found` with error code `4` (Ta ## Additional Operations -The REST namespace supports all operations defined in the [Lance Namespace client spec](../client/operations/index.md). Each operation follows the same HTTP request/response pattern as the basic operations above. +The REST Catalog supports all operations defined in the [Lance Namespace Client spec](../operations/index.md). Each operation follows the same HTTP request/response pattern as the basic operations above. ### DropTable @@ -416,7 +416,7 @@ The implementation: 3. Validate the parent namespace exists 4. Look up the table in the server's storage 5. Delete the table data from storage -6. Remove the table registration from the namespace +6. 
Remove the table registration from the catalog **Response:** @@ -943,7 +943,7 @@ Content-Type: application/json All error responses follow the JSON error response model based on [RFC-7807](https://datatracker.ietf.org/doc/html/rfc7807). -The response body contains an [ErrorResponse](../client/operations/models/ErrorResponse.md) with a `code` field containing the Lance Namespace error code. See [Error Handling](../client/operations/errors.md) for the complete list of error codes. +The response body contains an [ErrorResponse](../operations/models/ErrorResponse.md) with a `code` field containing the Lance Namespace error code. See [Error Handling](../operations/errors.md) for the complete list of error codes. **Example error response:** @@ -960,7 +960,7 @@ The `detail` field contains detailed error information such as stack traces for ## Error Code to HTTP Status Mapping -REST namespace implementations must map Lance error codes to HTTP status codes as follows: +REST Catalog implementations must map Lance error codes to HTTP status codes as follows: - Error code `0` (Unsupported) maps to HTTP `406 Not Acceptable` - Error codes `1`, `4`, `6`, `8`, `10`, `11`, `12` (not found errors) map to HTTP `404 Not Found` diff --git a/docs/src/integrations/template.md b/docs/src/namespace/supported-catalogs/template.md similarity index 96% rename from docs/src/integrations/template.md rename to docs/src/namespace/supported-catalogs/template.md index ca843396e..2cdea3d16 100644 --- a/docs/src/integrations/template.md +++ b/docs/src/namespace/supported-catalogs/template.md @@ -1,7 +1,7 @@ -# Lance Namespace Implementation Spec Template +# Lance Catalog Implementation Spec Template -This template defines the standard structure for Lance Namespace implementation specs. -Each implementation spec describes how a specific catalog system integrates with the Lance Namespace client spec. +This template defines the standard structure for Lance Catalog implementation specs. 
+Each implementation spec describes how a specific catalog system integrates with the Lance Namespace Client. ## Required Sections diff --git a/docs/src/overview.png b/docs/src/overview.png index d4a20b21f..5c7733011 100644 Binary files a/docs/src/overview.png and b/docs/src/overview.png differ diff --git a/docs/src/partitioning-spec.md b/docs/src/partitioning-spec.md index 9f8471fe3..a0f328192 100644 --- a/docs/src/partitioning-spec.md +++ b/docs/src/partitioning-spec.md @@ -10,11 +10,11 @@ Most of the time, queries like vector search are only against a specific partiti it would be convenient to query across all business units as a unified dataset. A **Partitioned Namespace** is designed for these use cases. -It is a [Directory Namespace](dir/catalog-spec.md) containing a collection of tables that share a common schema. +It is a [Directory Catalog](catalog/dir/index.md) containing a collection of tables that share a common schema. These tables are physically separated and independent, but logically related through partition fields definition. This document defines the storage format for Partitioned Namespace. -Similar to Lance being a storage-only format, the storage-only [Directory Namespace](dir/catalog-spec.md) spec serves as the foundation for this Partitioned Namespace format. +Similar to Lance being a storage-only format, the storage-only [Directory Catalog](catalog/dir/index.md) spec serves as the foundation for this Partitioned Namespace format. The following example illustrates the logical layout of a partitioned namespace: @@ -48,13 +48,13 @@ Root Namespace (__manifest Lance table) ## Metadata Definition -A directory namespace is identified as a partitioned namespace if the `__manifest` table's -[metadata](dir/catalog-spec.md#root-namespace-properties) contains at least one partition spec version key. 
+A directory catalog is identified as a partitioned namespace if the `__manifest` table's +[metadata](catalog/dir/index.md#root-namespace-properties) contains at least one partition spec version key. The following properties are stored in the `__manifest` table's metadata map: - `partition_spec_v<N>` (String): A JSON string representing a partition spec object for version N. The object contains the spec ID and an array of partition field definitions. See [Partition Spec](#partition-spec) for details. -- `schema` (String): A json string describing the Schema of the entire partitioned namespace, based on the [JsonArrowSchema](client/operations/models/JsonArrowSchema.md) schema in client spec. See [Namespace Schema](#schema) for more details. +- `schema` (String): A json string describing the Schema of the entire partitioned namespace, based on the [JsonArrowSchema](namespace/operations/models/JsonArrowSchema.md) schema in the Namespace Client spec. See [Namespace Schema](#schema) for more details. See [Appendix A: Metadata Example](#appendix-a-metadata-example) for a complete example. @@ -91,7 +91,7 @@ Each element in the `fields` array is a partition field object with the followin | **`source_ids`** | `JSON int array` | `[1]` | Field IDs of the source columns in the schema | | **`transform`** | `JSON object` | `{ "type": "year" }` | Well-known partition transform (see [Partition Transform](#partition-transform)). Exactly one of `transform` or `expression` must be specified. | | **`expression`** | `JSON string` | `"date_part('year', col0)"` | DataFusion SQL expression using `col0`, `col1`, ... as column references. Exactly one of `transform` or `expression` must be specified. 
| -| **`result_type`** | `JSON object` | `{ "type": "int32" }` | The output type of the partition value ([JsonArrowDataType](client/operations/models/JsonArrowDataType.md) format) | +| **`result_type`** | `JSON object` | `{ "type": "int32" }` | The output type of the partition value ([JsonArrowDataType](namespace/operations/models/JsonArrowDataType.md) format) | **Transform vs Expression**: Exactly one of `transform` or `expression` must be specified. When `transform` is specified, the expression is derived from the transform type. Custom partition logic that doesn't fit a well-known transform must use `expression` directly. @@ -400,7 +400,7 @@ In this example: - `v1` partitions by `event_date` using the identity transform with `result_type: date32` - `v2` partitions first by year of `event_date` using the year transform with `result_type: int32`, then by `country` using the identity transform with `result_type: utf8` - The `__manifest` table will have three partition columns: `partition_field_event_date` (date32), `partition_field_event_year` (int32), `partition_field_country` (utf8) -- The schema follows [JsonArrowSchema](client/operations/models/JsonArrowSchema.md) format +- The schema follows [JsonArrowSchema](namespace/operations/models/JsonArrowSchema.md) format ### Appendix B: Physical Layout Example diff --git a/docs/src/rest/.pages b/docs/src/rest/.pages deleted file mode 100644 index 17c6203cb..000000000 --- a/docs/src/rest/.pages +++ /dev/null @@ -1,3 +0,0 @@ -nav: - - Catalog Spec: catalog-spec.md - - Implementation Spec: impl-spec.md diff --git a/docs/src/rest.yaml b/docs/src/spec.yaml similarity index 100% rename from docs/src/rest.yaml rename to docs/src/spec.yaml diff --git a/java/Makefile b/java/Makefile index bff81d2b6..85730a5b1 100644 --- a/java/Makefile +++ b/java/Makefile @@ -19,7 +19,7 @@ clean-async-client: .PHONY: gen-async-client gen-async-client: clean-async-client uv run openapi-generator-cli generate \ - -i ../docs/src/rest.yaml \ + -i 
../docs/src/spec.yaml \ -g java \ -o lance-namespace-async-client \ --ignore-file-override=.async-client-ignore \ @@ -50,7 +50,7 @@ clean-apache-client: .PHONY: gen-apache-client gen-apache-client: clean-apache-client uv run openapi-generator-cli generate \ - -i ../docs/src/rest.yaml \ + -i ../docs/src/spec.yaml \ -g java \ -o lance-namespace-apache-client \ --ignore-file-override=.apache-client-ignore \ @@ -76,7 +76,7 @@ clean-springboot-server: .PHONY: gen-springboot-server gen-springboot-server: clean-springboot-server uv run openapi-generator-cli generate \ - -i ../docs/src/rest.yaml \ + -i ../docs/src/spec.yaml \ -g spring \ -o lance-namespace-springboot-server \ --additional-properties=groupId=org.lance,artifactId=lance-namespace-springboot-server,artifactVersion=$(VERSION),parentGroupId=org.lance,parentArtifactId=lance-namespace-root,parentVersion=$(VERSION),parentRelativePath=pom.xml,library=spring-boot,interfaceOnly=true,useOptional=true,openApiNullable=false,java8=true,apiPackage=org.lance.namespace.server.springboot.api,modelPackage=org.lance.namespace.server.springboot.model,useTags=true,skipDefaultInterface=false,hideGenerationTimestamp=true,licenseName=Apache-2.0,licenseUrl=https://www.apache.org/licenses/LICENSE-2.0.txt,useSpringBoot3=true diff --git a/python/Makefile b/python/Makefile index 4d78bf190..7ab5c1faa 100644 --- a/python/Makefile +++ b/python/Makefile @@ -21,7 +21,7 @@ clean-urllib3-client: .PHONY: gen-urllib3-client gen-urllib3-client: clean-urllib3-client uv run openapi-generator-cli generate \ - -i ../docs/src/rest.yaml \ + -i ../docs/src/spec.yaml \ -g python \ -o lance_namespace_urllib3_client \ --additional-properties=packageName=lance_namespace_urllib3_client,packageVersion=$(VERSION),library=urllib3 diff --git a/rust/Makefile b/rust/Makefile index 8fa39e6e5..248021159 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -19,7 +19,7 @@ clean-reqwest-client: .PHONY: gen-reqwest-client gen-reqwest-client: clean-reqwest-client uv run 
openapi-generator-cli generate \ - -i ../docs/src/rest.yaml \ + -i ../docs/src/spec.yaml \ -g rust \ -o lance-namespace-reqwest-client \ --type-mappings='Binary=Vec,binary=Vec,file=Vec' \