From 0548ca42d4f47baeb1a8d9e60a87932deb5db736 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 20 Sep 2025 19:57:46 +0200 Subject: [PATCH 1/4] Add fanout chunk key encoding --- chunk-key-encodings/fanout/README.md | 65 ++++++++++++++++++++++++++ chunk-key-encodings/fanout/schema.json | 23 +++++++++ 2 files changed, 88 insertions(+) create mode 100644 chunk-key-encodings/fanout/README.md create mode 100644 chunk-key-encodings/fanout/schema.json diff --git a/chunk-key-encodings/fanout/README.md b/chunk-key-encodings/fanout/README.md new file mode 100644 index 0000000..dd30227 --- /dev/null +++ b/chunk-key-encodings/fanout/README.md @@ -0,0 +1,65 @@ +# Fanout chunk key encoding +Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children (i.e., fanout). This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. + +## Chunk key encoding name + +The value of the `name` member in the chunk key encoding object MUST be `fanout`. + +## Configuration parameters + +### `max_children` + +An integer greater than 3 indicating the maximum number of child entries allowed within a single node (e.g., directory). Defaults to 1001 if omitted. + +## Example + +For example, the array metadata below specifies that chunk keys are encoded using the `fanout` strategy with a maximum of 1001 files per directory: + +```json +{ + "chunk_key_encoding": { + "name": "fanout", + "configuration": { + "max_children": 1001 + } + } +} +```` + +## Algorithm +Given chunk coordinates as a tuple of integers and a parameter `max_children`, the chunk key is constructed as follows: + +1. 
For each coordinate `coord` at dimension index `dim` (indexing starts from `0`): + + 1. Create a dimension marker `d{dim}`. + 2. Express `coord` in base `max_children - 1`, producing a sequence of digits (most significant first). + 3. Join the digits with `/` and prepend the dimension marker to form a subpath. For example: + + ``` + d{dim}/{digit0}/{digit1}/…/{digitN} + ``` + +2. Concatenate all dimension subpaths (in order from the lowest to highest dimension) using `/` as a separator. + +3. Append `"/c"` at the end to indicate the chunk file itself. + +> **Note:** Because nodes may also contain reserved entries such as the dimensional markers `dN` and the final chunk marker `c`, the effective numeric base used to subdivide coordinates is `max_children - 1`. + +> **Note:** This method ensures that no directory contains more than `max_children` child entries. Existing chunks never need to be moved or reorganized to maintain this property when new chunks are added. + +### Example +With `max_children = 101` (effective base = 100): + +| Coordinates | Chunk key | +| ------------------ | ---------------------------- | +| `()` | `c` | +| `(123,)` | `d0/1/23/c` | +| `(1234, 5, 67890)` | `d0/12/34/d1/5/d2/6/78/90/c` | + +## Change log + +No changes yet. 
+ +## Current maintainers + +* Remco Leijenaar (GitHub: [RFLeijenaar](https://github.com/RFLeijenaar)) \ No newline at end of file diff --git a/chunk-key-encodings/fanout/schema.json b/chunk-key-encodings/fanout/schema.json new file mode 100644 index 0000000..68b02d5 --- /dev/null +++ b/chunk-key-encodings/fanout/schema.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "name": { + "type": "string", + "const": "fanout" + }, + "configuration": { + "type": "object", + "properties": { + "max_children": { + "type": "integer", + "minimum": 3, + "default": 1001 + } + }, + "additionalProperties": false + } + }, + "required": ["name"], + "additionalProperties": false +} From 7176c0fe28befe40398c659f1d33d9e7191a4514 Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Sat, 20 Sep 2025 20:02:02 +0200 Subject: [PATCH 2/4] Update to algorithm description --- chunk-key-encodings/fanout/README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/chunk-key-encodings/fanout/README.md b/chunk-key-encodings/fanout/README.md index dd30227..4657cf7 100644 --- a/chunk-key-encodings/fanout/README.md +++ b/chunk-key-encodings/fanout/README.md @@ -1,5 +1,5 @@ # Fanout chunk key encoding -Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children (i.e., fanout). This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. 
+Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children. This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. ## Chunk key encoding name @@ -9,7 +9,7 @@ The value of the `name` member in the chunk key encoding object MUST be `fanout` ### `max_children` -An integer greater than 3 indicating the maximum number of child entries allowed within a single node (e.g., directory). Defaults to 1001 if omitted. +An integer greater than or equal to 3 indicating the maximum number of child entries (fanout) allowed within a single node. Defaults to 1001 if omitted. ## Example @@ -24,20 +24,18 @@ For example, the array metadata below specifies that chunk keys are encoded usin } } } -```` +``` ## Algorithm Given chunk coordinates as a tuple of integers and a parameter `max_children`, the chunk key is constructed as follows: 1. For each coordinate `coord` at dimension index `dim` (indexing starts from `0`): - 1. Create a dimension marker `d{dim}`. 2. Express `coord` in base `max_children - 1`, producing a sequence of digits (most significant first). - 3. Join the digits with `/` and prepend the dimension marker to form a subpath. For example: - - ``` - d{dim}/{digit0}/{digit1}/…/{digitN} - ``` + 3. Join the digits with `/` and prepend the dimension marker to form a subpath. This creates a subpath of the form: + ``` + d{dim}/{digit0}/{digit1}/.../{digitN} + ``` 2. Concatenate all dimension subpaths (in order from the lowest to highest dimension) using `/` as a separator. 
From 850bc50fe2aaf5268ce57b9d22b31ef87f98e65a Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Thu, 1 Jan 2026 17:41:26 +0100 Subject: [PATCH 3/4] Update algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Document max_children parameter (min=100, default=1000) and power-of-10 flooring behavior - Add note that encoding preserves lexicographic ordering - Replace base-max_children-1/d{dim} algorithm with decimal chunking (decimal_len) and c/... path construction - Update metadata and worked examples to match the new algorithm (incl. table of sample coordinates → keys) --- chunk-key-encodings/fanout/README.md | 59 ++++++++++++++++++---------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/chunk-key-encodings/fanout/README.md b/chunk-key-encodings/fanout/README.md index 4657cf7..dd7d478 100644 --- a/chunk-key-encodings/fanout/README.md +++ b/chunk-key-encodings/fanout/README.md @@ -1,5 +1,5 @@ # Fanout chunk key encoding -Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children. This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. +Defines a chunk key encoding that converts chunk coordinates into a `/`-separated path (representing a sequence of nodes in a tree hierarchy), by splitting each coordinate into multiple nodes such that no node in the hierarchy exceeds a predefined maximum number of children. This is useful for filesystems or other hierarchical stores that experience performance issues when nodes (e.g., directories) contain many entries. The encoding also ensures lexicographical ordering of chunk keys. 
## Chunk key encoding name @@ -9,50 +9,67 @@ The value of the `name` member in the chunk key encoding object MUST be `fanout` ### `max_children` -An integer greater than or equal to 3 indicating the maximum number of child entries (fanout) allowed within a single node. Defaults to 1001 if omitted. +* Type: integer +* Minimum: 100 +* Default: 1000 + +This parameter defines the maximum number of child entries allowed in a single node. + +* If a value smaller than 100 is provided, the implementation MUST raise an error. +* If a value that is not a power of 10 is provided, the implementation MAY floor the configuration parameter to the nearest lower power of 10. +* After initialization, the effective value of `max_children` MUST reflect the floored power-of-10 value, which may differ from the provided configuration: + +| Provided `max_children` | Effective `max_children` | +| ----------------------- | ------------------------ | +| 250 | 100 | +| 1234 | 1000 | ## Example -For example, the array metadata below specifies that chunk keys are encoded using the `fanout` strategy with a maximum of 1001 files per directory: +For example, the array metadata below specifies that chunk keys are encoded using the `fanout` strategy with a maximum of 10000 files per directory: ```json { "chunk_key_encoding": { "name": "fanout", "configuration": { - "max_children": 1001 + "max_children": 10000 } } } ``` ## Algorithm -Given chunk coordinates as a tuple of integers and a parameter `max_children`, the chunk key is constructed as follows: +Given a tuple of chunk coordinates and a `max_children` parameter, the chunk key MUST be constructed as follows: + +1. Compute `decimal_len`, the number of base-10 digits in `max_children - 1`, where `max_children` is the effective (power-of-10) value. -1. For each coordinate `coord` at dimension index `dim` (indexing starts from `0`): - 1. Create a dimension marker `d{dim}`. - 2. Express `coord` in base `max_children - 1`, producing a sequence of digits (most significant first). - 3.
Join the digits with `/` and prepend the dimension marker to form a subpath. This creates a subpath of the form: +2. For each coordinate: + 1. Split the coordinate (base-10) into chunks of `decimal_len` digits, starting from the least significant digit. + 2. Left-pad the leftmost chunk with zeros as needed so that it has exactly `decimal_len` digits. + 3. Prepend the number of chunks minus one (`num_chunks - 1`) to the sequence of chunks. + + For example: ``` - d{dim}/{digit0}/{digit1}/.../{digitN} + coordinate = 1234567 (max_children=1000) => final_sequence = [2, 001, 234, 567] ``` -2. Concatenate all dimension subpaths (in order from the lowest to highest dimension) using `/` as a separator. - -3. Append `"/c"` at the end to indicate the chunk file itself. +3. Concatenate all coordinate chunk sequences in order (from the lowest to highest dimension) and prepend `"c"` as the root. -> **Note:** Because nodes may also contain reserved entries such as the dimensional markers `dN` and the final chunk marker `c`, the effective numeric base used to subdivide coordinates is `max_children - 1`. +4. Join all parts using `/` as a separator. -> **Note:** This method ensures that no directory contains more than `max_children` child entries. Existing chunks never need to be moved or reorganized to maintain this property when new chunks are added. +> **Note:** This encoding ensures that no node contains more than `max_children` entries and that chunk keys sort lexicographically in the same order as their chunk coordinates.
### Example -With `max_children = 101` (effective base = 100): +With `max_children = 1000` (`decimal_len = 3`): + +| Coordinates | Chunk key | +| ---------------------------- | ----------------------------------------- | +| `()` | `c` | +| `(0,)` | `c/0/000` | +| `(12,)` | `c/0/012` | +| `(1234, 5, 0, 6789012)` | `c/1/001/234/0/005/0/000/2/006/789/012` | -| Coordinates | Chunk key | -| ------------------ | ---------------------------- | -| `()` | `c` | -| `(123,)` | `d0/1/23/c` | -| `(1234, 5, 67890)` | `d0/12/34/d1/5/d2/6/78/90/c` | ## Change log From efa1e8b14155784a6713bbbd9cbb6a3e9ee22e8f Mon Sep 17 00:00:00 2001 From: Remco Leijenaar <55834815+RFLeijenaar@users.noreply.github.com> Date: Thu, 1 Jan 2026 17:45:02 +0100 Subject: [PATCH 4/4] Update fanout schema to reflect new max_children constraints and default --- chunk-key-encodings/fanout/schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chunk-key-encodings/fanout/schema.json b/chunk-key-encodings/fanout/schema.json index 68b02d5..ddd3d6d 100644 --- a/chunk-key-encodings/fanout/schema.json +++ b/chunk-key-encodings/fanout/schema.json @@ -11,8 +11,8 @@ "properties": { "max_children": { "type": "integer", - "minimum": 3, - "default": 1001 + "minimum": 100, + "default": 1000 } }, "additionalProperties": false