add_doc

discivigour · discivigour · commit 3240fd363ca8 · 2025-10-16T14:33:14.000+08:00
diff --git a/docs/content/program-api/python-api.md b/docs/content/program-api/python-api.md
@@ -286,7 +286,8 @@ for batch in table_read.to_arrow_batch_reader(splits):
 ```
 
 #### Python Iterator
-You can read the data row by row into a native Python iterator. 
+
+You can read the data row by row into a native Python iterator.
 This is convenient for custom row-based processing logic.
 
 ```python
@@ -365,23 +366,177 @@ print(ray_dataset.to_pandas())
 # ...
 ```
 
+### Incremental Read Between Timestamps
+
+This API allows reading data committed between two snapshot timestamps. The steps are as follows.
+
+- Set the option `CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP` on a copied table via `table.copy({...})`. The value must
+  be a string: `"startMillis,endMillis"`, where `startMillis` is exclusive and `endMillis` is inclusive.
+- Use `SnapshotManager` to obtain the snapshot timestamps, or supply your own millisecond timestamps.
+- Read the data as above.
+
+Example:
+
+```python
+from pypaimon import CatalogFactory
+from pypaimon.common.core_options import CoreOptions
+from pypaimon.snapshot.snapshot_manager import SnapshotManager
+
+# Prepare catalog and obtain a table
+catalog = CatalogFactory.create({'warehouse': '/path/to/warehouse'})
+table = catalog.get_table('default.your_table_name')
+
+# Assume the table has at least two snapshots (1 and 2)
+snapshot_manager = SnapshotManager(table)
+t1 = snapshot_manager.get_snapshot_by_id(1).time_millis
+t2 = snapshot_manager.get_snapshot_by_id(2).time_millis
+
+# Read records committed in (t1, t2]: t1 is exclusive, t2 is inclusive
+table_inc = table.copy({CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP: f"{t1},{t2}"})
+
+read_builder = table_inc.new_read_builder()
+table_scan = read_builder.new_scan()
+table_read = read_builder.new_read()
+splits = table_scan.plan().splits()
+
+# To Arrow
+arrow_table = table_read.to_arrow(splits)
+
+# Or to pandas
+pandas_df = table_read.to_pandas(splits)
+```
+
+### Shard Read
+
+Shard Read allows you to read data in parallel by dividing the table into multiple shards. This is useful for
+distributed processing and parallel computation.
+
+You can specify the shard index and total number of shards to read a specific portion of the data:
+
+```python
+# Prepare read builder
+table = catalog.get_table('database_name.table_name')
+read_builder = table.new_read_builder()
+table_read = read_builder.new_read()
+
+# Read the second shard (index 1) out of 3 total shards
+splits = read_builder.new_scan().with_shard(1, 3).plan().splits()
+
+# Read all shards and concatenate results
+splits1 = read_builder.new_scan().with_shard(0, 3).plan().splits()
+splits2 = read_builder.new_scan().with_shard(1, 3).plan().splits()
+splits3 = read_builder.new_scan().with_shard(2, 3).plan().splits()
+
+# Combine results from all shards
+
+all_splits = splits1 + splits2 + splits3
+pa_table = table_read.to_arrow(all_splits)
+```
+
+Example with shard read:
+
+```python
+import pyarrow as pa
+from pypaimon import CatalogFactory, Schema
+
+# Create catalog
+catalog_options = {'warehouse': 'file:///path/to/warehouse'}
+catalog = CatalogFactory.create(catalog_options)
+catalog.create_database("default", False)
+# Define schema
+pa_schema = pa.schema([
+    ('user_id', pa.int64()),
+    ('item_id', pa.int64()),
+    ('behavior', pa.string()),
+    ('dt', pa.string()),
+])
+
+# Create table and write data
+schema = Schema.from_pyarrow_schema(pa_schema, partition_keys=['dt'])
+catalog.create_table('default.test_table', schema, False)
+table = catalog.get_table('default.test_table')
+
+# Write data in two batches
+write_builder = table.new_batch_write_builder()
+
+# First write
+table_write = write_builder.new_write()
+table_commit = write_builder.new_commit()
+data1 = {
+    'user_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
+    'item_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014],
+    'behavior': ['a', 'b', 'c', None, 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'],
+    'dt': ['p1', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1'],
+}
+pa_table = pa.Table.from_pydict(data1, schema=pa_schema)
+table_write.write_arrow(pa_table)
+table_commit.commit(table_write.prepare_commit())
+table_write.close()
+table_commit.close()
+
+# Second write
+table_write = write_builder.new_write()
+table_commit = write_builder.new_commit()
+data2 = {
+    'user_id': [5, 6, 7, 8, 18],
+    'item_id': [1005, 1006, 1007, 1008, 1018],
+    'behavior': ['e', 'f', 'g', 'h', 'z'],
+    'dt': ['p2', 'p1', 'p2', 'p2', 'p1'],
+}
+pa_table = pa.Table.from_pydict(data2, schema=pa_schema)
+table_write.write_arrow(pa_table)
+table_commit.commit(table_write.prepare_commit())
+table_write.close()
+table_commit.close()
+
+# Read specific shard
+read_builder = table.new_read_builder()
+table_read = read_builder.new_read()
+
+# Read the third shard (index 2) out of 3 total shards
+splits = read_builder.new_scan().with_shard(2, 3).plan().splits()
+shard_data = table_read.to_arrow(splits)
+
+# Verify shard distribution by reading all shards
+splits1 = read_builder.new_scan().with_shard(0, 3).plan().splits()
+splits2 = read_builder.new_scan().with_shard(1, 3).plan().splits()
+splits3 = read_builder.new_scan().with_shard(2, 3).plan().splits()
+
+# Combining all shards should equal a full table read
+all_shards_data = pa.concat_tables([
+    table_read.to_arrow(splits1),
+    table_read.to_arrow(splits2),
+    table_read.to_arrow(splits3),
+])
+full_table_data = table_read.to_arrow(read_builder.new_scan().plan().splits())
+```
+
+Key points about shard read:
+
+- **Shard Index**: Zero-based index of the shard to read (0 to total_shards-1)
+- **Total Shards**: Total number of shards to divide the data into
+- **Data Distribution**: Data is distributed evenly across shards, with remainder rows going to the last shard
+- **Parallel Processing**: Each shard can be processed independently for better performance
+- **Consistency**: Combining all shards should produce the complete table data
+
 ## Data Types
-| Python Native Type | PyArrow Type | Paimon Type |
-| :--- | :--- | :--- |
-| `int` | `pyarrow.int8()` | `TINYINT` |
-| `int` | `pyarrow.int16()` | `SMALLINT` |
-| `int` | `pyarrow.int32()` | `INT` |
-| `int` | `pyarrow.int64()` | `BIGINT` |
-| `float` | `pyarrow.float32()` | `FLOAT` |
-| `float` | `pyarrow.float64()` | `DOUBLE` |
-| `bool` | `pyarrow.bool_()` | `BOOLEAN` |
-| `str` | `pyarrow.string()` | `STRING`, `CHAR(n)`, `VARCHAR(n)` |
-| `bytes` | `pyarrow.binary()` | `BYTES`, `VARBINARY(n)` |
-| `bytes` | `pyarrow.binary(length)` | `BINARY(length)` |
-| `decimal.Decimal` | `pyarrow.decimal128(precision, scale)` | `DECIMAL(precision, scale)` |
-| `datetime.datetime` | `pyarrow.timestamp(unit, tz=None)` | `TIMESTAMP(p)` |
-| `datetime.date` | `pyarrow.date32()` | `DATE` |
-| `datetime.time` | `pyarrow.time32(unit)` or `pyarrow.time64(unit)` | `TIME(p)` |
+
+| Python Native Type  | PyArrow Type                                     | Paimon Type                       |
+|:--------------------|:-------------------------------------------------|:----------------------------------|
+| `int`               | `pyarrow.int8()`                                 | `TINYINT`                         |
+| `int`               | `pyarrow.int16()`                                | `SMALLINT`                        |
+| `int`               | `pyarrow.int32()`                                | `INT`                             |
+| `int`               | `pyarrow.int64()`                                | `BIGINT`                          |
+| `float`             | `pyarrow.float32()`                              | `FLOAT`                           |
+| `float`             | `pyarrow.float64()`                              | `DOUBLE`                          |
+| `bool`              | `pyarrow.bool_()`                                | `BOOLEAN`                         |
+| `str`               | `pyarrow.string()`                               | `STRING`, `CHAR(n)`, `VARCHAR(n)` |
+| `bytes`             | `pyarrow.binary()`                               | `BYTES`, `VARBINARY(n)`           |
+| `bytes`             | `pyarrow.binary(length)`                         | `BINARY(length)`                  |
+| `decimal.Decimal`   | `pyarrow.decimal128(precision, scale)`           | `DECIMAL(precision, scale)`       |
+| `datetime.datetime` | `pyarrow.timestamp(unit, tz=None)`               | `TIMESTAMP(p)`                    |
+| `datetime.date`     | `pyarrow.date32()`                               | `DATE`                            |
+| `datetime.time`     | `pyarrow.time32(unit)` or `pyarrow.time64(unit)` | `TIME(p)`                         |
 
 ## Predicate
 
@@ -402,5 +557,4 @@ print(ray_dataset.to_pandas())
 | f.contains(literal)   | PredicateBuilder.contains(f, literal)         |
 | f is in [l1, l2]      | PredicateBuilder.is_in(f, [l1, l2])           |
 | f is not in [l1, l2]  | PredicateBuilder.is_not_in(f, [l1, l2])       |
-| lower <= f <= upper   | PredicateBuilder.between(f, lower, upper)     |
-
+| lower <= f <= upper   | PredicateBuilder.between(f, lower, upper)     |
diff --git a/paimon-python/pypaimon/read/scanner/incremental_starting_scanner.py b/paimon-python/pypaimon/read/scanner/incremental_starting_scanner.py
@@ -62,7 +62,7 @@ def between_timestamps(table, predicate: Optional[Predicate], limit: Optional[in
         """
         snapshot_manager = SnapshotManager(table)
         starting_snapshot = snapshot_manager.earlier_or_equal_time_mills(start_timestamp)
-        earliest_snapshot = snapshot_manager.get_earliest_snapshot()
+        earliest_snapshot = snapshot_manager.try_get_earliest_snapshot()
 
         # If earliest_snapshot.time_millis > start_timestamp we should include the earliest_snapshot
         if starting_snapshot is None or (earliest_snapshot and earliest_snapshot.time_millis > start_timestamp):
diff --git a/paimon-python/pypaimon/read/table_scan.py b/paimon-python/pypaimon/read/table_scan.py
@@ -64,7 +64,7 @@ def _create_starting_scanner(self) -> Optional[StartingScanner]:
                 raise ValueError(
                     "The incremental-between-timestamp must specific start(exclusive) and end timestamp. But is: " +
                     options[CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP])
-            earliest_snapshot = SnapshotManager(self.table).get_earliest_snapshot()
+            earliest_snapshot = SnapshotManager(self.table).try_get_earliest_snapshot()
             latest_snapshot = SnapshotManager(self.table).get_latest_snapshot()
             if earliest_snapshot is None or latest_snapshot is None:
                 return EmptyStartingScanner()
diff --git a/paimon-python/pypaimon/snapshot/snapshot_manager.py b/paimon-python/pypaimon/snapshot/snapshot_manager.py
@@ -60,15 +60,13 @@ def get_snapshot_path(self, snapshot_id: int) -> Path:
         """
         return self.snapshot_dir / f"snapshot-{snapshot_id}"
 
-    def get_earliest_snapshot(self) -> Optional[Snapshot]:
-        """
-        Get the earliest snapshot.
-
-        Returns:
-            The earliest snapshot, or None if no snapshots exist
-        """
-        # TODO implement EARLIEST file
-        return self.get_snapshot_by_id(1)
+    def try_get_earliest_snapshot(self) -> Optional[Snapshot]:
+        if self.file_io.exists(self.snapshot_dir / "EARLIEST"):
+            earliest_content = self.file_io.read_file_utf8(self.snapshot_dir / "EARLIEST")
+            earliest_snapshot_id = int(earliest_content.strip())
+            return self.get_snapshot_by_id(earliest_snapshot_id)
+        else:
+            return self.get_snapshot_by_id(1)
 
     def earlier_or_equal_time_mills(self, timestamp: int) -> Optional[Snapshot]:
         """
diff --git a/paimon-python/pypaimon/tests/reader_append_only_test.py b/paimon-python/pypaimon/tests/reader_append_only_test.py
@@ -290,7 +290,7 @@ def test_incremental_timestamp(self):
         t1 = snapshot_manager.get_snapshot_by_id(1).time_millis
         t2 = snapshot_manager.get_snapshot_by_id(2).time_millis
         # test 1
-        table = table.copy({CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP: str(timestamp-1) + ',' + str(timestamp)})
+        table = table.copy({CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP: str(timestamp - 1) + ',' + str(timestamp)})
         read_builder = table.new_read_builder()
         actual = self._read_test_table(read_builder)
         self.assertEqual(len(actual), 0)
@@ -306,6 +306,42 @@ def test_incremental_timestamp(self):
         expected = self.expected.slice(4, 4)
         self.assertEqual(expected, actual)
 
+    def test_incremental_read_multi_snapshots(self):
+        schema = Schema.from_pyarrow_schema(self.pa_schema, partition_keys=['dt'])
+        self.catalog.create_table('default.test_incremental_100', schema, False)
+        table = self.catalog.get_table('default.test_incremental_100')
+
+        write_builder = table.new_batch_write_builder()
+        for i in range(1, 101):
+            table_write = write_builder.new_write()
+            table_commit = write_builder.new_commit()
+            pa_table = pa.Table.from_pydict({
+                'user_id': [i],
+                'item_id': [1000 + i],
+                'behavior': [f'snap{i}'],
+                'dt': ['p1' if i % 2 == 1 else 'p2'],
+            }, schema=self.pa_schema)
+            table_write.write_arrow(pa_table)
+            table_commit.commit(table_write.prepare_commit())
+            table_write.close()
+            table_commit.close()
+
+        snapshot_manager = SnapshotManager(table)
+        t10 = snapshot_manager.get_snapshot_by_id(10).time_millis
+        t20 = snapshot_manager.get_snapshot_by_id(20).time_millis
+
+        table_inc = table.copy({CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP: f"{t10},{t20}"})
+        read_builder = table_inc.new_read_builder()
+        actual = self._read_test_table(read_builder).sort_by('user_id')
+
+        expected = pa.Table.from_pydict({
+            'user_id': list(range(11, 21)),
+            'item_id': [1000 + i for i in range(11, 21)],
+            'behavior': [f'snap{i}' for i in range(11, 21)],
+            'dt': ['p1' if i % 2 == 1 else 'p2' for i in range(11, 21)],
+        }, schema=self.pa_schema).sort_by('user_id')
+        self.assertEqual(expected, actual)
+
     def _write_test_table(self, table):
         write_builder = table.new_batch_write_builder()