Commit 8bda95e

discivigour authored and JingsongLi committed
[Python] Introduce incremental-between read by timestamp (#6391)
1 parent 0e05b7e commit 8bda95e

25 files changed
Lines changed: 1008 additions & 495 deletions

docs/content/program-api/python-api.md

Lines changed: 173 additions & 19 deletions
@@ -286,7 +286,8 @@ for batch in table_read.to_arrow_batch_reader(splits):
```

#### Python Iterator

You can read the data row by row into a native Python iterator.
This is convenient for custom row-based processing logic.

```python
@@ -365,23 +366,177 @@ print(ray_dataset.to_pandas())
# ...
```

### Incremental Read Between Timestamps

This API allows reading data committed between two snapshot timestamps. The steps are as follows.

- Set the option `CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP` on a copied table via `table.copy({...})`. The value must be a string `"startMillis,endMillis"`, where `startMillis` is exclusive and `endMillis` is inclusive.
- Use `SnapshotManager` to obtain snapshot timestamps, or determine them yourself.
- Read the data as above.

Example:

```python
from pypaimon import CatalogFactory
from pypaimon.common.core_options import CoreOptions
from pypaimon.snapshot.snapshot_manager import SnapshotManager

# Prepare catalog and obtain a table
catalog = CatalogFactory.create({'warehouse': '/path/to/warehouse'})
table = catalog.get_table('default.your_table_name')

# Assume the table has at least two snapshots (1 and 2)
snapshot_manager = SnapshotManager(table)
t1 = snapshot_manager.get_snapshot_by_id(1).time_millis
t2 = snapshot_manager.get_snapshot_by_id(2).time_millis

# Read records committed in (t1, t2] (start exclusive, end inclusive)
table_inc = table.copy({CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP: f"{t1},{t2}"})

read_builder = table_inc.new_read_builder()
table_scan = read_builder.new_scan()
table_read = read_builder.new_read()
splits = table_scan.plan().splits()

# To Arrow
arrow_table = table_read.to_arrow(splits)

# Or to pandas
pandas_df = table_read.to_pandas(splits)
```
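If you determine the timestamps yourself rather than reading them from snapshots, the option value is just two epoch-millisecond integers joined by a comma. A minimal sketch; the `incremental_range` helper is hypothetical, not part of pypaimon:

```python
from datetime import datetime

def incremental_range(start: datetime, end: datetime) -> str:
    """Format the 'startMillis,endMillis' option value.

    start is exclusive and end is inclusive, matching the option's semantics.
    """
    def to_millis(dt: datetime) -> int:
        return int(dt.timestamp() * 1000)
    return f"{to_millis(start)},{to_millis(end)}"

# For example, read everything committed on 2024-01-01 (local time)
table_inc = table.copy({
    CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP:
        incremental_range(datetime(2024, 1, 1), datetime(2024, 1, 2))
})
```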
### Shard Read

Shard Read allows you to read data in parallel by dividing the table into multiple shards. This is useful for distributed processing and parallel computation.

You can specify the shard index and total number of shards to read a specific portion of the data:

```python
# Prepare read builder
table = catalog.get_table('database_name.table_name')
read_builder = table.new_read_builder()
table_read = read_builder.new_read()

# Read the second shard (index 1) out of 3 total shards
splits = read_builder.new_scan().with_shard(1, 3).plan().splits()

# Read all shards and concatenate results
splits1 = read_builder.new_scan().with_shard(0, 3).plan().splits()
splits2 = read_builder.new_scan().with_shard(1, 3).plan().splits()
splits3 = read_builder.new_scan().with_shard(2, 3).plan().splits()

# Combine results from all shards
all_splits = splits1 + splits2 + splits3
pa_table = table_read.to_arrow(all_splits)
```

A complete end-to-end example:
```python
import pyarrow as pa
from pypaimon import CatalogFactory, Schema

# Create catalog
catalog_options = {'warehouse': 'file:///path/to/warehouse'}
catalog = CatalogFactory.create(catalog_options)
catalog.create_database("default", False)

# Define schema
pa_schema = pa.schema([
    ('user_id', pa.int64()),
    ('item_id', pa.int64()),
    ('behavior', pa.string()),
    ('dt', pa.string()),
])

# Create table and write data
schema = Schema.from_pyarrow_schema(pa_schema, partition_keys=['dt'])
catalog.create_table('default.test_table', schema, False)
table = catalog.get_table('default.test_table')

# Write data in two batches
write_builder = table.new_batch_write_builder()

# First write
table_write = write_builder.new_write()
table_commit = write_builder.new_commit()
data1 = {
    'user_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    'item_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014],
    'behavior': ['a', 'b', 'c', None, 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'],
    'dt': ['p1', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1', 'p2', 'p1'],
}
pa_table = pa.Table.from_pydict(data1, schema=pa_schema)
table_write.write_arrow(pa_table)
table_commit.commit(table_write.prepare_commit())
table_write.close()
table_commit.close()

# Second write
table_write = write_builder.new_write()
table_commit = write_builder.new_commit()
data2 = {
    'user_id': [5, 6, 7, 8, 18],
    'item_id': [1005, 1006, 1007, 1008, 1018],
    'behavior': ['e', 'f', 'g', 'h', 'z'],
    'dt': ['p2', 'p1', 'p2', 'p2', 'p1'],
}
pa_table = pa.Table.from_pydict(data2, schema=pa_schema)
table_write.write_arrow(pa_table)
table_commit.commit(table_write.prepare_commit())
table_write.close()
table_commit.close()

# Read a specific shard
read_builder = table.new_read_builder()
table_read = read_builder.new_read()

# Read shard 2 out of 3 total shards
splits = read_builder.new_scan().with_shard(2, 3).plan().splits()
shard_data = table_read.to_arrow(splits)

# Verify shard distribution by reading all shards
splits1 = read_builder.new_scan().with_shard(0, 3).plan().splits()
splits2 = read_builder.new_scan().with_shard(1, 3).plan().splits()
splits3 = read_builder.new_scan().with_shard(2, 3).plan().splits()

# Combining all shards should equal a full table read
all_shards_data = pa.concat_tables([
    table_read.to_arrow(splits1),
    table_read.to_arrow(splits2),
    table_read.to_arrow(splits3),
])
full_table_data = table_read.to_arrow(read_builder.new_scan().plan().splits())
```
Key points about shard read:

- **Shard Index**: Zero-based index of the shard to read (0 to total_shards - 1)
- **Total Shards**: Total number of shards to divide the data into
- **Data Distribution**: Data is distributed evenly across shards, with remainder rows going to the last shard
- **Parallel Processing**: Each shard can be processed independently for better performance (see the sketch after this list)
- **Consistency**: Combining all shards should produce the complete table data
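Because each shard can be planned and read independently, shard reads parallelize naturally. A sketch using a thread pool, assuming only the API shown above; creating one read builder per task is the conservative choice, since thread-safety of a shared reader is not documented here:

```python
from concurrent.futures import ThreadPoolExecutor

import pyarrow as pa

def read_shard(shard_index: int, total_shards: int) -> pa.Table:
    # One builder per task: plans and readers stay independent.
    rb = table.new_read_builder()
    splits = rb.new_scan().with_shard(shard_index, total_shards).plan().splits()
    return rb.new_read().to_arrow(splits)

TOTAL = 3
with ThreadPoolExecutor(max_workers=TOTAL) as pool:
    shard_tables = list(pool.map(lambda i: read_shard(i, TOTAL), range(TOTAL)))

# Per the consistency point above, this should equal a full, unsharded read.
combined = pa.concat_tables(shard_tables)
```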
## Data Types

| Python Native Type | PyArrow Type | Paimon Type |
|:--------------------|:-------------------------------------------------|:----------------------------------|
| `int` | `pyarrow.int8()` | `TINYINT` |
| `int` | `pyarrow.int16()` | `SMALLINT` |
| `int` | `pyarrow.int32()` | `INT` |
| `int` | `pyarrow.int64()` | `BIGINT` |
| `float` | `pyarrow.float32()` | `FLOAT` |
| `float` | `pyarrow.float64()` | `DOUBLE` |
| `bool` | `pyarrow.bool_()` | `BOOLEAN` |
| `str` | `pyarrow.string()` | `STRING`, `CHAR(n)`, `VARCHAR(n)` |
| `bytes` | `pyarrow.binary()` | `BYTES`, `VARBINARY(n)` |
| `bytes` | `pyarrow.binary(length)` | `BINARY(length)` |
| `decimal.Decimal` | `pyarrow.decimal128(precision, scale)` | `DECIMAL(precision, scale)` |
| `datetime.datetime` | `pyarrow.timestamp(unit, tz=None)` | `TIMESTAMP(p)` |
| `datetime.date` | `pyarrow.date32()` | `DATE` |
| `datetime.time` | `pyarrow.time32(unit)` or `pyarrow.time64(unit)` | `TIME(p)` |
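The shard-read example above already exercises the `int64` and `string` rows; a schema touching a few more of the mappings might look like the sketch below. The `TIMESTAMP(3)` correspondence for `pa.timestamp('ms')` is an assumption based on the table, not something this commit states:

```python
import datetime
import decimal

import pyarrow as pa

# Columns exercising several rows of the mapping table above.
pa_schema = pa.schema([
    ('id', pa.int64()),               # BIGINT
    ('price', pa.decimal128(10, 2)),  # DECIMAL(10, 2)
    ('created', pa.timestamp('ms')),  # TIMESTAMP(3), assuming ms maps to p=3
    ('day', pa.date32()),             # DATE
])

row = {
    'id': [1],
    'price': [decimal.Decimal('19.99')],
    'created': [datetime.datetime(2024, 1, 1, 12, 0, 0)],
    'day': [datetime.date(2024, 1, 1)],
}
pa_table = pa.Table.from_pydict(row, schema=pa_schema)
```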

## Predicate

@@ -402,5 +557,4 @@ print(ray_dataset.to_pandas())
| f.contains(literal) | PredicateBuilder.contains(f, literal) |
| f is in [l1, l2] | PredicateBuilder.is_in(f, [l1, l2]) |
| f is not in [l1, l2] | PredicateBuilder.is_not_in(f, [l1, l2]) |
| lower <= f <= upper | PredicateBuilder.between(f, lower, upper) |
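A short sketch tying this table to a read, assuming `new_predicate_builder()` and `with_filter(...)` on the read builder, which this excerpt does not show:

```python
# Filter user_id to [1, 100] before planning; field names are passed
# as strings, literals as plain Python values.
read_builder = table.new_read_builder()
predicate_builder = read_builder.new_predicate_builder()
predicate = predicate_builder.between('user_id', 1, 100)  # lower <= f <= upper
read_builder = read_builder.with_filter(predicate)
splits = read_builder.new_scan().plan().splits()
filtered = read_builder.new_read().to_arrow(splits)
```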

paimon-python/pypaimon/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,8 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.

-from pypaimon.filesystem.pvfs import PaimonVirtualFileSystem
 from pypaimon.catalog.catalog_factory import CatalogFactory
+from pypaimon.filesystem.pvfs import PaimonVirtualFileSystem
 from pypaimon.schema.schema import Schema

 __version__ = "0.3.dev"

paimon-python/pypaimon/common/core_options.py

Lines changed: 1 addition & 0 deletions
@@ -45,5 +45,6 @@ def __str__(self):
     FILE_BLOCK_SIZE = "file.block-size"
     # Scan options
     SCAN_FALLBACK_BRANCH = "scan.fallback-branch"
+    INCREMENTAL_BETWEEN_TIMESTAMP = "incremental-between-timestamp"
     # Commit options
     COMMIT_USER_PREFIX = "commit.user-prefix"
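The constant just names the string key, so either spelling works when copying a table (a sketch, with `table` as in the docs above):

```python
# These two copies are equivalent: CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP
# is simply the string "incremental-between-timestamp".
t1 = table.copy({CoreOptions.INCREMENTAL_BETWEEN_TIMESTAMP: "1000,2000"})
t2 = table.copy({"incremental-between-timestamp": "1000,2000"})
```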

paimon-python/pypaimon/filesystem/pvfs.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 from fsspec.implementations.local import LocalFileSystem
 from readerwriterlock import rwlock

-from pypaimon.api.api_response import GetTableTokenResponse, GetTableResponse
+from pypaimon.api.api_response import GetTableResponse, GetTableTokenResponse
 from pypaimon.api.client import AlreadyExistsException, NoSuchResourceException
 from pypaimon.api.rest_api import RESTApi
 from pypaimon.common.config import CatalogOptions, OssOptions, PVFSOptions

paimon-python/pypaimon/manifest/manifest_file_manager.py

Lines changed: 2 additions & 1 deletion
@@ -15,10 +15,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ################################################################################
-import fastavro
 from io import BytesIO
 from typing import List

+import fastavro
+
 from pypaimon.manifest.schema.data_file_meta import DataFileMeta
 from pypaimon.manifest.schema.manifest_entry import (MANIFEST_ENTRY_SCHEMA,
                                                      ManifestEntry)

paimon-python/pypaimon/manifest/manifest_list_manager.py

Lines changed: 3 additions & 0 deletions
@@ -47,6 +47,9 @@ def read_all(self, snapshot: Snapshot) -> List[ManifestFileMeta]:
         manifest_files.extend(delta_manifests)
         return manifest_files

+    def read_delta(self, snapshot: Snapshot) -> List[ManifestFileMeta]:
+        return self.read(snapshot.delta_manifest_list)
+
     def read(self, manifest_list_name: str) -> List[ManifestFileMeta]:
         manifest_files = []
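`read_delta` returns only the manifests added by a single snapshot, which is presumably what the incremental path needs: the deltas of every snapshot committed in `(start, end]`. A hypothetical sketch; the id-range accessors on `snapshot_manager` are assumptions, not shown in this commit:

```python
def delta_manifests_between(snapshot_manager, manifest_list_manager,
                            start_millis: int, end_millis: int):
    # Collect delta manifests of snapshots committed in (start_millis, end_millis].
    # get_earliest_snapshot_id / get_latest_snapshot_id are assumed accessors.
    manifests = []
    for sid in range(snapshot_manager.get_earliest_snapshot_id(),
                     snapshot_manager.get_latest_snapshot_id() + 1):
        snapshot = snapshot_manager.get_snapshot_by_id(sid)
        if start_millis < snapshot.time_millis <= end_millis:
            manifests.extend(manifest_list_manager.read_delta(snapshot))
    return manifests
```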

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from pypaimon.read.plan import Plan
+from pypaimon.read.scanner.starting_scanner import StartingScanner
+
+
+class EmptyStartingScanner(StartingScanner):
+
+    def scan(self) -> Plan:
+        return Plan([])
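`EmptyStartingScanner` always plans zero splits. Presumably (its call site is not in this excerpt) the incremental scan falls back to it when no snapshot lies in the requested timestamp range, so callers can handle that case uniformly. A hypothetical sketch:

```python
# Hypothetical call site, not from this commit: choose a scanner based
# on whether any snapshot falls inside the (start, end] range.
def create_starting_scanner(snapshots_in_range):
    if not snapshots_in_range:
        return EmptyStartingScanner()  # its plan carries no splits
    return IncrementalStartingScanner(snapshots_in_range)  # assumed class
```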
