Skip to content

Commit 72be520

Browse files
authored
Merge branch 'main' into fix-2544-rest-signer-token
2 parents a99dcad + 2d8397e commit 72be520

File tree

19 files changed

+1317
-196
lines changed

19 files changed

+1317
-196
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ setup-venv: ## Create virtual environment
7070
uv venv $(PYTHON_ARG)
7171

7272
install-dependencies: setup-venv ## Install all dependencies including extras
73-
uv sync $(PYTHON_ARG) --all-extras
73+
uv sync $(PYTHON_ARG) --all-extras --reinstall
7474

7575
install: install-uv install-dependencies ## Install uv and dependencies
7676

pyiceberg/catalog/rest/__init__.py

Lines changed: 160 additions & 3 deletions
Large diffs are not rendered by default.
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
from __future__ import annotations
18+
19+
from datetime import date, datetime, time
20+
from decimal import Decimal
21+
from typing import Annotated, Generic, Literal, TypeAlias, TypeVar
22+
from uuid import UUID
23+
24+
from pydantic import Field, model_validator
25+
26+
from pyiceberg.catalog.rest.response import ErrorResponseMessage
27+
from pyiceberg.expressions import BooleanExpression, SerializableBooleanExpression
28+
from pyiceberg.manifest import FileFormat
29+
from pyiceberg.typedef import IcebergBaseModel
30+
31+
# Primitive types that can appear in partition values and statistic bounds.
PrimitiveTypeValue: TypeAlias = bool | int | float | str | Decimal | UUID | date | time | datetime | bytes

# Value type variable for the generic KeyValueMap.
V = TypeVar("V")
35+
36+
37+
class KeyValueMap(IcebergBaseModel, Generic[V]):
    """Map serialized as parallel key/value arrays for column statistics."""

    keys: list[int] = Field(default_factory=list)
    values: list[V] = Field(default_factory=list)

    @model_validator(mode="after")
    def _validate_lengths_match(self) -> KeyValueMap[V]:
        """Reject parallel arrays of differing length."""
        n_keys = len(self.keys)
        n_values = len(self.values)
        if n_keys != n_values:
            raise ValueError(f"keys and values must have same length: {n_keys} != {n_values}")
        return self

    def to_dict(self) -> dict[int, V]:
        """Convert to dictionary mapping field ID to value."""
        return {key: value for key, value in zip(self.keys, self.values, strict=True)}
52+
53+
54+
class CountMap(KeyValueMap[int]):
    """Map of field IDs to counts (parallel-array encoded, see KeyValueMap)."""
56+
57+
58+
class ValueMap(KeyValueMap[PrimitiveTypeValue]):
    """Map of field IDs to primitive values (for lower/upper bounds)."""
60+
61+
62+
class StorageCredential(IcebergBaseModel):
    """Storage credential for accessing content files."""

    # Storage location prefix this credential applies to.
    prefix: str = Field(description="Storage location prefix this credential applies to")
    # Credential configuration keyed by property name.
    # NOTE(review): contents are server-defined (e.g. tokens/keys) — confirm against the REST spec.
    config: dict[str, str] = Field(default_factory=dict)
67+
68+
69+
class RESTContentFile(IcebergBaseModel):
    """Base model for data and delete files from REST API.

    Field aliases follow the kebab-case property names used on the wire.
    """

    # Partition spec the file was written with.
    spec_id: int = Field(alias="spec-id")
    # Partition values for this file.
    # NOTE(review): ordering is presumably the spec's field order — confirm against the server response.
    partition: list[PrimitiveTypeValue] = Field(default_factory=list)
    # Discriminator field; concrete subclasses pin this to a single literal.
    content: Literal["data", "position-deletes", "equality-deletes"]
    file_path: str = Field(alias="file-path")
    file_format: FileFormat = Field(alias="file-format")
    file_size_in_bytes: int = Field(alias="file-size-in-bytes")
    record_count: int = Field(alias="record-count")
    # NOTE(review): presumably encryption key metadata for encrypted files — confirm with spec.
    key_metadata: str | None = Field(alias="key-metadata", default=None)
    split_offsets: list[int] | None = Field(alias="split-offsets", default=None)
    sort_order_id: int | None = Field(alias="sort-order-id", default=None)
82+
83+
84+
class RESTDataFile(RESTContentFile):
    """Data file from REST API."""

    # Pins the discriminator for this subclass.
    content: Literal["data"] = Field(default="data")
    first_row_id: int | None = Field(alias="first-row-id", default=None)
    # Per-column statistics keyed by field ID (parallel-array encoding, see CountMap/ValueMap).
    column_sizes: CountMap | None = Field(alias="column-sizes", default=None)
    value_counts: CountMap | None = Field(alias="value-counts", default=None)
    null_value_counts: CountMap | None = Field(alias="null-value-counts", default=None)
    nan_value_counts: CountMap | None = Field(alias="nan-value-counts", default=None)
    lower_bounds: ValueMap | None = Field(alias="lower-bounds", default=None)
    upper_bounds: ValueMap | None = Field(alias="upper-bounds", default=None)
95+
96+
97+
class RESTPositionDeleteFile(RESTContentFile):
    """Position delete file from REST API."""

    # Pins the discriminator for this subclass.
    content: Literal["position-deletes"] = Field(default="position-deletes")
    # NOTE(review): presumably the single data file these deletes target, when scoped — confirm with spec.
    referenced_data_file: str | None = Field(alias="referenced-data-file", default=None)
    content_offset: int | None = Field(alias="content-offset", default=None)
    content_size_in_bytes: int | None = Field(alias="content-size-in-bytes", default=None)
104+
105+
106+
class RESTEqualityDeleteFile(RESTContentFile):
    """Equality delete file from REST API."""

    # Pins the discriminator for this subclass.
    content: Literal["equality-deletes"] = Field(default="equality-deletes")
    # Field IDs the equality comparison is performed on.
    equality_ids: list[int] | None = Field(alias="equality-ids", default=None)
111+
112+
113+
# Discriminated union for delete files: pydantic selects the concrete model
# from the literal value of the "content" field on each subclass.
RESTDeleteFile = Annotated[
    RESTPositionDeleteFile | RESTEqualityDeleteFile,
    Field(discriminator="content"),
]
118+
119+
120+
class RESTFileScanTask(IcebergBaseModel):
    """A file scan task from the REST server."""

    # The data file to scan.
    data_file: RESTDataFile = Field(alias="data-file")
    # Indexes into the surrounding response's delete-files list.
    delete_file_references: list[int] | None = Field(alias="delete-file-references", default=None)
    # NOTE(review): presumably the residual filter after partition pruning; None when absent — confirm with spec.
    residual_filter: BooleanExpression | None = Field(alias="residual-filter", default=None)
126+
127+
128+
class ScanTasks(IcebergBaseModel):
    """Container for scan tasks returned by the server."""

    delete_files: list[RESTDeleteFile] = Field(alias="delete-files", default_factory=list)
    file_scan_tasks: list[RESTFileScanTask] = Field(alias="file-scan-tasks", default_factory=list)
    plan_tasks: list[str] = Field(alias="plan-tasks", default_factory=list)

    @model_validator(mode="after")
    def _validate_delete_file_references(self) -> ScanTasks:
        """Check cross-references between file scan tasks and the delete-file list."""
        # Every reference must be a valid index into self.delete_files.
        max_idx = len(self.delete_files) - 1
        for scan_task in self.file_scan_tasks:
            for reference in scan_task.delete_file_references or []:
                if not 0 <= reference <= max_idx:
                    raise ValueError(f"Invalid delete file reference: {reference} (valid range: 0-{max_idx})")

        # Delete files must not appear without tasks that could reference them.
        if self.delete_files and not self.file_scan_tasks:
            raise ValueError("Invalid response: deleteFiles should only be returned with fileScanTasks that reference them")

        return self
148+
149+
150+
class PlanCompleted(ScanTasks):
    """Completed scan plan result, carrying the planned scan tasks."""

    # Discriminator value for the PlanningResponse union.
    status: Literal["completed"] = "completed"
    plan_id: str | None = Field(alias="plan-id", default=None)
    # Optional credentials for accessing the returned content files.
    storage_credentials: list[StorageCredential] | None = Field(alias="storage-credentials", default=None)
156+
157+
158+
class PlanSubmitted(IcebergBaseModel):
    """Scan plan submitted; poll for completion using the plan ID."""

    # Discriminator value for the PlanningResponse union.
    status: Literal["submitted"] = "submitted"
    plan_id: str | None = Field(alias="plan-id", default=None)
163+
164+
165+
class PlanCancelled(IcebergBaseModel):
    """Planning was cancelled."""

    # Discriminator value for the PlanningResponse union.
    status: Literal["cancelled"] = "cancelled"
169+
170+
171+
class PlanFailed(IcebergBaseModel):
    """Planning failed with an error returned by the server."""

    # Discriminator value for the PlanningResponse union.
    status: Literal["failed"] = "failed"
    error: ErrorResponseMessage
176+
177+
178+
# Discriminated union over the planning "status" field: the server reports
# one of completed, submitted, cancelled, or failed.
PlanningResponse = Annotated[
    PlanCompleted | PlanSubmitted | PlanCancelled | PlanFailed,
    Field(discriminator="status"),
]
182+
183+
184+
class PlanTableScanRequest(IcebergBaseModel):
    """Request body for planning a REST scan."""

    snapshot_id: int | None = Field(alias="snapshot-id", default=None)
    select: list[str] | None = Field(default=None)
    filter: SerializableBooleanExpression | None = Field(default=None)
    case_sensitive: bool = Field(alias="case-sensitive", default=True)
    use_snapshot_schema: bool = Field(alias="use-snapshot-schema", default=False)
    start_snapshot_id: int | None = Field(alias="start-snapshot-id", default=None)
    end_snapshot_id: int | None = Field(alias="end-snapshot-id", default=None)
    stats_fields: list[str] | None = Field(alias="stats-fields", default=None)
    min_rows_requested: int | None = Field(alias="min-rows-requested", default=None)

    @model_validator(mode="after")
    def _validate_snapshot_fields(self) -> PlanTableScanRequest:
        """Ensure snapshot-selection fields are used consistently.

        An incremental scan (start-snapshot-id) requires an end snapshot and
        is mutually exclusive with a point-in-time snapshot-id.
        """
        incremental = self.start_snapshot_id is not None
        if incremental and self.end_snapshot_id is None:
            raise ValueError("end-snapshot-id is required when start-snapshot-id is specified")
        if self.snapshot_id is not None and incremental:
            raise ValueError("Cannot specify both snapshot-id and start-snapshot-id")
        return self
204+
205+
206+
class FetchScanTasksRequest(IcebergBaseModel):
    """Request body for fetching scan tasks endpoint."""

    # NOTE(review): presumably one of the opaque tokens from ScanTasks.plan_tasks — confirm with spec.
    plan_task: str = Field(alias="plan-task")

pyiceberg/expressions/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
from pyiceberg.typedef import L
7070
from pyiceberg.types import strtobool
7171

72-
ParserElement.enablePackrat()
72+
ParserElement.enable_packrat()
7373

7474
AND = CaselessKeyword("and")
7575
OR = CaselessKeyword("or")
@@ -82,7 +82,7 @@
8282
BETWEEN = CaselessKeyword("between")
8383

8484
unquoted_identifier = Word(alphas + "_", alphanums + "_$")
85-
quoted_identifier = QuotedString('"', escChar="\\", unquoteResults=True)
85+
quoted_identifier = QuotedString('"', esc_quote="\\", unquote_results=True)
8686

8787

8888
@quoted_identifier.set_parse_action

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[
26812681
from pyiceberg.utils.bin_packing import PackingIterator
26822682

26832683
avg_row_size_bytes = tbl.nbytes / tbl.num_rows
2684-
target_rows_per_file = target_file_size // avg_row_size_bytes
2684+
target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes))
26852685
batches = tbl.to_batches(max_chunksize=target_rows_per_file)
26862686
bin_packed_record_batches = PackingIterator(
26872687
items=batches,

pyiceberg/table/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,7 @@ def upsert(
734734
when_not_matched_insert_all: bool = True,
735735
case_sensitive: bool = True,
736736
branch: str | None = MAIN_BRANCH,
737+
snapshot_properties: dict[str, str] = EMPTY_DICT,
737738
) -> UpsertResult:
738739
"""Shorthand API for performing an upsert to an iceberg table.
739740
@@ -745,6 +746,7 @@ def upsert(
745746
when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table
746747
case_sensitive: Bool indicating if the match should be case-sensitive
747748
branch: Branch Reference to run the upsert operation
749+
snapshot_properties: Custom properties to be added to the snapshot summary
748750
749751
To learn more about the identifier-field-ids: https://iceberg.apache.org/spec/#identifier-field-ids
750752
@@ -861,12 +863,13 @@ def upsert(
861863
rows_to_update,
862864
overwrite_filter=Or(*overwrite_predicates) if len(overwrite_predicates) > 1 else overwrite_predicates[0],
863865
branch=branch,
866+
snapshot_properties=snapshot_properties,
864867
)
865868

866869
if when_not_matched_insert_all:
867870
insert_row_cnt = len(rows_to_insert)
868871
if rows_to_insert:
869-
self.append(rows_to_insert, branch=branch)
872+
self.append(rows_to_insert, branch=branch, snapshot_properties=snapshot_properties)
870873

871874
return UpsertResult(rows_updated=update_row_cnt, rows_inserted=insert_row_cnt)
872875

@@ -1327,6 +1330,7 @@ def upsert(
13271330
when_not_matched_insert_all: bool = True,
13281331
case_sensitive: bool = True,
13291332
branch: str | None = MAIN_BRANCH,
1333+
snapshot_properties: dict[str, str] = EMPTY_DICT,
13301334
) -> UpsertResult:
13311335
"""Shorthand API for performing an upsert to an iceberg table.
13321336
@@ -1338,6 +1342,7 @@ def upsert(
13381342
when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table
13391343
case_sensitive: Bool indicating if the match should be case-sensitive
13401344
branch: Branch Reference to run the upsert operation
1345+
snapshot_properties: Custom properties to be added to the snapshot summary
13411346
13421347
To learn more about the identifier-field-ids: https://iceberg.apache.org/spec/#identifier-field-ids
13431348
@@ -1371,6 +1376,7 @@ def upsert(
13711376
when_not_matched_insert_all=when_not_matched_insert_all,
13721377
case_sensitive=case_sensitive,
13731378
branch=branch,
1379+
snapshot_properties=snapshot_properties,
13741380
)
13751381

13761382
def append(self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None:

pyiceberg/table/inspect.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,9 @@ def partitions(
285285
]
286286
)
287287

288-
partition_record = self.tbl.metadata.specs_struct()
288+
snapshot = self._get_snapshot(snapshot_id)
289+
spec_ids = {manifest.partition_spec_id for manifest in snapshot.manifests(self.tbl.io)}
290+
partition_record = self.tbl.metadata.specs_struct(spec_ids=spec_ids)
289291
has_partitions = len(partition_record.fields) > 0
290292

291293
if has_partitions:
@@ -299,8 +301,6 @@ def partitions(
299301

300302
table_schema = pa.unify_schemas([partitions_schema, table_schema])
301303

302-
snapshot = self._get_snapshot(snapshot_id)
303-
304304
scan = DataScan(
305305
table_metadata=self.tbl.metadata,
306306
io=self.tbl.io,

pyiceberg/table/metadata.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import datetime
2020
import uuid
21+
from collections.abc import Iterable
2122
from copy import copy
2223
from typing import Annotated, Any, Literal
2324

@@ -262,18 +263,23 @@ def specs(self) -> dict[int, PartitionSpec]:
262263
"""Return a dict the partition specs this table."""
263264
return {spec.spec_id: spec for spec in self.partition_specs}
264265

265-
def specs_struct(self) -> StructType:
266-
"""Produce a struct of all the combined PartitionSpecs.
266+
def specs_struct(self, spec_ids: Iterable[int] | None = None) -> StructType:
267+
"""Produce a struct of the combined PartitionSpecs.
267268
268269
The partition fields should be optional: Partition fields may be added later,
269270
in which case not all files would have the result field, and it may be null.
270271
271-
:return: A StructType that represents all the combined PartitionSpecs of the table
272+
Args:
273+
spec_ids: Optional iterable of spec IDs to include. When not provided,
274+
all table specs are used.
275+
276+
:return: A StructType that represents the combined PartitionSpecs of the table
272277
"""
273278
specs = self.specs()
279+
selected_specs = specs.values() if spec_ids is None else [specs[spec_id] for spec_id in spec_ids if spec_id in specs]
274280

275281
# Collect all the fields
276-
struct_fields = {field.field_id: field for spec in specs.values() for field in spec.fields}
282+
struct_fields = {field.field_id: field for spec in selected_specs for field in spec.fields}
277283

278284
schema = self.schema()
279285

pyiceberg/table/update/__init__.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,15 @@ class SetStatisticsUpdate(IcebergBaseModel):
181181

182182
@model_validator(mode="before")
183183
def validate_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]:
184-
stats = cast(StatisticsFile, data["statistics"])
185-
186-
data["snapshot_id"] = stats.snapshot_id
184+
stats = data["statistics"]
185+
if isinstance(stats, StatisticsFile):
186+
snapshot_id = stats.snapshot_id
187+
elif isinstance(stats, dict):
188+
snapshot_id = cast(int, stats.get("snapshot-id"))
189+
else:
190+
snapshot_id = None
191+
192+
data["snapshot_id"] = snapshot_id
187193

188194
return data
189195

pyiceberg/table/update/snapshot.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,7 +880,7 @@ def create_tag(self, snapshot_id: int, tag_name: str, max_ref_age_ms: int | None
880880
update, requirement = self._transaction._set_ref_snapshot(
881881
snapshot_id=snapshot_id,
882882
ref_name=tag_name,
883-
type="tag",
883+
type=SnapshotRefType.TAG,
884884
max_ref_age_ms=max_ref_age_ms,
885885
)
886886
self._updates += update
@@ -921,7 +921,7 @@ def create_branch(
921921
update, requirement = self._transaction._set_ref_snapshot(
922922
snapshot_id=snapshot_id,
923923
ref_name=branch_name,
924-
type="branch",
924+
type=SnapshotRefType.BRANCH,
925925
max_ref_age_ms=max_ref_age_ms,
926926
max_snapshot_age_ms=max_snapshot_age_ms,
927927
min_snapshots_to_keep=min_snapshots_to_keep,

0 commit comments

Comments
 (0)