Skip to content

Commit e63feff

Browse files
committed
Added implementation to delete files.
1 parent e522567 commit e63feff

File tree

2 files changed

+163
-0
lines changed

2 files changed

+163
-0
lines changed

pyiceberg/table/__init__.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,40 @@ def add_files(
936936
for data_file in data_files:
937937
append_files.append_data_file(data_file)
938938

939+
def delete_files(
    self,
    file_paths: list[str],
    snapshot_properties: dict[str, str] = EMPTY_DICT,
    branch: str | None = MAIN_BRANCH,
) -> None:
    """
    Shorthand API for removing data files from the table transaction by their paths.

    Args:
        file_paths: The list of full file paths to be removed from the table
        snapshot_properties: Custom properties to be added to the snapshot summary
        branch: Branch to delete files from

    Raises:
        ValueError: If file_paths contains duplicates
        ValueError: If any file paths are not found in the table
    """
    requested_paths = set(file_paths)
    # A shorter set than the input list means at least one path repeats.
    if len(requested_paths) != len(file_paths):
        raise ValueError("File paths must be unique")

    found_files = _get_data_files_from_snapshot(
        table_metadata=self.table_metadata, file_paths=requested_paths, io=self._table.io, branch=branch
    )

    missing_files = requested_paths - set(found_files)
    if missing_files:
        raise ValueError(f"Cannot delete files that are not referenced by table, files: {', '.join(sorted(missing_files))}")

    snapshot_update = self.update_snapshot(snapshot_properties=snapshot_properties, branch=branch)
    with snapshot_update.overwrite() as overwrite_snapshot:
        for data_file in found_files.values():
            overwrite_snapshot.delete_data_file(data_file)
972+
939973
def update_spec(self) -> UpdateSpec:
940974
"""Create a new UpdateSpec to update the partitioning of the table.
941975
@@ -1506,6 +1540,31 @@ def add_files(
15061540
branch=branch,
15071541
)
15081542

1543+
def delete_files(
    self,
    file_paths: list[str],
    snapshot_properties: dict[str, str] = EMPTY_DICT,
    branch: str | None = MAIN_BRANCH,
) -> None:
    """
    Shorthand API for removing data files from the table by their paths.

    Opens a single transaction and delegates to the transaction-level
    ``delete_files`` implementation.

    Args:
        file_paths: The list of full file paths to be removed from the table
        snapshot_properties: Custom properties to be added to the snapshot summary
        branch: Branch to delete files from

    Raises:
        ValueError: If file_paths contains duplicates
        ValueError: If any file paths are not found in the table
    """
    with self.transaction() as txn:
        txn.delete_files(file_paths=file_paths, snapshot_properties=snapshot_properties, branch=branch)
1567+
15091568
def update_spec(self, case_sensitive: bool = True) -> UpdateSpec:
15101569
return UpdateSpec(Transaction(self, autocommit=True), case_sensitive=case_sensitive)
15111570

@@ -2175,3 +2234,21 @@ def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: list
21752234
futures = [executor.submit(parquet_file_to_data_file, io, table_metadata, file_path) for file_path in file_paths]
21762235

21772236
return [f.result() for f in futures if f.result()]
2237+
2238+
2239+
def _get_data_files_from_snapshot(
    table_metadata: TableMetadata, file_paths: set[str], io: FileIO, branch: str | None = MAIN_BRANCH
) -> dict[str, DataFile]:
    """Resolve the requested file paths to their DataFile entries in a snapshot.

    Args:
        table_metadata: Metadata of the table whose snapshot is inspected.
        file_paths: Full file paths to look up in the snapshot's data manifests.
        io: FileIO used to read manifest files.
        branch: Branch whose head snapshot is searched; when None the current
            snapshot is used instead.

    Returns:
        Mapping of file path to DataFile for every requested path referenced by
        a live entry in a DATA manifest. Paths not referenced by the snapshot
        are absent from the result (the caller decides how to report them).
    """
    result: dict[str, DataFile] = {}
    # Nothing requested: avoid loading any manifests at all.
    if not file_paths:
        return result

    snapshot = table_metadata.snapshot_by_name(branch) if branch else table_metadata.current_snapshot()
    if snapshot is None:
        return result

    for manifest in snapshot.manifests(io):
        if manifest.content != ManifestContent.DATA:
            continue
        for entry in manifest.fetch_manifest_entry(io, discard_deleted=True):
            file_path = entry.data_file.file_path
            if file_path in file_paths:
                result[file_path] = entry.data_file
                # Stop as soon as every requested path is resolved, instead of
                # finishing the current manifest and scanning the remaining ones.
                if len(result) == len(file_paths):
                    return result
    return result

tests/integration/test_add_files.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,3 +1040,89 @@ def test_add_files_to_branch(spark: SparkSession, session_catalog: Catalog, form
10401040

10411041
for col in branch_df.columns:
10421042
assert branch_df.filter(branch_df[col].isNotNull()).count() == 6, "Expected all 6 rows to be non-null"
1043+
1044+
1045+
@pytest.mark.integration
def test_delete_files_from_unpartitioned_table(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
    """Deleting a subset of files removes exactly their rows and records the deletions in the manifests."""
    identifier = f"default.delete_files_unpartitioned_v{format_version}"
    tbl = _create_table(session_catalog, identifier, format_version)

    file_paths = [f"s3://warehouse/default/delete_unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
    for path in file_paths:
        _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)

    tbl.add_files(file_paths=file_paths)
    assert len(tbl.scan().to_arrow()) == 5

    # Drop the first two files; three single-row files should remain.
    tbl.delete_files(file_paths=file_paths[:2])

    manifest_rows = spark.sql(
        f"""
        SELECT added_data_files_count, existing_data_files_count, deleted_data_files_count
        FROM {identifier}.all_manifests
        """
    ).collect()
    assert sum(row.deleted_data_files_count for row in manifest_rows) == 2

    assert spark.table(identifier).count() == 3
    assert len(tbl.scan().to_arrow()) == 3
1073+
1074+
@pytest.mark.integration
def test_delete_files_raises_on_nonexistent_file(session_catalog: Catalog, format_version: int) -> None:
    """Requesting deletion of a path the table does not reference raises ValueError."""
    identifier = f"default.delete_files_nonexistent_v{format_version}"
    tbl = _create_table(session_catalog, identifier, format_version)

    known_paths = [f"s3://warehouse/default/delete_nonexistent/v{format_version}/test-{i}.parquet" for i in range(3)]
    for path in known_paths:
        _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)
    tbl.add_files(file_paths=known_paths)

    with pytest.raises(ValueError, match="Cannot delete files that are not referenced by table"):
        tbl.delete_files(file_paths=["s3://warehouse/default/does-not-exist.parquet"])
1087+
1088+
1089+
@pytest.mark.integration
def test_delete_files_raises_on_duplicate_paths(session_catalog: Catalog, format_version: int) -> None:
    """Passing the same path twice to delete_files raises ValueError before any delete happens."""
    identifier = f"default.delete_files_duplicate_v{format_version}"
    tbl = _create_table(session_catalog, identifier, format_version)

    path = f"s3://warehouse/default/delete_duplicate/v{format_version}/test.parquet"
    _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)
    tbl.add_files(file_paths=[path])

    with pytest.raises(ValueError, match="File paths must be unique"):
        tbl.delete_files(file_paths=[path, path])
1101+
1102+
1103+
@pytest.mark.integration
def test_delete_files_from_branch(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
    """A branch-scoped delete removes rows only from that branch, leaving main untouched."""
    identifier = f"default.delete_files_branch_v{format_version}"
    branch = "branch1"

    tbl = _create_table(session_catalog, identifier, format_version)

    file_paths = [f"s3://warehouse/default/delete_branch/v{format_version}/test-{i}.parquet" for i in range(5)]
    for path in file_paths:
        _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)

    # Seed main with one row so the branch is created from a real snapshot.
    tbl.append(ARROW_TABLE)
    assert tbl.metadata.current_snapshot_id is not None
    tbl.manage_snapshots().create_branch(snapshot_id=tbl.metadata.current_snapshot_id, branch_name=branch).commit()

    tbl.add_files(file_paths=file_paths, branch=branch)
    assert spark.table(f"{identifier}.branch_{branch}").count() == 6

    tbl.delete_files(file_paths=file_paths[:3], branch=branch)
    assert spark.table(f"{identifier}.branch_{branch}").count() == 3

    # Main still holds only the single seeded row.
    assert spark.table(identifier).count() == 1

0 commit comments

Comments
 (0)