@@ -1040,3 +1040,89 @@ def test_add_files_to_branch(spark: SparkSession, session_catalog: Catalog, form
10401040
10411041 for col in branch_df .columns :
10421042 assert branch_df .filter (branch_df [col ].isNotNull ()).count () == 6 , "Expected all 6 rows to be non-null"
1043+
1044+
@pytest.mark.integration
def test_delete_files_from_unpartitioned_table(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
    """Deleting a subset of previously added files removes exactly their rows."""
    identifier = f"default.delete_files_unpartitioned_v{format_version}"
    tbl = _create_table(session_catalog, identifier, format_version)

    # Stage five single-row parquet files and register them with the table.
    file_paths = [f"s3://warehouse/default/delete_unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)]
    for path in file_paths:
        _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)
    tbl.add_files(file_paths=file_paths)
    assert len(tbl.scan().to_arrow()) == 5

    # Remove the first two files from the table.
    tbl.delete_files(file_paths=file_paths[:2])

    # The manifest history must account for exactly the two deleted data files.
    manifest_rows = spark.sql(
        f"""
        SELECT added_data_files_count, existing_data_files_count, deleted_data_files_count
        FROM {identifier}.all_manifests
        """
    ).collect()
    assert sum(row.deleted_data_files_count for row in manifest_rows) == 2

    # Both the Spark view and a PyIceberg scan should agree on the remaining rows.
    assert spark.table(identifier).count() == 3
    assert len(tbl.scan().to_arrow()) == 3
1072+
1073+
@pytest.mark.integration
def test_delete_files_raises_on_nonexistent_file(session_catalog: Catalog, format_version: int) -> None:
    """delete_files must reject a path the table has never tracked."""
    identifier = f"default.delete_files_nonexistent_v{format_version}"
    tbl = _create_table(session_catalog, identifier, format_version)

    # Populate the table with three known files so the lookup has real content.
    file_paths = [f"s3://warehouse/default/delete_nonexistent/v{format_version}/test-{i}.parquet" for i in range(3)]
    for path in file_paths:
        _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)
    tbl.add_files(file_paths=file_paths)

    # A path outside the table's data files should fail fast with ValueError.
    with pytest.raises(ValueError, match="Cannot delete files that are not referenced by table"):
        tbl.delete_files(file_paths=["s3://warehouse/default/does-not-exist.parquet"])
1087+
1088+
@pytest.mark.integration
def test_delete_files_raises_on_duplicate_paths(session_catalog: Catalog, format_version: int) -> None:
    """delete_files must reject a request that lists the same path twice."""
    identifier = f"default.delete_files_duplicate_v{format_version}"
    tbl = _create_table(session_catalog, identifier, format_version)

    # One real, tracked file is enough to exercise the duplicate check.
    file_path = f"s3://warehouse/default/delete_duplicate/v{format_version}/test.parquet"
    _write_parquet(tbl.io, file_path, ARROW_SCHEMA, ARROW_TABLE)
    tbl.add_files(file_paths=[file_path])

    # Passing the same path twice must be rejected before any deletion happens.
    with pytest.raises(ValueError, match="File paths must be unique"):
        tbl.delete_files(file_paths=[file_path, file_path])
1101+
1102+
@pytest.mark.integration
def test_delete_files_from_branch(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
    """Deletions targeted at a branch must leave the main branch untouched."""
    identifier = f"default.delete_files_branch_v{format_version}"
    branch = "branch1"
    tbl = _create_table(session_catalog, identifier, format_version)

    file_paths = [f"s3://warehouse/default/delete_branch/v{format_version}/test-{i}.parquet" for i in range(5)]
    for path in file_paths:
        _write_parquet(tbl.io, path, ARROW_SCHEMA, ARROW_TABLE)

    # Seed main with one appended row, then fork a branch from that snapshot.
    tbl.append(ARROW_TABLE)
    snapshot_id = tbl.metadata.current_snapshot_id
    assert snapshot_id is not None
    tbl.manage_snapshots().create_branch(snapshot_id=snapshot_id, branch_name=branch).commit()

    # Add the five staged files only on the branch: 1 appended + 5 added = 6 rows.
    tbl.add_files(file_paths=file_paths, branch=branch)
    assert spark.table(f"{identifier}.branch_{branch}").count() == 6

    # Delete three of them on the branch.
    tbl.delete_files(file_paths=file_paths[:3], branch=branch)

    # Branch reflects the deletion; main still holds only the original row.
    assert spark.table(f"{identifier}.branch_{branch}").count() == 3
    assert spark.table(identifier).count() == 1
0 commit comments