@@ -78,6 +78,7 @@ def _inspect_files_asserts(df: pa.Table, spark_df: DataFrame) -> None:
7878 "content" ,
7979 "file_path" ,
8080 "file_format" ,
81+ "partition" ,
8182 "spec_id" ,
8283 "record_count" ,
8384 "file_size_in_bytes" ,
@@ -141,6 +142,9 @@ def _inspect_files_asserts(df: pa.Table, spark_df: DataFrame) -> None:
141142 assert_frame_equal (lhs_subset , rhs_subset , check_dtype = False , check_categorical = False )
142143
143144 for column in df .column_names :
145+ if column == "partition" :
146+ # Spark leaves out the partition if the table is unpartitioned
147+ continue
144148 for left , right in zip (lhs [column ].to_list (), rhs [column ].to_list ()):
145149 if isinstance (left , float ) and math .isnan (left ) and isinstance (right , float ) and math .isnan (right ):
146150 # NaN != NaN in Python
@@ -833,6 +837,7 @@ def inspect_files_asserts(df: pa.Table) -> None:
833837 "content" ,
834838 "file_path" ,
835839 "file_format" ,
840+ "partition" ,
836841 "spec_id" ,
837842 "record_count" ,
838843 "file_size_in_bytes" ,
@@ -1010,7 +1015,6 @@ def test_inspect_files_format_version_3(spark: SparkSession, session_catalog: Ca
10101015 spark .sql (insert_data_sql )
10111016 spark .sql (f"UPDATE { identifier } SET int = 2 WHERE int = 1" )
10121017 spark .sql (f"DELETE FROM { identifier } WHERE int = 9" )
1013- spark .table (identifier ).show (20 , False )
10141018
10151019 tbl .refresh ()
10161020
@@ -1029,3 +1033,71 @@ def test_inspect_files_format_version_3(spark: SparkSession, session_catalog: Ca
10291033 _inspect_files_asserts (all_files_df , spark .table (f"{ identifier } .all_files" ))
10301034 _inspect_files_asserts (all_data_files_df , spark .table (f"{ identifier } .all_data_files" ))
10311035 _inspect_files_asserts (all_delete_files_df , spark .table (f"{ identifier } .all_delete_files" ))
1036+
1037+
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2, 3])
def test_inspect_files_partitioned(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
    """Check that ``tbl.inspect.files()`` reports the ``partition`` column correctly.

    Builds a partitioned table, evolves the partition spec (months -> days) so that
    data files exist under two different specs, then compares PyIceberg's files
    metadata table against Spark's ``<table>.files`` for the ``file_path`` and
    ``partition`` columns.

    Args:
        spark: Spark session connected to the same catalog (reference implementation).
        session_catalog: PyIceberg catalog under test.
        format_version: Iceberg table format version (1, 2, or 3).
    """
    from pandas.testing import assert_frame_equal

    identifier = "default.table_metadata_files_partitioned"
    try:
        session_catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        pass

    spark.sql(
        f"""
        CREATE TABLE {identifier} (
            dt date,
            int_data int
        )
        PARTITIONED BY (months(dt))
        TBLPROPERTIES ('format-version'='{format_version}')
        """
    )

    # Merge-on-read produces delete files, so the files table also gets
    # delete-file entries; only supported from format version 2 onwards.
    if format_version > 1:
        spark.sql(
            f"""
            ALTER TABLE {identifier} SET TBLPROPERTIES(
                'write.update.mode' = 'merge-on-read',
                'write.delete.mode' = 'merge-on-read',
                'write.merge.mode' = 'merge-on-read')
            """
        )

    spark.sql(
        f"""
        INSERT INTO {identifier} VALUES (CAST('2025-01-01' AS date), 1), (CAST('2025-01-01' AS date), 2)
        """
    )

    # Evolve the partition spec so files written before and after live under
    # different specs — the partition struct must still line up with Spark's.
    spark.sql(
        f"""
        ALTER TABLE {identifier}
        REPLACE PARTITION FIELD dt_month WITH days(dt)
        """
    )

    spark.sql(
        f"""
        INSERT INTO {identifier} VALUES (CAST('2025-01-02' AS date), 2)
        """
    )

    spark.sql(
        f"""
        DELETE FROM {identifier} WHERE int_data = 1
        """
    )

    tbl = session_catalog.load_table(identifier)
    files_df = tbl.inspect.files()
    # sort_values(..., ignore_index=True) already re-labels the index 0..n-1 on
    # both sides, so no extra reset_index() is needed (it would only inject a
    # redundant "index" column into the comparison).
    lhs = files_df.to_pandas()[["file_path", "partition"]].sort_values("file_path", ignore_index=True)
    rhs = (
        spark.table(f"{identifier}.files")
        .select(["file_path", "partition"])
        .toPandas()
        .sort_values("file_path", ignore_index=True)
    )
    assert_frame_equal(lhs, rhs, check_dtype=False)
0 commit comments