@@ -1298,79 +1298,79 @@ def verify( # Was create_and_verify_pickled_df_list()
12981298 pickled_dfs = pickled_list_hash_file .read ()
12991299
13001300 # And print out diffs for them...
1301- diff_pickled_dfs (pickled_dfs , dataframes )
1301+ DataFrameHashUtils . diff_pickled_dfs (pickled_dfs , dataframes )
13021302
13031303 # Raise to ensure the test fails and this change / regression gets investigated
13041304 raise ValueError ("Dataframes did not match baseline. Please investigate using above diffs" )
13051305 else :
13061306 _logging .debug (f"Hashes matched for '{ pickled_list_path } ': { pickled_list_sha256 } " )
13071307 return True
13081308
@staticmethod
def diff_pickled_dfs(pickled_df_list: bytes, current_dfs_list: list[DataFrame]) -> None:
    """Log a DataFrame-by-DataFrame diff of a pickled baseline against the current list.

    The baseline bytes are unpickled, the two lists are checked to be the same
    length, and each pair is reported through ``_logging.info``: first the
    result of ``DataFrame.equals()``, then the output of ``DataFrame.compare()``.
    When ``compare()`` raises ``ValueError`` the pair is handed to
    ``DataFrameHashUtils.diff_indexes_and_columns`` instead.

    Args:
        pickled_df_list: Pickled bytes that deserialise to a list of DataFrames.
        current_dfs_list: The current DataFrames, in the same order as the baseline.

    Raises:
        ValueError: If ``DataFrameHashUtils.enable_unpickling`` does not equal
            ``True``, or if the baseline and current lists differ in length.
    """
    # CAUTION: deserialising can present arbitrary code execution potential. Ensure the data passed in is trustworthy.
    # The comparison against True is intentionally strict: any value that does not compare equal to True
    # (including truthy strings) leaves unpickling disabled.
    if DataFrameHashUtils.enable_unpickling != True:  # noqa: E712
        raise ValueError(
            "Cannot load baselined DataFrames from pickle for analysis as unpickling is off (default for security). "
            "Temporarily set DataFrameHashUtils.enable_unpickling = True to allow deserialisation of old DFs from disk."
        )
    old_df_list: list[DataFrame] = pickle.loads(pickled_df_list)

    old_length = len(old_df_list)
    current_length = len(current_dfs_list)
    if old_length != current_length:
        raise ValueError(
            f"Unpickled DataFrame list had {old_length} elements, "
            f"whereas the current one has {current_length}"
        )

    # Lengths are known to match here, so zip() pairs every element.
    for i, (old_df, current_df) in enumerate(zip(old_df_list, current_dfs_list)):
        _logging.info(f"Diffing DataFrame #{i}...")

        # DF.equals() may be useful, but does not check that the row/column index datatypes are the same
        _logging.info(f"DataFrame.equals(): {current_df.equals(old_df)}")

        # Only compare() is guarded, so a ValueError from anywhere else is not
        # mistaken for a label mismatch.
        try:
            comparison = current_df.compare(old_df)
        except ValueError:
            _logging.info(
                "current_dataframe.compare(old_dataframe): FAILED! "
                "Indexes / columns likely differ. Running diff of those..."
            )
            DataFrameHashUtils.diff_indexes_and_columns(old_df, current_df)
        else:
            _logging.info(f"current_dataframe.compare(old_dataframe): {comparison}")
13721342
@staticmethod
def diff_indexes_and_columns(existing_df: DataFrame, current_df: DataFrame) -> None:
    """Log how the row indexes and column labels of two DataFrames differ.

    Utility to output diffs of DataFrame indexes and columns, as
    ``DataFrame.compare()`` will not run if they differ. Nothing is logged
    for an axis whose label lists are equal.

    Args:
        existing_df: The baseline DataFrame.
        current_df: The DataFrame being checked against the baseline.
    """
    # Handle diffing of indexes
    existing_index = existing_df.index.to_list()
    current_index = current_df.index.to_list()
    if existing_index != current_index:
        index_diff = set(existing_index).symmetric_difference(current_index)
        if not index_diff:
            # Same set of labels on both sides, so the lists differ in order
            # (or in how often a label repeats) rather than in values.
            _logging.info("Indexes differed in order, but not values. Outputting full indexes:")
            _logging.info(f"Existing DF indexes: {existing_index}")
            _logging.info(f"Current DF indexes: {current_index}")
        else:
            _logging.info(f"The following index values are in one DF but not the other: {index_diff}")

    # Handle diffing of columns
    existing_columns = existing_df.columns.to_list()
    current_columns = current_df.columns.to_list()
    if existing_columns != current_columns:
        column_diff = set(existing_columns).symmetric_difference(current_columns)
        if not column_diff:
            # Same set of labels on both sides; see the index branch above.
            _logging.info("Columns differed in order, but not values. Outputting full column listing:")
            _logging.info(f"Existing DF columns: {existing_columns}")
            _logging.info(f"Current DF columns: {current_columns}")
        else:
            _logging.info(f"The following column names are in one DF but not the other: {column_diff}")
13731372
1374- # NOTE: for aggregate tests, the revised multi-dataframe functions in PickleHashUtils are suggested
1375- def pickle_and_sha256 (obj : object ) -> str :
1376- return hashlib .sha256 (pickle .dumps (obj )).hexdigest ()
# NOTE: for aggregate tests, the revised multi-dataframe functions above are suggested
@staticmethod
def pickle_and_sha256(obj: object, *, protocol: "int | None" = None) -> str:
    """Return the hex SHA-256 digest of ``obj``'s pickled bytes.

    Args:
        obj: Any picklable object.
        protocol: Pickle protocol passed to ``pickle.dumps``. ``None`` (the
            default) uses the interpreter's default protocol. Pass an explicit
            number when digests need to be comparable between interpreters
            whose default protocols differ.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    return hashlib.sha256(pickle.dumps(obj, protocol=protocol)).hexdigest()
0 commit comments