Skip to content

Commit c429792

Browse files
committed
NPI-4453 restructure into single class for consistency
1 parent 5d94a26 commit c429792

1 file changed

Lines changed: 62 additions & 62 deletions

File tree

gnssanalysis/gn_utils.py

Lines changed: 62 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -1298,79 +1298,79 @@ def verify( # Was create_and_verify_pickled_df_list()
12981298
pickled_dfs = pickled_list_hash_file.read()
12991299

13001300
# And print out diffs for them...
1301-
diff_pickled_dfs(pickled_dfs, dataframes)
1301+
DataFrameHashUtils.diff_pickled_dfs(pickled_dfs, dataframes)
13021302

13031303
# Raise to ensure the test fails and this change / regression gets investigated
13041304
raise ValueError("Dataframes did not match baseline. Please investigate using above diffs")
13051305
else:
13061306
_logging.debug(f"Hashes matched for '{pickled_list_path}': {pickled_list_sha256}")
13071307
return True
13081308

1309+
@staticmethod
1310+
def diff_pickled_dfs(pickled_df_list: bytes, current_dfs_list: list[DataFrame]) -> None:
13091311

1310-
def diff_pickled_dfs(pickled_df_list: bytes, current_dfs_list: list[DataFrame]) -> None:
1311-
1312-
# CAUTION: deserialising can present arbitrary code execution potential. Ensure the data passed in is trustworthy.
1313-
if DataFrameHashUtils.enable_unpickling != True:
1314-
raise ValueError(
1315-
"Cannot load baselined DataFrames from pickle for analysis as unpickling is off (default for security). "
1316-
"Temporarily set DataFrameHashUtils.enable_unpickling = True to allow deserialisation of old DFs from disk."
1317-
)
1318-
old_df_list: list[DataFrame] = pickle.loads(pickled_df_list)
1312+
# CAUTION: deserialising can present arbitrary code execution potential. Ensure the data passed in is trustworthy.
1313+
if DataFrameHashUtils.enable_unpickling != True:
1314+
raise ValueError(
1315+
"Cannot load baselined DataFrames from pickle for analysis as unpickling is off (default for security). "
1316+
"Temporarily set DataFrameHashUtils.enable_unpickling = True to allow deserialisation of old DFs from disk."
1317+
)
1318+
old_df_list: list[DataFrame] = pickle.loads(pickled_df_list)
13191319

1320-
old_length = len(old_df_list)
1321-
current_length = len(current_dfs_list)
1322-
if old_length != current_length:
1323-
raise ValueError(
1324-
f"Unpickled DataFrame list had {old_length} elements, " f"whereas the current one has {current_length}"
1325-
)
1326-
for i in range(current_length):
1327-
old_df = old_df_list[i]
1328-
current_df = current_dfs_list[i]
1320+
old_length = len(old_df_list)
1321+
current_length = len(current_dfs_list)
1322+
if old_length != current_length:
1323+
raise ValueError(
1324+
f"Unpickled DataFrame list had {old_length} elements, " f"whereas the current one has {current_length}"
1325+
)
1326+
for i in range(current_length):
1327+
old_df = old_df_list[i]
1328+
current_df = current_dfs_list[i]
13291329

1330-
_logging.info(f"Diffing DataFrame #{i}...")
1330+
_logging.info(f"Diffing DataFrame #{i}...")
13311331

1332-
# DF.equals() may be useful, but does not check that the row/column index datatypes are the same
1333-
_logging.info(f"DataFrame.equals(): {current_df.equals(old_df)}")
1332+
# DF.equals() may be useful, but does not check that the row/column index datatypes are the same
1333+
_logging.info(f"DataFrame.equals(): {current_df.equals(old_df)}")
13341334

1335-
try:
1336-
_logging.info(f"current_dataframe.compare(old_dataframe): {current_df.compare(old_df)}")
1337-
except ValueError:
1338-
_logging.info(
1339-
f"current_dataframe.compare(old_dataframe): FAILED! Indexes / columns likely differ. Running diff of those..."
1340-
)
1341-
diff_indexes_and_columns(old_df, current_df)
1342-
1343-
1344-
def diff_indexes_and_columns(existing_df: DataFrame, current_df: DataFrame) -> None:
1345-
# Utility function to output diffs of DataFrame indexes and columns, as DataFrame.compare() will not run if
1346-
# they differ.
1347-
1348-
# Handle diffing of indexes
1349-
existing_df_index = existing_df.index.to_list()
1350-
current_df_index = current_df.index.to_list()
1351-
index_diff = set(existing_df_index).symmetric_difference(current_df_index)
1352-
if existing_df_index != current_df_index:
1353-
if len(index_diff) == 0: # Diff must've been in order, not values
1354-
_logging.info("Indexes differed in order, but not values. Outputting full indexes:")
1355-
_logging.info(f"Existing DF indexes: {str(existing_df.index.to_list())}")
1356-
_logging.info(f"Current DF indexes: {str(current_df.index.to_list())}")
1357-
else:
1358-
_logging.info(f"The following index values are in one DF but not the other: {str(index_diff)}")
1359-
1360-
# Handle diffing of columns
1361-
existing_df_colums = existing_df.columns.to_list()
1362-
current_df_columns = current_df.columns.to_list()
1363-
1364-
column_diff = set(existing_df_colums).symmetric_difference(current_df_columns)
1365-
if existing_df_colums != current_df_columns:
1366-
if len(column_diff) == 0: # Diff must've been in order, not values
1367-
_logging.info("Columns differed in order, but not values. Outputting full column listing:")
1368-
_logging.info(f"Existing DF columns: {str(existing_df.columns.to_list())}")
1369-
_logging.info(f"Current DF columns: {str(current_df.columns.to_list())}")
1370-
else:
1371-
_logging.info(f"The following column names are in one DF but not the other: {str(column_diff)}")
1335+
try:
1336+
_logging.info(f"current_dataframe.compare(old_dataframe): {current_df.compare(old_df)}")
1337+
except ValueError:
1338+
_logging.info(
1339+
f"current_dataframe.compare(old_dataframe): FAILED! Indexes / columns likely differ. Running diff of those..."
1340+
)
1341+
DataFrameHashUtils.diff_indexes_and_columns(old_df, current_df)
13721342

1343+
@staticmethod
1344+
def diff_indexes_and_columns(existing_df: DataFrame, current_df: DataFrame) -> None:
1345+
# Utility function to output diffs of DataFrame indexes and columns, as DataFrame.compare() will not run if
1346+
# they differ.
1347+
1348+
# Handle diffing of indexes
1349+
existing_df_index = existing_df.index.to_list()
1350+
current_df_index = current_df.index.to_list()
1351+
index_diff = set(existing_df_index).symmetric_difference(current_df_index)
1352+
if existing_df_index != current_df_index:
1353+
if len(index_diff) == 0: # Diff must've been in order, not values
1354+
_logging.info("Indexes differed in order, but not values. Outputting full indexes:")
1355+
_logging.info(f"Existing DF indexes: {str(existing_df.index.to_list())}")
1356+
_logging.info(f"Current DF indexes: {str(current_df.index.to_list())}")
1357+
else:
1358+
_logging.info(f"The following index values are in one DF but not the other: {str(index_diff)}")
1359+
1360+
# Handle diffing of columns
1361+
existing_df_colums = existing_df.columns.to_list()
1362+
current_df_columns = current_df.columns.to_list()
1363+
1364+
column_diff = set(existing_df_colums).symmetric_difference(current_df_columns)
1365+
if existing_df_colums != current_df_columns:
1366+
if len(column_diff) == 0: # Diff must've been in order, not values
1367+
_logging.info("Columns differed in order, but not values. Outputting full column listing:")
1368+
_logging.info(f"Existing DF columns: {str(existing_df.columns.to_list())}")
1369+
_logging.info(f"Current DF columns: {str(current_df.columns.to_list())}")
1370+
else:
1371+
_logging.info(f"The following column names are in one DF but not the other: {str(column_diff)}")
13731372

1374-
# NOTE: for aggregate tests, the revised multi-dataframe functions in PickleHashUtils are suggested
1375-
def pickle_and_sha256(obj: object) -> str:
1376-
return hashlib.sha256(pickle.dumps(obj)).hexdigest()
1373+
# NOTE: for aggregate tests, the revised multi-dataframe functions above are suggested
1374+
@staticmethod
1375+
def pickle_and_sha256(obj: object) -> str:
1376+
return hashlib.sha256(pickle.dumps(obj)).hexdigest()

0 commit comments

Comments
 (0)