Skip to content

Commit 89ae8b9

Browse files
committed
NPI-4453 introduce DataFrame hashing and test baselining functionality
1 parent 4c0deba commit 89ae8b9

8 files changed

Lines changed: 503 additions & 2 deletions

gnssanalysis/gn_utils.py

Lines changed: 371 additions & 1 deletion
Large diffs are not rendered by default.
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
6b5020201b08f64a2e7412422e03f94a6e7b0479f3a69a792967cec80b17a08b
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1b369c0e1d2ee74b36b233cb0655cc5e0158b334ec0757f546d5e45e6d05d58e
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1b369c0e1d2ee74b36b233cb0655cc5e0158b334ec0757f546d5e45e6d05d58e

tests/test_utils.py

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import logging
2+
import os
3+
import unittest
4+
from pandas import DataFrame
25
from pyfakefs.fake_filesystem_unittest import TestCase
36
from pathlib import Path
47

5-
from gnssanalysis.gn_utils import delete_entire_directory
8+
from gnssanalysis.gn_utils import DataFrameHashUtils, delete_entire_directory
69
import gnssanalysis.gn_utils as ga_utils
710

811

@@ -64,3 +67,128 @@ def test_configure_logging(self):
6467

6568
# Verify
6669
self.assertEqual(logger_not_output, None)
70+
71+
72+
class TestDataFrameHashUtils(unittest.TestCase):
73+
74+
def test_verify_refusal_in_wrong_mode(self):
75+
mode_backup = DataFrameHashUtils.mode
76+
try:
77+
df = DataFrame(["a", "b", "c"])
78+
79+
# Baseline (do not commit uncommented!) Note: every function needs its own baseline, because the
80+
# function name determines the filename, unless we override that.
81+
# DataFrameHashUtils.mode = "baseline"
82+
# DataFrameHashUtils.record_baseline([df])
83+
84+
# In baseline (write) mode, verify should be refused.
85+
DataFrameHashUtils.mode = "baseline"
86+
87+
with self.assertWarns(Warning) as warning_assessor:
88+
self.assertFalse(
89+
DataFrameHashUtils.verify([df]),
90+
"DF list verification should not succeed in 'baseline' mode",
91+
)
92+
# Ensure the expected warning, and only that warning, was raised
93+
captured_warnings = warning_assessor.warnings
94+
self.assertEqual(
95+
"Refusing to run verify method while not in verify mode. Set DataframeHashUtils.mode = 'verify' first",
96+
str(captured_warnings[0].message),
97+
)
98+
self.assertEqual(
99+
len(captured_warnings),
100+
1,
101+
"Expected exactly 1 warning. Check what other warnings are being raised!",
102+
)
103+
104+
# Should succeed in correct mode.
105+
DataFrameHashUtils.mode = "verify"
106+
self.assertTrue(
107+
DataFrameHashUtils.verify([df]),
108+
"DF list verification should succeed in 'verify' mode",
109+
)
110+
finally:
111+
# Ensure flag reset to avoid impacts on other tests (across the whole suite)
112+
DataFrameHashUtils.mode = mode_backup
113+
114+
def test_repeat_caller_rejection(self):
115+
# These functions determine what files to write/read baselines from, based on the identity of the (test)
116+
# function that called them. Therefore, calling twice from the same function would cause the *same baseline
117+
# files* to be read/written for a different part of the unit test.
118+
# That would have the effect of:
119+
# - in write mode: overwriting the baseline file for a previous part of the test function.
120+
# - in read mode: repeating verification of the same file against a different DF list (which would likely fail).
121+
122+
# We're only testing it with the verify function below, but both verify and baseline functions use the same
123+
# caller check logic, and store the caller record statically in a class variable.
124+
125+
df = DataFrame(["a", "b", "c"])
126+
127+
# Baseline (every function needs its own baseline, because the function name determines the filename,
128+
# unless we override that)
129+
# DataFrameHashUtils.mode = "baseline"
130+
# DataFrameHashUtils.record_baseline([df])
131+
132+
self.assertTrue(
133+
DataFrameHashUtils.verify([df]),
134+
"DF list verification should succeed on *first* call from a function.",
135+
)
136+
with self.assertRaises(ValueError):
137+
DataFrameHashUtils.verify([df])
138+
self.fail("DF list verification should fail on *second*/repeated calls from a function.")
139+
140+
def test_duplicate_df_rejection(self):
141+
142+
# List to aggregate DFs for hashing
143+
dfs_to_hash: list[DataFrame] = []
144+
145+
df = DataFrame(["a", "b", "c"]) # Let's call this Dataframe 'a'
146+
dfs_to_hash.extend([df])
147+
148+
# Overwrite local variable, as often happens in our unit tests
149+
df = DataFrame(["b", "c", "d"]) # Let's call this Dataframe 'b'
150+
151+
# This might look questionable, but is ok, because we saved a reference to dataframe 'a' to the list,
152+
# before overwriting local var 'df' to point at dataframe 'b'.
153+
dfs_to_hash.extend([df])
154+
155+
# Baseline this test (this should only be committed commented out!)
156+
# DataFrameHashUtils.mode = "baseline"
157+
# DataFrameHashUtils.record_baseline(dfs_to_hash)
158+
159+
# Will return True if verification succeeded. False if baseline missing or mode != verify
160+
self.assertTrue(
161+
DataFrameHashUtils.verify(dfs_to_hash),
162+
"DF list verification should succeed here (unless baseline files are missing, or baselining has been turned on)",
163+
)
164+
165+
# The local variable df still points to the same DF, so now the list contains [a,b,b]. This should be an error.
166+
dfs_to_hash.extend([df])
167+
with self.assertRaises(ValueError):
168+
DataFrameHashUtils.verify(dfs_to_hash)
169+
170+
def test_caller_identity_fetch(self):
171+
def wrapper_function():
172+
class_name, func_name = DataFrameHashUtils.get_grandparent_caller_id()
173+
self.assertEqual(class_name, "TestDataFrameHashUtils")
174+
self.assertEqual(func_name, "test_caller_identity_fetch")
175+
176+
# We have to do this (create an extra stack frame) because the function looks for
177+
# the *grandparent* caller, not parent caller.
178+
wrapper_function()
179+
180+
181+
# For use with debugger
182+
# if __name__ == "__main__":
183+
184+
# logging.basicConfig(format="%(levelname)s: %(message)s")
185+
# logger = logging.getLogger()
186+
# logger.setLevel(logging.DEBUG)
187+
188+
# os.chdir("./tests")
189+
190+
# df_hash_tests = TestDataFrameHashUtils()
191+
# df_hash_tests.test_duplicate_df_rejection()
192+
# df_hash_tests.test_verify_refusal_in_wrong_mode()
193+
# df_hash_tests.test_repeat_caller_rejection()
194+
# df_hash_tests.test_caller_identity_fetch()

0 commit comments

Comments
 (0)