Skip to content

Commit e191f74

Browse files
committed
chore: add diagnostic logging to compare_test_results
Temporary instrumentation to debug the flaky futurehouse E2E test. Logs the matched, skipped, and timed-out counts, along with the final did_all_timeout state.
1 parent fefccd5 commit e191f74

1 file changed

Lines changed: 27 additions & 0 deletions

File tree

codeflash/verification/equivalence.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,23 +41,33 @@ def compare_test_results(
4141
)
4242
test_diffs: list[TestDiff] = []
4343
did_all_timeout: bool = True
44+
_matched_count = 0
45+
_skipped_cdd_only = 0
46+
_skipped_init_state = 0
47+
_skipped_none = 0
48+
_timed_out_count = 0
4449
for test_id in test_ids_superset:
4550
original_test_result = original_results.get_by_unique_invocation_loop_id(test_id)
4651
cdd_test_result = candidate_results.get_by_unique_invocation_loop_id(test_id)
4752

4853
if cdd_test_result is not None and original_test_result is None:
54+
_skipped_cdd_only += 1
4955
continue
5056
# If helper function instance_state verification is not present, that's ok. continue
5157
if (
5258
original_test_result.verification_type
5359
and original_test_result.verification_type == VerificationType.INIT_STATE_HELPER
5460
and cdd_test_result is None
5561
):
62+
_skipped_init_state += 1
5663
continue
5764
if original_test_result is None or cdd_test_result is None:
65+
_skipped_none += 1
5866
continue
67+
_matched_count += 1
5968
did_all_timeout = did_all_timeout and original_test_result.timed_out
6069
if original_test_result.timed_out:
70+
_timed_out_count += 1
6171
continue
6272
superset_obj = False
6373
if original_test_result.verification_type and (
@@ -148,6 +158,23 @@ def compare_test_results(
148158
)
149159

150160
sys.setrecursionlimit(original_recursion_limit)
161+
logger.info(
162+
f"[compare_test_results] superset={len(test_ids_superset)} matched={_matched_count} "
163+
f"skipped(cdd_only={_skipped_cdd_only} init_state={_skipped_init_state} none={_skipped_none}) "
164+
f"timed_out={_timed_out_count} did_all_timeout={did_all_timeout} diffs={len(test_diffs)} "
165+
f"pass_fail_only={pass_fail_only} orig_len={len(original_results)} cand_len={len(candidate_results)}"
166+
)
167+
if did_all_timeout and _matched_count > 0 and _matched_count <= 3:
168+
# Log a few sample matched IDs for debugging
169+
_sample_ids = []
170+
for test_id in test_ids_superset:
171+
orig = original_results.get_by_unique_invocation_loop_id(test_id)
172+
cand = candidate_results.get_by_unique_invocation_loop_id(test_id)
173+
if orig is not None and cand is not None:
174+
_sample_ids.append(f" id={test_id} orig_timed_out={orig.timed_out} orig_pass={orig.did_pass}")
175+
if len(_sample_ids) >= 3:
176+
break
177+
logger.info(f"[compare_test_results] sample matched: {_sample_ids}")
151178
if did_all_timeout:
152179
return False, test_diffs
153180
return len(test_diffs) == 0, test_diffs

0 commit comments

Comments
 (0)