Skip to content

Commit 42cf35c

Browse files
jucor and claude committed
Add repness blob comparison tests and fix tid type mismatches
Legacy repness comparison: replace visibility-only test with asserting tests (xfail for group coverage and set matching). Legacy clojure regression: add test_repness_matches_clojure comparing selected comment sets and z-values against the math blob (xfail). Fix int/str tid mismatch that caused all shared-comment lookups to find zero matches, making blob comparison tests pass vacuously. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1e96cb6 commit 42cf35c

2 files changed

Lines changed: 114 additions & 47 deletions

File tree

delphi/tests/test_legacy_clojure_regression.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,3 +401,74 @@ def test_comment_priorities(self, conversation_data):
401401

402402
check.equal(len(mismatches), 0,
403403
f"All comment priorities should match Clojure (got {len(mismatches)} mismatches out of {len(clojure_priorities)})")
404+
405+
@pytest.mark.xfail(raises=AssertionError, strict=True, reason="D5/D6/D7/D10: z-values, metric, and selection logic differ")
def test_repness_matches_clojure(self, conversation_data):
    """
    Test that representative comment selection matches Clojure.

    Compares selected comment sets and z-score values per group.
    Requires D5 (prop test), D6 (two-prop test), D7 (metric),
    and D10 (selection logic) to fully pass.

    Comment ids from both sides are normalized to ``int`` before any
    comparison or lookup, so a blob that stores ``tid`` as a string cannot
    make the shared-comment comparison pass vacuously.

    Args:
        conversation_data: mapping providing ``'conv'`` (object with a
            ``repness`` attribute), ``'clojure_output'`` (mapping with an
            optional ``'repness'`` entry) and ``'dataset_name'``.
    """
    conv = conversation_data['conv']
    clojure_output = conversation_data['clojure_output']
    dataset_name = conversation_data['dataset_name']

    print(f"\n[{dataset_name}] Testing repness matches Clojure...")

    clj_repness = clojure_output.get('repness', {})
    check.is_true(bool(clj_repness), "Clojure output should have repness")

    py_repness = (conv.repness or {}).get('group_repness', {})
    check.is_true(bool(py_repness), "Python should have group_repness")

    # NOTE(review): the check.* calls appear to be non-fatal (pytest-check
    # style), hence the explicit bail-out when there is nothing to compare.
    if not clj_repness or not py_repness:
        return

    set_mismatches = []
    value_mismatches = []

    for gid_str, clj_entries in clj_repness.items():
        gid = int(gid_str)
        py_entries = py_repness.get(gid, [])

        # Normalize ids on both sides to int: comparing a str tid against an
        # int comment_id would never match.
        clj_tids = {int(e['tid']) for e in clj_entries}
        py_by_tid = {int(e['comment_id']): e for e in py_entries}
        py_tids = set(py_by_tid)

        # Compare selected comment sets
        if clj_tids != py_tids:
            set_mismatches.append(
                f"  g{gid}: clj={sorted(clj_tids)}, py={sorted(py_tids)}")

        # Compare z-values for shared comments
        for clj_entry in clj_entries:
            tid = int(clj_entry['tid'])
            py_entry = py_by_tid.get(tid)
            if py_entry is None:
                # Selected by Clojure only; already reported as a set mismatch.
                continue

            clj_pat = clj_entry.get('p-test', 0)
            py_pat = py_entry.get('pat', 0)
            clj_rat = clj_entry.get('repness-test', 0)
            py_rat = py_entry.get('rat', 0)

            if abs(clj_pat - py_pat) > 0.01 or abs(clj_rat - py_rat) > 0.01:
                value_mismatches.append(
                    f"  g{gid}/t{tid}: pat clj={clj_pat:.4f} py={py_pat:.4f}, "
                    f"rat clj={clj_rat:.4f} py={py_rat:.4f}")

    # Print at most the first ten of each kind to keep the output readable.
    if set_mismatches:
        print(f"  Set mismatches ({len(set_mismatches)} groups):")
        for m in set_mismatches[:10]:
            print(m)
    if value_mismatches:
        print(f"  Value mismatches ({len(value_mismatches)} comments):")
        for m in value_mismatches[:10]:
            print(m)

    check.equal(len(set_mismatches), 0,
                f"{len(set_mismatches)} groups differ in selected rep comments")
    check.equal(len(value_mismatches), 0,
                f"{len(value_mismatches)} shared comments have z-value mismatches")

delphi/tests/test_legacy_repness_comparison.py

Lines changed: 43 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -234,51 +234,47 @@ def test_structural_compatibility(self, dataset_name: str, python_results, cloju
234234
logger.warning(f"No Clojure results available for {dataset_name}")
235235

236236
@pytest.mark.use_discovered_datasets(use_blobs=True)
237-
def test_comparison_visibility(self, dataset_name: str, python_results, clojure_results):
238-
"""
239-
Compare Python and Clojure results for visibility into differences.
237+
@pytest.mark.xfail(reason="D2/D3: different clustering → different groups → empty repness for some groups")
def test_python_covers_clojure_groups(self, dataset_name: str, python_results, clojure_results):
    """Check that each group with Clojure rep comments also has Python ones.

    Skips when the Clojure results are missing or carry no ``'repness'``
    entry. Clojure groups whose value is not a non-empty list are ignored.
    """
    have_clojure_repness = clojure_results and 'repness' in clojure_results
    if not have_clojure_repness:
        pytest.skip(f"No Clojure repness for {dataset_name}")

    python_groups = python_results.get('group_repness', {})
    clojure_groups = clojure_results['repness']

    for raw_group_id, clojure_comments in clojure_groups.items():
        # Only groups with a non-empty list of Clojure rep comments are checked.
        if isinstance(clojure_comments, list) and clojure_comments:
            gid = int(raw_group_id)
            python_comments = python_groups.get(gid, [])
            assert len(python_comments) > 0, (
                f"Group {gid}: Clojure has {len(clojure_comments)} rep comments, Python has none"
            )
240254

241-
Note: This test does NOT assert on match rates, as implementations are
242-
known to be very different. It reports statistics for manual inspection.
243-
"""
244-
logger.info(f"Comparing representativeness for {dataset_name} dataset")
245-
246-
if not clojure_results:
247-
logger.warning(f"No Clojure results available for {dataset_name}. Skipping comparison.")
248-
pytest.skip(f"No Clojure results for {dataset_name}")
249-
return
250-
251-
# Perform comparison
252-
match_rate, stats = self._compare_results(python_results, clojure_results)
253-
254-
# Log comparison results (for visibility, not assertions)
255-
logger.info(f"Comparison results for {dataset_name}:")
256-
logger.info(f" - Overall: {stats['comment_matches']} / {stats['total_comments']} comments match")
257-
logger.info(f" - Note: Python and Clojure implementations are known to be very different")
258-
259-
logger.debug(f"Group match rates:")
260-
for group_id, rate in stats['group_match_rates'].items():
261-
logger.debug(f" - Group {group_id}: {rate:.2f}")
262-
263-
logger.debug(f"Consensus comments match rate: {stats['consensus_match_rate']:.2f}")
264-
265-
# Log sample matching comments for inspection
266-
if stats['top_matching_comments']:
267-
logger.debug(f"Sample matching comments (first 3):")
268-
for i, comment in enumerate(stats['top_matching_comments'][:3]):
269-
cid = comment['comment_id']
270-
gid = comment['group_id']
271-
logger.debug(f" - Comment {cid} (Group {gid}):")
272-
logger.debug(f" Clojure: Agree={comment['clojure']['agree']:.2f}, Disagree={comment['clojure']['disagree']:.2f}")
273-
logger.debug(f" Python: Agree={comment['python']['agree']:.2f}, Disagree={comment['python']['disagree']:.2f}")
274-
275-
# Log Python results summary
276-
logger.debug(f"Python representativeness summary:")
277-
for group_id, comments in python_results.get('group_repness', {}).items():
278-
if comments:
279-
logger.debug(f" - Group {group_id}: {len(comments)} comments")
280-
for i, cmt in enumerate(comments[:2]): # Show top 2
281-
logger.debug(f" Comment {i+1}: ID {cmt.get('comment_id')}, Type: {cmt.get('repful')}")
282-
logger.debug(f" Agree: {cmt.get('pa', 0):.2f}, Disagree: {cmt.get('pd', 0):.2f}")
283-
284-
logger.info(f"✓ Comparison completed for {dataset_name}")
255+
@pytest.mark.use_discovered_datasets(use_blobs=True)
@pytest.mark.xfail(reason="D5/D6/D10: z-value and selection logic differences")
def test_selected_comment_sets_match(self, dataset_name: str, python_results, clojure_results):
    """Check that both implementations select the same comment ids per group.

    Skips when the Clojure results are missing or carry no ``'repness'``
    entry. Clojure groups whose value is not a list are ignored. All
    differing groups are collected and reported in one assertion message.
    """
    have_clojure_repness = clojure_results and 'repness' in clojure_results
    if not have_clojure_repness:
        pytest.skip(f"No Clojure repness for {dataset_name}")

    python_groups = python_results.get('group_repness', {})
    clojure_groups = clojure_results['repness']

    differing = []
    for raw_group_id, clojure_comments in clojure_groups.items():
        if isinstance(clojure_comments, list):
            gid = int(raw_group_id)

            # A Clojure entry is keyed by 'tid', falling back to
            # 'comment_id', then to 0 when neither key is present.
            clojure_ids = {
                int(entry.get('tid', entry.get('comment_id', 0)))
                for entry in clojure_comments
            }
            python_ids = {
                int(entry['comment_id'])
                for entry in python_groups.get(gid, [])
            }

            if clojure_ids != python_ids:
                differing.append(
                    f"Group {gid}: clj={sorted(clojure_ids)}, py={sorted(python_ids)}")

    assert len(differing) == 0, (
        f"{len(differing)} groups differ:\n" + "\n".join(differing)
    )

0 commit comments

Comments
 (0)