Add repness blob comparison tests and fix tid type mismatches

jucor · claude · jucor · commit 42cf35cc4b0f · 2026-03-30T17:46:24.000+01:00
Legacy repness comparison: replace the visibility-only test with two
asserting tests, both marked xfail (group coverage and selected-set matching).

Legacy clojure regression: add test_repness_matches_clojure comparing
selected comment sets and z-values against the math blob (xfail).

Fix int/str tid mismatch that caused all shared-comment lookups to find
zero matches, making blob comparison tests pass vacuously.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/delphi/tests/test_legacy_clojure_regression.py b/delphi/tests/test_legacy_clojure_regression.py
@@ -401,3 +401,74 @@ def test_comment_priorities(self, conversation_data):
 
         check.equal(len(mismatches), 0,
                    f"All comment priorities should match Clojure (got {len(mismatches)} mismatches out of {len(clojure_priorities)})")
+
+    @pytest.mark.xfail(raises=AssertionError, strict=True, reason="D5/D6/D7/D10: z-values, metric, and selection logic differ")
+    def test_repness_matches_clojure(self, conversation_data):
+        """
+        Test that representative comment selection matches Clojure.
+
+        Compares per-group comment-id sets (ids cast to int on both sides) and
+        'pat'/'rat' vs 'p-test'/'repness-test' (tolerance 0.01) for shared ids.
+        Requires D5 (prop test), D6 (two-prop test), D7 (metric), D10 (selection).
+        """
+        conv = conversation_data['conv']
+        clojure_output = conversation_data['clojure_output']
+        dataset_name = conversation_data['dataset_name']
+
+        print(f"\n[{dataset_name}] Testing repness matches Clojure...")
+
+        clj_repness = clojure_output.get('repness', {})
+        check.is_true(bool(clj_repness), "Clojure output should have repness")
+
+        py_repness = (conv.repness or {}).get('group_repness', {})
+        check.is_true(bool(py_repness), "Python should have group_repness")
+
+        if not clj_repness or not py_repness:
+            return
+
+        set_mismatches = []
+        value_mismatches = []
+
+        for gid_str, clj_entries in clj_repness.items():
+            gid = int(gid_str)
+
+            # Compare selected comment sets; tids are cast to int on BOTH sides
+            clj_tids = {int(e['tid']) for e in clj_entries}
+            py_entries = py_repness.get(gid, [])
+            py_by_tid = {int(e['comment_id']): e for e in py_entries}
+            py_tids = set(py_by_tid)
+
+            if clj_tids != py_tids:
+                set_mismatches.append(
+                    f"  g{gid}: clj={sorted(clj_tids)}, py={sorted(py_tids)}")
+
+            # Compare z-values for shared comments (same int key type as py_by_tid)
+            for clj_entry in clj_entries:
+                tid = int(clj_entry['tid'])
+                py_entry = py_by_tid.get(tid)
+                if py_entry is None:
+                    continue
+
+                clj_pat = clj_entry.get('p-test', 0)
+                py_pat = py_entry.get('pat', 0)
+                clj_rat = clj_entry.get('repness-test', 0)
+                py_rat = py_entry.get('rat', 0)
+
+                if abs(clj_pat - py_pat) > 0.01 or abs(clj_rat - py_rat) > 0.01:
+                    value_mismatches.append(
+                        f"  g{gid}/t{tid}: pat clj={clj_pat:.4f} py={py_pat:.4f}, "
+                        f"rat clj={clj_rat:.4f} py={py_rat:.4f}")
+
+        if set_mismatches:
+            print(f"  Set mismatches ({len(set_mismatches)} groups):")
+            for m in set_mismatches[:10]:
+                print(m)
+        if value_mismatches:
+            print(f"  Value mismatches ({len(value_mismatches)} comments):")
+            for m in value_mismatches[:10]:
+                print(m)
+
+        check.equal(len(set_mismatches), 0,
+                   f"{len(set_mismatches)} groups differ in selected rep comments")
+        check.equal(len(value_mismatches), 0,
+                   f"{len(value_mismatches)} shared comments have z-value mismatches")
diff --git a/delphi/tests/test_legacy_repness_comparison.py b/delphi/tests/test_legacy_repness_comparison.py
@@ -234,51 +234,47 @@ def test_structural_compatibility(self, dataset_name: str, python_results, cloju
             logger.warning(f"No Clojure results available for {dataset_name}")
 
     @pytest.mark.use_discovered_datasets(use_blobs=True)
-    def test_comparison_visibility(self, dataset_name: str, python_results, clojure_results):
-        """
-        Compare Python and Clojure results for visibility into differences.
+    @pytest.mark.xfail(reason="D2/D3: different clustering → different groups → empty repness for some groups")
+    def test_python_covers_clojure_groups(self, dataset_name: str, python_results, clojure_results):
+        """Every group with Clojure rep comments must get at least one Python rep comment."""
+        has_clojure_repness = bool(clojure_results) and 'repness' in clojure_results
+        if not has_clojure_repness:
+            pytest.skip(f"No Clojure repness for {dataset_name}")
+
+        clojure_by_group = clojure_results['repness']
+        python_by_group = python_results.get('group_repness', {})
+        for raw_gid, clojure_comments in clojure_by_group.items():
+            # Only groups with a non-empty list of Clojure entries need covering.
+            if isinstance(clojure_comments, list) and clojure_comments:
+                gid = int(raw_gid)
+                python_comments = python_by_group.get(gid, [])
+                assert len(python_comments) > 0, (
+                    f"Group {gid}: Clojure has {len(clojure_comments)} rep comments, Python has none"
+                )
 
-        Note: This test does NOT assert on match rates, as implementations are
-        known to be very different. It reports statistics for manual inspection.
-        """
-        logger.info(f"Comparing representativeness for {dataset_name} dataset")
-
-        if not clojure_results:
-            logger.warning(f"No Clojure results available for {dataset_name}. Skipping comparison.")
-            pytest.skip(f"No Clojure results for {dataset_name}")
-            return
-
-        # Perform comparison
-        match_rate, stats = self._compare_results(python_results, clojure_results)
-
-        # Log comparison results (for visibility, not assertions)
-        logger.info(f"Comparison results for {dataset_name}:")
-        logger.info(f"  - Overall: {stats['comment_matches']} / {stats['total_comments']} comments match")
-        logger.info(f"  - Note: Python and Clojure implementations are known to be very different")
-
-        logger.debug(f"Group match rates:")
-        for group_id, rate in stats['group_match_rates'].items():
-            logger.debug(f"  - Group {group_id}: {rate:.2f}")
-
-        logger.debug(f"Consensus comments match rate: {stats['consensus_match_rate']:.2f}")
-
-        # Log sample matching comments for inspection
-        if stats['top_matching_comments']:
-            logger.debug(f"Sample matching comments (first 3):")
-            for i, comment in enumerate(stats['top_matching_comments'][:3]):
-                cid = comment['comment_id']
-                gid = comment['group_id']
-                logger.debug(f"  - Comment {cid} (Group {gid}):")
-                logger.debug(f"    Clojure: Agree={comment['clojure']['agree']:.2f}, Disagree={comment['clojure']['disagree']:.2f}")
-                logger.debug(f"    Python:  Agree={comment['python']['agree']:.2f}, Disagree={comment['python']['disagree']:.2f}")
-
-        # Log Python results summary
-        logger.debug(f"Python representativeness summary:")
-        for group_id, comments in python_results.get('group_repness', {}).items():
-            if comments:
-                logger.debug(f"  - Group {group_id}: {len(comments)} comments")
-                for i, cmt in enumerate(comments[:2]):  # Show top 2
-                    logger.debug(f"    Comment {i+1}: ID {cmt.get('comment_id')}, Type: {cmt.get('repful')}")
-                    logger.debug(f"      Agree: {cmt.get('pa', 0):.2f}, Disagree: {cmt.get('pd', 0):.2f}")
-
-        logger.info(f"✓ Comparison completed for {dataset_name}")
+    @pytest.mark.use_discovered_datasets(use_blobs=True)
+    @pytest.mark.xfail(reason="D5/D6/D10: z-value and selection logic differences")
+    def test_selected_comment_sets_match(self, dataset_name: str, python_results, clojure_results):
+        """Per group, Python must select exactly the comment ids that Clojure selected."""
+        has_clojure_repness = bool(clojure_results) and 'repness' in clojure_results
+        if not has_clojure_repness:
+            pytest.skip(f"No Clojure repness for {dataset_name}")
+
+        clojure_by_group = clojure_results['repness']
+        python_by_group = python_results.get('group_repness', {})
+
+        differing = []
+        for raw_gid, clojure_comments in clojure_by_group.items():
+            if isinstance(clojure_comments, list):
+                gid = int(raw_gid)
+                # Ids are cast to int on both sides so the sets are comparable.
+                expected = {int(c.get('tid', c.get('comment_id', 0))) for c in clojure_comments}
+                actual = {int(c['comment_id']) for c in python_by_group.get(gid, [])}
+                if expected != actual:
+                    differing.append(
+                        f"Group {gid}: clj={sorted(expected)}, py={sorted(actual)}")
+
+        # All differing groups are collected first so the failure lists every one.
+        assert len(differing) == 0, (
+            f"{len(differing)} groups differ:\n" + "\n".join(differing)
+        )