@@ -234,51 +234,47 @@ def test_structural_compatibility(self, dataset_name: str, python_results, cloju
234234 logger .warning (f"No Clojure results available for { dataset_name } " )
235235
236236 @pytest .mark .use_discovered_datasets (use_blobs = True )
237- def test_comparison_visibility (self , dataset_name : str , python_results , clojure_results ):
238- """
239- Compare Python and Clojure results for visibility into differences.
@pytest.mark.xfail(reason="D2/D3: different clustering → different groups → empty repness for some groups")
def test_python_covers_clojure_groups(self, dataset_name: str, python_results, clojure_results):
    """Check that each Clojure group with rep comments also has Python rep comments.

    The test is skipped when ``clojure_results`` is falsy or carries no
    ``'repness'`` key. Clojure entries that are not a non-empty list are
    ignored; every remaining group key is converted with ``int`` and
    looked up in ``python_results['group_repness']``.
    """
    has_clojure_repness = bool(clojure_results) and 'repness' in clojure_results
    if not has_clojure_repness:
        pytest.skip(f"No Clojure repness for { dataset_name } ")

    clojure_by_group = clojure_results['repness']
    python_by_group = python_results.get('group_repness', {})

    for raw_gid, clj_entries in clojure_by_group.items():
        # Only groups for which Clojure actually lists rep comments are checked.
        if isinstance(clj_entries, list) and clj_entries:
            gid = int(raw_gid)
            python_reps = python_by_group.get(gid, [])
            assert len(python_reps) > 0, (
                f"Group { gid } : Clojure has { len (clj_entries )} rep comments, Python has none"
            )
240254
241- Note: This test does NOT assert on match rates, as implementations are
242- known to be very different. It reports statistics for manual inspection.
243- """
244- logger .info (f"Comparing representativeness for { dataset_name } dataset" )
245-
246- if not clojure_results :
247- logger .warning (f"No Clojure results available for { dataset_name } . Skipping comparison." )
248- pytest .skip (f"No Clojure results for { dataset_name } " )
249- return
250-
251- # Perform comparison
252- match_rate , stats = self ._compare_results (python_results , clojure_results )
253-
254- # Log comparison results (for visibility, not assertions)
255- logger .info (f"Comparison results for { dataset_name } :" )
256- logger .info (f" - Overall: { stats ['comment_matches' ]} / { stats ['total_comments' ]} comments match" )
257- logger .info (f" - Note: Python and Clojure implementations are known to be very different" )
258-
259- logger .debug (f"Group match rates:" )
260- for group_id , rate in stats ['group_match_rates' ].items ():
261- logger .debug (f" - Group { group_id } : { rate :.2f} " )
262-
263- logger .debug (f"Consensus comments match rate: { stats ['consensus_match_rate' ]:.2f} " )
264-
265- # Log sample matching comments for inspection
266- if stats ['top_matching_comments' ]:
267- logger .debug (f"Sample matching comments (first 3):" )
268- for i , comment in enumerate (stats ['top_matching_comments' ][:3 ]):
269- cid = comment ['comment_id' ]
270- gid = comment ['group_id' ]
271- logger .debug (f" - Comment { cid } (Group { gid } ):" )
272- logger .debug (f" Clojure: Agree={ comment ['clojure' ]['agree' ]:.2f} , Disagree={ comment ['clojure' ]['disagree' ]:.2f} " )
273- logger .debug (f" Python: Agree={ comment ['python' ]['agree' ]:.2f} , Disagree={ comment ['python' ]['disagree' ]:.2f} " )
274-
275- # Log Python results summary
276- logger .debug (f"Python representativeness summary:" )
277- for group_id , comments in python_results .get ('group_repness' , {}).items ():
278- if comments :
279- logger .debug (f" - Group { group_id } : { len (comments )} comments" )
280- for i , cmt in enumerate (comments [:2 ]): # Show top 2
281- logger .debug (f" Comment { i + 1 } : ID { cmt .get ('comment_id' )} , Type: { cmt .get ('repful' )} " )
282- logger .debug (f" Agree: { cmt .get ('pa' , 0 ):.2f} , Disagree: { cmt .get ('pd' , 0 ):.2f} " )
283-
284- logger .info (f"✓ Comparison completed for { dataset_name } " )
@pytest.mark.use_discovered_datasets(use_blobs=True)
@pytest.mark.xfail(reason="D5/D6/D10: z-value and selection logic differences")
def test_selected_comment_sets_match(self, dataset_name: str, python_results, clojure_results):
    """Compare, group by group, the comment ids selected by Clojure and Python.

    The test is skipped when ``clojure_results`` is falsy or carries no
    ``'repness'`` key. Only the groups listed in the Clojure repness
    mapping are visited; non-list values are ignored. All differing groups
    are collected first so that a single failure message reports them all.
    """
    has_clojure_repness = bool(clojure_results) and 'repness' in clojure_results
    if not has_clojure_repness:
        pytest.skip(f"No Clojure repness for { dataset_name } ")

    clojure_by_group = clojure_results['repness']
    python_by_group = python_results.get('group_repness', {})

    mismatches = []
    for raw_gid, clojure_reps in clojure_by_group.items():
        if isinstance(clojure_reps, list):
            gid = int(raw_gid)
            # Clojure entries are read via 'tid', falling back to 'comment_id'.
            # NOTE(review): an entry with neither key collapses to id 0 --
            # confirm 0 can never be a real comment id.
            clj_tids = {
                int(entry.get('tid', entry.get('comment_id', 0)))
                for entry in clojure_reps
            }
            python_reps = python_by_group.get(gid, [])
            py_tids = {int(entry['comment_id']) for entry in python_reps}

            if clj_tids != py_tids:
                mismatches.append(
                    f"Group { gid } : clj={ sorted (clj_tids )} , py={ sorted (py_tids )} ")

    assert len(mismatches) == 0, (
        f"{ len (mismatches )} groups differ:\n " + "\n ".join(mismatches)
    )
0 commit comments