Remove old participant_stats() in favor of vectorized replacement

jucor · claude · jucor · commit 9dee5d93d9ab · 2026-03-13T14:14:29.000Z
Delete the O(participants × groups × members) participant_stats() from
repness.py (~130 lines) now that _compute_participant_info_optimized()
on the Conversation class provides a 3-15x faster NumPy replacement.

- Remove function from repness.py and __init__.py exports
- Update all imports (conversation.py, run_analysis.py, 4 test files)
- Rewrite 3 test methods to call the vectorized Conversation method
- Add "remove dead code after replacement" principle to the plan

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/delphi/docs/PLAN_DISCREPANCY_FIXES.md b/delphi/docs/PLAN_DISCREPANCY_FIXES.md
@@ -49,6 +49,7 @@ Because this work will span multiple Claude Code sessions, we maintain:
 - **All datasets, not just biodiversity**: Every fix must pass on ALL datasets. biodiversity is just one reference among many.
 - **Synthetic edge-case tests**: Every time we discover an edge case specific to one conversation, extract it into a synthetic unit test with made-up data (never real data from private datasets). These run fast and document the intent clearly.
 - **E2E awareness**: GitHub Actions has Cypress E2E tests (`cypress-tests.yml`) testing UI workflows, and `python-ci.yml` running pytest regression. The Cypress tests don't test math output values directly, but `python-ci.yml` will break if clustering/repness changes. Formula-level fixes (D4, D5, D6, D7, D8, D9) are pure computation — no E2E risk. Selection logic changes (D10, D11) and priority computation (D12) could affect what the TypeScript server returns. We decide case-by-case which PRs need E2E verification.
+- **Remove dead code after replacement**: When a function is replaced by a new implementation (e.g. vectorized version), the old function must be deleted and all callers updated — not left as dead code. Do this in the same PR or a follow-up, after benchmarks and tests confirm the replacement works.
 
 ### Datasets Available (sorted by size, smallest first)
 
diff --git a/delphi/notebooks/run_analysis.py b/delphi/notebooks/run_analysis.py
@@ -45,7 +45,7 @@ def check_environment():
 
 # Import polismath modules
 from polismath.conversation.conversation import Conversation
-from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
+from polismath.pca_kmeans_rep.repness import conv_repness
 from polismath.pca_kmeans_rep.corr import compute_correlation
 
 def load_votes(votes_path):
diff --git a/delphi/polismath/conversation/conversation.py b/delphi/polismath/conversation/conversation.py
@@ -21,7 +21,7 @@
     kmeans_sklearn,
     calculate_silhouette_sklearn
 )
-from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
+from polismath.pca_kmeans_rep.repness import conv_repness
 from polismath.pca_kmeans_rep.corr import compute_correlation
 
 
diff --git a/delphi/polismath/pca_kmeans_rep/__init__.py b/delphi/polismath/pca_kmeans_rep/__init__.py
@@ -10,14 +10,13 @@
 
 from polismath.pca_kmeans_rep.pca import pca_project_dataframe
 from polismath.pca_kmeans_rep.clusters import cluster_dataframe, Cluster
-from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
+from polismath.pca_kmeans_rep.repness import conv_repness
 from polismath.pca_kmeans_rep.corr import compute_correlation
 
 __all__ = [
     'pca_project_dataframe',
     'cluster_dataframe',
     'Cluster',
     'conv_repness',
-    'participant_stats',
     'compute_correlation',
 ]
diff --git a/delphi/polismath/pca_kmeans_rep/repness.py b/delphi/polismath/pca_kmeans_rep/repness.py
@@ -934,133 +934,3 @@ def conv_repness(vote_matrix_df: pd.DataFrame, group_clusters: List[Dict[str, An
         result['consensus_comments'] = []
 
     return result
-
-
-def participant_stats(vote_matrix: pd.DataFrame, group_clusters: List[Dict[str, Any]]) -> Dict[str, Any]:
-    """
-    Calculate statistics about participants.
-    
-    Args:
-        vote_matrix: pd.DataFrame of votes
-        group_clusters: List of group clusters
-        
-    Returns:
-        Dictionary with participant statistics
-    """
-    if not group_clusters:
-        return {}
-    
-    # Extract values and ensure they're numeric
-    matrix_values = vote_matrix.values.copy()
-    
-    # Convert to numeric matrix with NaN for missing values
-    if not np.issubdtype(matrix_values.dtype, np.number):
-        numeric_values = np.zeros(matrix_values.shape, dtype=float)
-        for i in range(matrix_values.shape[0]):
-            for j in range(matrix_values.shape[1]):
-                val = matrix_values[i, j]
-                if pd.isna(val) or val is None:
-                    numeric_values[i, j] = np.nan
-                else:
-                    try:
-                        numeric_values[i, j] = float(val)
-                    except (ValueError, TypeError):
-                        numeric_values[i, j] = np.nan
-        matrix_values = numeric_values
-    
-    # Replace NaNs with zeros for correlation calculation
-    matrix_values = np.nan_to_num(matrix_values, nan=0.0)
-    
-    # Create result structure
-    result = {
-        'participant_ids': vote_matrix.index.tolist(),
-        'stats': {}
-    }
-    
-    # For each participant, calculate statistics
-    for p_idx, participant_id in enumerate(vote_matrix.index):
-        if p_idx >= matrix_values.shape[0]:
-            continue
-            
-        participant_votes = matrix_values[p_idx, :]
-        
-        # Count votes (non-zero values are votes)
-        n_agree = np.sum(participant_votes > 0)
-        n_disagree = np.sum(participant_votes < 0)
-        n_pass = np.sum(participant_votes == 0) - np.count_nonzero(np.isnan(participant_votes))
-        n_votes = n_agree + n_disagree
-        
-        # Skip participants with no votes
-        if n_votes == 0:
-            continue
-            
-        # Find participant's group
-        participant_group = None
-        for group in group_clusters:
-            if participant_id in group['members']:
-                participant_group = group['id']
-                break
-        
-        # Calculate agreement with each group
-        group_agreements = {}
-        
-        for group in group_clusters:
-            group_id = group['id']
-            
-            try:
-                # Get group member indices
-                group_members = []
-                for m in group['members']:
-                    if m in vote_matrix.index:
-                        idx = vote_matrix.index.get_loc(m)
-                        if 0 <= idx < matrix_values.shape[0]:
-                            group_members.append(idx)
-                
-                if not group_members or len(group_members) < 3:
-                    # Skip groups with too few members
-                    group_agreements[group_id] = 0.0
-                    continue
-                
-                # Calculate group average votes for each comment
-                group_vote_matrix = matrix_values[group_members, :]
-                group_avg_votes = np.mean(group_vote_matrix, axis=0)
-                
-                # Get participant's votes
-                participant_vote_vector = participant_votes
-                
-                # Calculate correlation if enough votes
-                # Mask comments that have fewer than 3 votes from group members
-                valid_comment_mask = np.sum(group_vote_matrix != 0, axis=0) >= 3
-                
-                if np.sum(valid_comment_mask) >= 3:  # At least 3 common votes
-                    # Extract votes for valid comments
-                    p_votes = participant_vote_vector[valid_comment_mask]
-                    g_votes = group_avg_votes[valid_comment_mask]
-                    
-                    # Calculate correlation
-                    if np.std(p_votes) > 0 and np.std(g_votes) > 0:
-                        correlation = np.corrcoef(p_votes, g_votes)[0, 1]
-                        if not np.isnan(correlation):
-                            group_agreements[group_id] = correlation
-                        else:
-                            group_agreements[group_id] = 0.0
-                    else:
-                        group_agreements[group_id] = 0.0
-                else:
-                    group_agreements[group_id] = 0.0
-                    
-            except Exception as e:
-                # Fallback for errors
-                group_agreements[group_id] = 0.0
-        
-        # Store participant stats
-        result['stats'][participant_id] = {
-            'n_agree': int(n_agree),
-            'n_disagree': int(n_disagree),
-            'n_pass': int(n_pass),
-            'n_votes': int(n_votes),
-            'group': participant_group,
-            'group_correlations': group_agreements
-        }
-    
-    return result
diff --git a/delphi/tests/test_legacy_repness_comparison.py b/delphi/tests/test_legacy_repness_comparison.py
@@ -19,7 +19,7 @@
 # Add the parent directory to the path to import the module
 sys.path.append(os.path.abspath(os.path.dirname(__file__)))
 
-from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
+from polismath.pca_kmeans_rep.repness import conv_repness
 from common_utils import create_test_conversation
 from polismath.regression import get_dataset_files
 from conftest import parse_dataset_blob_id
diff --git a/delphi/tests/test_old_format_repness.py b/delphi/tests/test_old_format_repness.py
@@ -19,8 +19,8 @@
     comment_stats, add_comparative_stats, repness_metric, finalize_cmt_stats,
     passes_by_test, best_agree, best_disagree, select_rep_comments,
     select_consensus_comments, conv_repness,
-    participant_stats
 )
+from polismath.conversation.conversation import Conversation
 
 
 class TestStatisticalFunctions:
@@ -497,7 +497,7 @@ def test_conv_repness(self):
         assert 'c3' in group2_rep_ids
 
     def test_participant_stats(self):
-        """Test participant statistics calculation."""
+        """Test participant statistics calculation via vectorized method."""
         # Create a test vote matrix
         vote_data = np.array([
             [1, 1, -1, None],  # Participant 1
@@ -511,14 +511,15 @@ def test_participant_stats(self):
 
         vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
 
-        # Create group clusters
+        # Create group clusters (vectorized method requires 'center' key)
         group_clusters = [
-            {'id': 1, 'members': ['p1', 'p2']},
-            {'id': 2, 'members': ['p3', 'p4']}
+            {'id': 1, 'members': ['p1', 'p2'], 'center': [0.0]},
+            {'id': 2, 'members': ['p3', 'p4'], 'center': [0.0]}
         ]
 
-        # Calculate participant stats
-        ptpt_stats = participant_stats(vote_matrix, group_clusters)
+        # Calculate participant stats using vectorized method
+        conv = Conversation("test")
+        ptpt_stats = conv._compute_participant_info_optimized(vote_matrix, group_clusters)
 
         # Check result structure
         assert 'participant_ids' in ptpt_stats
diff --git a/delphi/tests/test_repness_smoke.py b/delphi/tests/test_repness_smoke.py
@@ -18,7 +18,8 @@
 # Add the parent directory to the path to import the module
 sys.path.append(os.path.abspath(os.path.dirname(__file__)))
 
-from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
+from polismath.pca_kmeans_rep.repness import conv_repness
+from polismath.conversation.conversation import Conversation
 from common_utils import create_test_conversation
 logger = logging.getLogger(__name__)
 
@@ -116,7 +117,7 @@ def test_participant_stats(self, dataset_name: str, conversation):
         """Test participant statistics calculation."""
         logger.debug(f"Testing participant stats for {dataset_name}")
 
-        ptpt_stats = participant_stats(conversation.rating_mat, conversation._unfolded_group_clusters())
+        ptpt_stats = conversation._compute_participant_info_optimized(conversation.rating_mat, conversation._unfolded_group_clusters())
 
         assert ptpt_stats is not None
         assert 'participant_ids' in ptpt_stats
diff --git a/delphi/tests/test_repness_unit.py b/delphi/tests/test_repness_unit.py
@@ -18,10 +18,10 @@
     comment_stats, add_comparative_stats, repness_metric, finalize_cmt_stats,
     passes_by_test, best_agree, best_disagree, select_rep_comments,
     calculate_kl_divergence, select_consensus_comments, conv_repness,
-    participant_stats,
     # DataFrame-native vectorized functions
     prop_test_vectorized, two_prop_test_vectorized, compute_group_comment_stats_df
 )
+from polismath.conversation.conversation import Conversation
 
 
 class TestStatisticalFunctions:
@@ -498,44 +498,45 @@ def test_conv_repness(self):
         assert 'c3' in group2_rep_ids
     
     def test_participant_stats(self):
-        """Test participant statistics calculation."""
+        """Test participant statistics calculation via vectorized method."""
         # Create a test vote matrix
         vote_data = np.array([
             [1, 1, -1, None],  # Participant 1
             [1, 1, -1, 1],     # Participant 2
             [-1, -1, 1, -1],   # Participant 3
             [-1, -1, 1, 1]     # Participant 4
         ])
-        
+
         row_names = ['p1', 'p2', 'p3', 'p4']
         col_names = ['c1', 'c2', 'c3', 'c4']
-        
+
         vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
-        
-        # Create group clusters
+
+        # Create group clusters (vectorized method requires 'center' key)
         group_clusters = [
-            {'id': 1, 'members': ['p1', 'p2']},
-            {'id': 2, 'members': ['p3', 'p4']}
+            {'id': 1, 'members': ['p1', 'p2'], 'center': [0.0]},
+            {'id': 2, 'members': ['p3', 'p4'], 'center': [0.0]}
         ]
-        
-        # Calculate participant stats
-        ptpt_stats = participant_stats(vote_matrix, group_clusters)
-        
+
+        # Calculate participant stats using vectorized method
+        conv = Conversation("test")
+        ptpt_stats = conv._compute_participant_info_optimized(vote_matrix, group_clusters)
+
         # Check result structure
         assert 'participant_ids' in ptpt_stats
         assert 'stats' in ptpt_stats
-        
+
         # Check participant stats
         for ptpt_id in row_names:
             assert ptpt_id in ptpt_stats['stats']
             stats = ptpt_stats['stats'][ptpt_id]
-            
+
             assert 'n_agree' in stats
             assert 'n_disagree' in stats
             assert 'n_votes' in stats
             assert 'group' in stats
             assert 'group_correlations' in stats
-            
+
         # Check specific stats
         p1_stats = ptpt_stats['stats']['p1']
         assert p1_stats['n_agree'] == 2

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`	`kmeans_sklearn,`
`22`	`22`	`calculate_silhouette_sklearn`
`23`	`23`	`)`
`24`		`-from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats`
	`24`	`+from polismath.pca_kmeans_rep.repness import conv_repness`
`25`	`25`	`from polismath.pca_kmeans_rep.corr import compute_correlation`
`26`	`26`
`27`	`27`