Skip to content

Commit 9dee5d9

Browse files
jucorclaude
andcommitted
Remove old participant_stats() in favor of vectorized replacement
Delete the O(participants × groups × members) participant_stats() from repness.py (~130 lines) now that _compute_participant_info_optimized() on the Conversation class provides a 3-15x faster NumPy replacement. - Remove function from repness.py and __init__.py exports - Update all imports (conversation.py, run_analysis.py, 4 test files) - Rewrite 3 test methods to call the vectorized Conversation method - Add "remove dead code after replacement" principle to the plan Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4d3354d commit 9dee5d9

9 files changed

Lines changed: 32 additions & 159 deletions

File tree

delphi/docs/PLAN_DISCREPANCY_FIXES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ Because this work will span multiple Claude Code sessions, we maintain:
4949
- **All datasets, not just biodiversity**: Every fix must pass on ALL datasets. biodiversity is just one reference among many.
5050
- **Synthetic edge-case tests**: Every time we discover an edge case specific to one conversation, extract it into a synthetic unit test with made-up data (never real data from private datasets). These run fast and document the intent clearly.
5151
- **E2E awareness**: GitHub Actions has Cypress E2E tests (`cypress-tests.yml`) testing UI workflows, and `python-ci.yml` running pytest regression. The Cypress tests don't test math output values directly, but `python-ci.yml` will break if clustering/repness changes. Formula-level fixes (D4, D5, D6, D7, D8, D9) are pure computation — no E2E risk. Selection logic changes (D10, D11) and priority computation (D12) could affect what the TypeScript server returns. We decide case-by-case which PRs need E2E verification.
52+
- **Remove dead code after replacement**: When a function is replaced by a new implementation (e.g. vectorized version), the old function must be deleted and all callers updated — not left as dead code. Do this in the same PR or a follow-up, after benchmarks and tests confirm the replacement works.
5253

5354
### Datasets Available (sorted by size, smallest first)
5455

delphi/notebooks/run_analysis.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def check_environment():
4545

4646
# Import polismath modules
4747
from polismath.conversation.conversation import Conversation
48-
from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
48+
from polismath.pca_kmeans_rep.repness import conv_repness
4949
from polismath.pca_kmeans_rep.corr import compute_correlation
5050

5151
def load_votes(votes_path):

delphi/polismath/conversation/conversation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
kmeans_sklearn,
2222
calculate_silhouette_sklearn
2323
)
24-
from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
24+
from polismath.pca_kmeans_rep.repness import conv_repness
2525
from polismath.pca_kmeans_rep.corr import compute_correlation
2626

2727

delphi/polismath/pca_kmeans_rep/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,13 @@
1010

1111
from polismath.pca_kmeans_rep.pca import pca_project_dataframe
1212
from polismath.pca_kmeans_rep.clusters import cluster_dataframe, Cluster
13-
from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
13+
from polismath.pca_kmeans_rep.repness import conv_repness
1414
from polismath.pca_kmeans_rep.corr import compute_correlation
1515

1616
__all__ = [
1717
'pca_project_dataframe',
1818
'cluster_dataframe',
1919
'Cluster',
2020
'conv_repness',
21-
'participant_stats',
2221
'compute_correlation',
2322
]

delphi/polismath/pca_kmeans_rep/repness.py

Lines changed: 0 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -934,133 +934,3 @@ def conv_repness(vote_matrix_df: pd.DataFrame, group_clusters: List[Dict[str, An
934934
result['consensus_comments'] = []
935935

936936
return result
937-
938-
939-
def participant_stats(vote_matrix: pd.DataFrame, group_clusters: List[Dict[str, Any]]) -> Dict[str, Any]:
940-
"""
941-
Calculate statistics about participants.
942-
943-
Args:
944-
vote_matrix: pd.DataFrame of votes
945-
group_clusters: List of group clusters
946-
947-
Returns:
948-
Dictionary with participant statistics
949-
"""
950-
if not group_clusters:
951-
return {}
952-
953-
# Extract values and ensure they're numeric
954-
matrix_values = vote_matrix.values.copy()
955-
956-
# Convert to numeric matrix with NaN for missing values
957-
if not np.issubdtype(matrix_values.dtype, np.number):
958-
numeric_values = np.zeros(matrix_values.shape, dtype=float)
959-
for i in range(matrix_values.shape[0]):
960-
for j in range(matrix_values.shape[1]):
961-
val = matrix_values[i, j]
962-
if pd.isna(val) or val is None:
963-
numeric_values[i, j] = np.nan
964-
else:
965-
try:
966-
numeric_values[i, j] = float(val)
967-
except (ValueError, TypeError):
968-
numeric_values[i, j] = np.nan
969-
matrix_values = numeric_values
970-
971-
# Replace NaNs with zeros for correlation calculation
972-
matrix_values = np.nan_to_num(matrix_values, nan=0.0)
973-
974-
# Create result structure
975-
result = {
976-
'participant_ids': vote_matrix.index.tolist(),
977-
'stats': {}
978-
}
979-
980-
# For each participant, calculate statistics
981-
for p_idx, participant_id in enumerate(vote_matrix.index):
982-
if p_idx >= matrix_values.shape[0]:
983-
continue
984-
985-
participant_votes = matrix_values[p_idx, :]
986-
987-
# Count votes (non-zero values are votes)
988-
n_agree = np.sum(participant_votes > 0)
989-
n_disagree = np.sum(participant_votes < 0)
990-
n_pass = np.sum(participant_votes == 0) - np.count_nonzero(np.isnan(participant_votes))
991-
n_votes = n_agree + n_disagree
992-
993-
# Skip participants with no votes
994-
if n_votes == 0:
995-
continue
996-
997-
# Find participant's group
998-
participant_group = None
999-
for group in group_clusters:
1000-
if participant_id in group['members']:
1001-
participant_group = group['id']
1002-
break
1003-
1004-
# Calculate agreement with each group
1005-
group_agreements = {}
1006-
1007-
for group in group_clusters:
1008-
group_id = group['id']
1009-
1010-
try:
1011-
# Get group member indices
1012-
group_members = []
1013-
for m in group['members']:
1014-
if m in vote_matrix.index:
1015-
idx = vote_matrix.index.get_loc(m)
1016-
if 0 <= idx < matrix_values.shape[0]:
1017-
group_members.append(idx)
1018-
1019-
if not group_members or len(group_members) < 3:
1020-
# Skip groups with too few members
1021-
group_agreements[group_id] = 0.0
1022-
continue
1023-
1024-
# Calculate group average votes for each comment
1025-
group_vote_matrix = matrix_values[group_members, :]
1026-
group_avg_votes = np.mean(group_vote_matrix, axis=0)
1027-
1028-
# Get participant's votes
1029-
participant_vote_vector = participant_votes
1030-
1031-
# Calculate correlation if enough votes
1032-
# Mask comments that have fewer than 3 votes from group members
1033-
valid_comment_mask = np.sum(group_vote_matrix != 0, axis=0) >= 3
1034-
1035-
if np.sum(valid_comment_mask) >= 3: # At least 3 common votes
1036-
# Extract votes for valid comments
1037-
p_votes = participant_vote_vector[valid_comment_mask]
1038-
g_votes = group_avg_votes[valid_comment_mask]
1039-
1040-
# Calculate correlation
1041-
if np.std(p_votes) > 0 and np.std(g_votes) > 0:
1042-
correlation = np.corrcoef(p_votes, g_votes)[0, 1]
1043-
if not np.isnan(correlation):
1044-
group_agreements[group_id] = correlation
1045-
else:
1046-
group_agreements[group_id] = 0.0
1047-
else:
1048-
group_agreements[group_id] = 0.0
1049-
else:
1050-
group_agreements[group_id] = 0.0
1051-
1052-
except Exception as e:
1053-
# Fallback for errors
1054-
group_agreements[group_id] = 0.0
1055-
1056-
# Store participant stats
1057-
result['stats'][participant_id] = {
1058-
'n_agree': int(n_agree),
1059-
'n_disagree': int(n_disagree),
1060-
'n_pass': int(n_pass),
1061-
'n_votes': int(n_votes),
1062-
'group': participant_group,
1063-
'group_correlations': group_agreements
1064-
}
1065-
1066-
return result

delphi/tests/test_legacy_repness_comparison.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
# Add the parent directory to the path to import the module
2020
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
2121

22-
from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
22+
from polismath.pca_kmeans_rep.repness import conv_repness
2323
from common_utils import create_test_conversation
2424
from polismath.regression import get_dataset_files
2525
from conftest import parse_dataset_blob_id

delphi/tests/test_old_format_repness.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
comment_stats, add_comparative_stats, repness_metric, finalize_cmt_stats,
2020
passes_by_test, best_agree, best_disagree, select_rep_comments,
2121
select_consensus_comments, conv_repness,
22-
participant_stats
2322
)
23+
from polismath.conversation.conversation import Conversation
2424

2525

2626
class TestStatisticalFunctions:
@@ -497,7 +497,7 @@ def test_conv_repness(self):
497497
assert 'c3' in group2_rep_ids
498498

499499
def test_participant_stats(self):
500-
"""Test participant statistics calculation."""
500+
"""Test participant statistics calculation via vectorized method."""
501501
# Create a test vote matrix
502502
vote_data = np.array([
503503
[1, 1, -1, None], # Participant 1
@@ -511,14 +511,15 @@ def test_participant_stats(self):
511511

512512
vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
513513

514-
# Create group clusters
514+
# Create group clusters (vectorized method requires 'center' key)
515515
group_clusters = [
516-
{'id': 1, 'members': ['p1', 'p2']},
517-
{'id': 2, 'members': ['p3', 'p4']}
516+
{'id': 1, 'members': ['p1', 'p2'], 'center': [0.0]},
517+
{'id': 2, 'members': ['p3', 'p4'], 'center': [0.0]}
518518
]
519519

520-
# Calculate participant stats
521-
ptpt_stats = participant_stats(vote_matrix, group_clusters)
520+
# Calculate participant stats using vectorized method
521+
conv = Conversation("test")
522+
ptpt_stats = conv._compute_participant_info_optimized(vote_matrix, group_clusters)
522523

523524
# Check result structure
524525
assert 'participant_ids' in ptpt_stats

delphi/tests/test_repness_smoke.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
# Add the parent directory to the path to import the module
1919
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
2020

21-
from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats
21+
from polismath.pca_kmeans_rep.repness import conv_repness
22+
from polismath.conversation.conversation import Conversation
2223
from common_utils import create_test_conversation
2324
logger = logging.getLogger(__name__)
2425

@@ -116,7 +117,7 @@ def test_participant_stats(self, dataset_name: str, conversation):
116117
"""Test participant statistics calculation."""
117118
logger.debug(f"Testing participant stats for {dataset_name}")
118119

119-
ptpt_stats = participant_stats(conversation.rating_mat, conversation._unfolded_group_clusters())
120+
ptpt_stats = conversation._compute_participant_info_optimized(conversation.rating_mat, conversation._unfolded_group_clusters())
120121

121122
assert ptpt_stats is not None
122123
assert 'participant_ids' in ptpt_stats

delphi/tests/test_repness_unit.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
comment_stats, add_comparative_stats, repness_metric, finalize_cmt_stats,
1919
passes_by_test, best_agree, best_disagree, select_rep_comments,
2020
calculate_kl_divergence, select_consensus_comments, conv_repness,
21-
participant_stats,
2221
# DataFrame-native vectorized functions
2322
prop_test_vectorized, two_prop_test_vectorized, compute_group_comment_stats_df
2423
)
24+
from polismath.conversation.conversation import Conversation
2525

2626

2727
class TestStatisticalFunctions:
@@ -498,44 +498,45 @@ def test_conv_repness(self):
498498
assert 'c3' in group2_rep_ids
499499

500500
def test_participant_stats(self):
501-
"""Test participant statistics calculation."""
501+
"""Test participant statistics calculation via vectorized method."""
502502
# Create a test vote matrix
503503
vote_data = np.array([
504504
[1, 1, -1, None], # Participant 1
505505
[1, 1, -1, 1], # Participant 2
506506
[-1, -1, 1, -1], # Participant 3
507507
[-1, -1, 1, 1] # Participant 4
508508
])
509-
509+
510510
row_names = ['p1', 'p2', 'p3', 'p4']
511511
col_names = ['c1', 'c2', 'c3', 'c4']
512-
512+
513513
vote_matrix = pd.DataFrame(vote_data, index=row_names, columns=col_names)
514-
515-
# Create group clusters
514+
515+
# Create group clusters (vectorized method requires 'center' key)
516516
group_clusters = [
517-
{'id': 1, 'members': ['p1', 'p2']},
518-
{'id': 2, 'members': ['p3', 'p4']}
517+
{'id': 1, 'members': ['p1', 'p2'], 'center': [0.0]},
518+
{'id': 2, 'members': ['p3', 'p4'], 'center': [0.0]}
519519
]
520-
521-
# Calculate participant stats
522-
ptpt_stats = participant_stats(vote_matrix, group_clusters)
523-
520+
521+
# Calculate participant stats using vectorized method
522+
conv = Conversation("test")
523+
ptpt_stats = conv._compute_participant_info_optimized(vote_matrix, group_clusters)
524+
524525
# Check result structure
525526
assert 'participant_ids' in ptpt_stats
526527
assert 'stats' in ptpt_stats
527-
528+
528529
# Check participant stats
529530
for ptpt_id in row_names:
530531
assert ptpt_id in ptpt_stats['stats']
531532
stats = ptpt_stats['stats'][ptpt_id]
532-
533+
533534
assert 'n_agree' in stats
534535
assert 'n_disagree' in stats
535536
assert 'n_votes' in stats
536537
assert 'group' in stats
537538
assert 'group_correlations' in stats
538-
539+
539540
# Check specific stats
540541
p1_stats = ptpt_stats['stats']['p1']
541542
assert p1_stats['n_agree'] == 2

0 commit comments

Comments
 (0)