@@ -97,33 +97,51 @@ def prop_test(succ: int, n: int) -> float:
9797 return 2 * math .sqrt (n_pc ) * (succ_pc / n_pc - 0.5 )
9898
9999
def two_prop_test(succ_in: int, succ_out: int, pop_in: int, pop_out: int) -> float:
    """
    Two-proportion z-test with +1 pseudocount on all inputs.

    Matches Clojure's stats/two-prop-test (stats.clj:18-33):
      (let [[succ-in succ-out pop-in pop-out] (map inc [succ-in succ-out pop-in pop-out])
            pi1 (/ succ-in pop-in)
            pi2 (/ succ-out pop-out)
            pi-hat (/ (+ succ-in succ-out) (+ pop-in pop-out))]
        ...)

    The +1 pseudocount (Laplace smoothing) regularizes the z-score for small
    samples, preventing extreme values when group sizes are tiny.

    Args:
        succ_in: Number of successes in the group (e.g., agrees)
        succ_out: Number of successes outside the group
        pop_in: Total votes in the group
        pop_out: Total votes outside the group

    Returns:
        Z-score (positive means group proportion > other proportion).
        0.0 is returned whenever the statistic is undefined: either
        population is not positive, or the pooled variance term is not
        positive (pooled proportion of exactly 0 or 1, or success counts
        that exceed their populations).
    """
    # Same validity rule as two_prop_test_vectorized: both populations must
    # be strictly positive. This also keeps the smoothed denominators
    # (pop + 1) away from zero for a population of -1.
    if pop_in <= 0 or pop_out <= 0:
        return 0.0

    # Add +1 pseudocount to all four inputs (Clojure: map inc)
    s1 = succ_in + 1
    s2 = succ_out + 1
    p1 = pop_in + 1
    p2 = pop_out + 1

    pi1 = s1 / p1
    pi2 = s2 / p2
    pi_hat = (s1 + s2) / (p1 + p2)

    variance = pi_hat * (1 - pi_hat) * (1 / p1 + 1 / p2)

    # variance == 0 covers pi_hat == 1. Clojure note (stats.clj:26-27):
    # "this isn't quite right... could actually solve this using limits" --
    # returning 0 for now, matching Clojure.
    # variance < 0 only arises from inconsistent counts (successes greater
    # than population); math.sqrt would raise ValueError there, so return 0.0
    # instead, as the vectorized version does.
    if variance <= 0:
        return 0.0

    return (pi1 - pi2) / math.sqrt(variance)
127145
128146
129147def comment_stats (votes : np .ndarray , group_members : List [int ]) -> Dict [str , Any ]:
@@ -184,15 +202,17 @@ def add_comparative_stats(comment_stats: Dict[str, Any],
184202 result ['ra' ] = result ['pa' ] / other_stats ['pa' ] if other_stats ['pa' ] > 0 else 1.0
185203 result ['rd' ] = result ['pd' ] / other_stats ['pd' ] if other_stats ['pd' ] > 0 else 1.0
186204
187- # Calculate representativeness tests
205+ # Calculate representativeness tests — pass raw counts, matching Clojure's
206+ # (stats/two-prop-test (:na in-stats) (sum :na rest-stats)
207+ # (:ns in-stats) (sum :ns rest-stats)) (repness.clj:97-100)
188208 result ['rat' ] = two_prop_test (
189- result ['pa ' ], result [ 'ns ' ],
190- other_stats [ 'pa ' ], other_stats ['ns' ]
209+ result ['na ' ], other_stats [ 'na ' ],
210+ result [ 'ns ' ], other_stats ['ns' ]
191211 )
192-
212+
193213 result ['rdt' ] = two_prop_test (
194- result ['pd ' ], result [ 'ns ' ],
195- other_stats [ 'pd ' ], other_stats ['ns' ]
214+ result ['nd ' ], other_stats [ 'nd ' ],
215+ result [ 'ns ' ], other_stats ['ns' ]
196216 )
197217
198218 return result
@@ -495,30 +515,38 @@ def prop_test_vectorized(succ: pd.Series, n: pd.Series) -> pd.Series:
495515 return z
496516
497517
def two_prop_test_vectorized(succ_in: pd.Series, succ_out: pd.Series,
                             pop_in: pd.Series, pop_out: pd.Series) -> pd.Series:
    """
    Element-wise two-proportion z-test with a +1 pseudocount on every input.

    Mirrors Clojure's stats/two-prop-test (stats.clj:18-33): each of the four
    counts is incremented by one, the in-group and out-group proportions are
    formed from the incremented counts, and their difference is divided by
    the pooled standard error.

    Args:
        succ_in: Series of success counts in the group
        succ_out: Series of success counts outside the group
        pop_in: Series of total vote counts in the group
        pop_out: Series of total vote counts outside the group

    Returns:
        Series of z-scores. Entries are 0.0 where either population is not
        positive, and wherever the raw computation yields NaN or +/-inf
        (e.g. a pooled proportion of exactly 1 gives 0/0).
    """
    # Smoothed counts (Clojure: map inc over all four inputs)
    hits_in, hits_out = succ_in + 1, succ_out + 1
    size_in, size_out = pop_in + 1, pop_out + 1

    pooled = (hits_in + hits_out) / (size_in + size_out)
    std_err = np.sqrt(pooled * (1 - pooled) * (1 / size_in + 1 / size_out))
    raw_z = (hits_in / size_in - hits_out / size_out) / std_err

    # Rows are only meaningful when both original populations are non-empty;
    # everything else, plus any NaN/inf left by a zero standard error, becomes 0.
    both_populated = (pop_in > 0) & (pop_out > 0)
    return (
        raw_z.where(both_populated, 0.0)
        .fillna(0.0)
        .replace([np.inf, -np.inf], 0.0)
    )
@@ -651,14 +679,16 @@ def compute_group_comment_stats_df(votes_long: pd.DataFrame,
651679 stats_df ['ra' ] = stats_df ['ra' ].replace ([np .inf , - np .inf ], 1.0 ).fillna (1.0 )
652680 stats_df ['rd' ] = stats_df ['rd' ].replace ([np .inf , - np .inf ], 1.0 ).fillna (1.0 )
653681
654- # Compute representativeness tests (two-proportion z-test: group vs other)
682+ # Compute representativeness tests — pass raw counts, matching Clojure's
683+ # (stats/two-prop-test (:na in-stats) (sum :na rest-stats)
684+ # (:ns in-stats) (sum :ns rest-stats)) (repness.clj:97-100)
655685 stats_df ['rat' ] = two_prop_test_vectorized (
656- stats_df ['pa ' ], stats_df ['ns ' ],
657- stats_df ['other_pa ' ], stats_df ['other_votes' ]
686+ stats_df ['na ' ], stats_df ['other_agree ' ],
687+ stats_df ['ns ' ], stats_df ['other_votes' ]
658688 )
659689 stats_df ['rdt' ] = two_prop_test_vectorized (
660- stats_df ['pd ' ], stats_df ['ns ' ],
661- stats_df ['other_pd ' ], stats_df ['other_votes' ]
690+ stats_df ['nd ' ], stats_df ['other_disagree ' ],
691+ stats_df ['ns ' ], stats_df ['other_votes' ]
662692 )
663693
664694 # Compute metrics
0 commit comments