-
Notifications
You must be signed in to change notification settings - Fork 252
[Stack 16/27] Fix D5: match Clojure prop_test formula (Wilson-score-like with +1 pseudocount) #2448
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
61d6423
cba756d
15d994b
de83253
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -60,29 +60,39 @@ def z_score_sig_95(z: float) -> bool: | |||||
| return z > Z_95 | ||||||
|
|
||||||
|
|
||||||
| def prop_test(p: float, n: int, p0: float) -> float: | ||||||
| def prop_test(succ: int, n: int) -> float: | ||||||
| """ | ||||||
| One-proportion z-test. | ||||||
|
|
||||||
| One-proportion z-test, matching Clojure's stats/prop-test (stats.clj:10-15). | ||||||
|
|
||||||
| Clojure formula: | ||||||
| (let [[succ n] (map inc [succ n])] | ||||||
| (* 2 (sqrt n) (+ (/ succ n) -0.5))) | ||||||
|
|
||||||
| Which simplifies to: 2 * sqrt(n+1) * ((succ+1)/(n+1) - 0.5) | ||||||
|
|
||||||
| This is a Wilson-score-like test with built-in +1 pseudocount (Laplace | ||||||
| smoothing). Unlike the standard z-test ((p - p0) / sqrt(p0*(1-p0)/n)), | ||||||
| the +1 terms regularize extreme values for small samples, preventing | ||||||
| spurious significance in small Polis groups. | ||||||
|
|
||||||
| Note: the pseudocount here (+1 to succ and n, i.e. Beta(1,1)) is | ||||||
| independent of the PSEUDO_COUNT used for pa/pd computation (Beta(2,2)). | ||||||
| Clojure's prop-test takes raw success counts, not pre-smoothed | ||||||
| probabilities. | ||||||
|
|
||||||
| Args: | ||||||
| p: Observed proportion | ||||||
| n: Number of observations | ||||||
| p0: Expected proportion under null hypothesis | ||||||
|
|
||||||
| succ: Number of successes (e.g. agrees or disagrees) | ||||||
| n: Total number of trials (votes seen) | ||||||
|
|
||||||
| Returns: | ||||||
| Z-score | ||||||
| Z-score (positive means succ/n > 0.5) | ||||||
|
||||||
| Z-score (positive means succ/n > 0.5) | |
| Z-score (sign determined by (succ + 1) / (n + 1) relative to 0.5; positive when (succ + 1) / (n + 1) > 0.5) |
Copilot
AI
Mar 23, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The docstring claims this matches the Clojure formula, but the implementation intentionally overrides the n=0 case to return 0.0 (instead of the formula’s 1.0 when applying the +1 terms: 2 * sqrt(0 + 1) * ((0 + 1) / (0 + 1) - 0.5) = 1.0). Please document this deviation explicitly (and ideally do the same for prop_test()), so “matches Clojure” isn’t misleading for no-data rows.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -693,23 +693,32 @@ class TestD5ProportionTest: | |
| Clojure uses Wilson-score-like: 2*sqrt(n+1)*((succ+1)/(n+1) - 0.5) | ||
|
|
||
| Clojure formula has built-in regularization via +1 terms. | ||
| After fix, prop_test(succ, n) matches Clojure exactly. | ||
| """ | ||
|
|
||
| @pytest.mark.xfail(reason="D5: Python standard z-test vs Clojure Wilson-score-like") | ||
| def test_prop_test_matches_clojure_formula(self): | ||
| """prop_test should match Clojure's formula for known inputs.""" | ||
| # Example: 12 successes out of 13 trials | ||
| succ, n = 12, 13 | ||
| # Clojure formula: 2 * sqrt(n+1) * ((succ+1)/(n+1) - 0.5) | ||
| expected = 2 * math.sqrt(n + 1) * ((succ + 1) / (n + 1) - 0.5) | ||
|
|
||
| # Current Python: prop_test(p, n, 0.5) where p = (succ + pc/2) / (n + pc) | ||
| p = (succ + PSEUDO_COUNT / 2) / (n + PSEUDO_COUNT) | ||
| python_result = prop_test(p, n, 0.5) | ||
|
|
||
| print(f"prop_test(succ={succ}, n={n}): Python={python_result:.4f}, Clojure={expected:.4f}") | ||
| check.almost_equal(python_result, expected, abs=0.01, | ||
| msg=f"prop_test mismatch: Python={python_result:.4f}, Clojure={expected:.4f}") | ||
| """prop_test(succ, n) should match Clojure's formula for known inputs.""" | ||
| test_cases = [ | ||
| (12, 13), # High success rate | ||
| (5, 8), # Moderate | ||
| (0, 10), # All failures | ||
| (10, 10), # All successes | ||
| (1, 2), # Tiny sample | ||
| (50, 100), # Larger sample | ||
| (0, 1), # Single trial, no success | ||
| (1, 1), # Single trial, success | ||
| ] | ||
| for succ, n in test_cases: | ||
| # Clojure formula: 2 * sqrt(n+1) * ((succ+1)/(n+1) - 0.5) | ||
| expected = 2 * math.sqrt(n + 1) * ((succ + 1) / (n + 1) - 0.5) | ||
| result = prop_test(succ, n) | ||
| check.almost_equal(result, expected, abs=1e-10, | ||
| msg=f"prop_test({succ}, {n}): got {result:.6f}, expected {expected:.6f}") | ||
|
|
||
| def test_prop_test_edge_cases(self): | ||
| """prop_test handles n=0 gracefully.""" | ||
| # n=0 should return 0 (no data) | ||
| assert prop_test(0, 0) == 0.0 | ||
|
|
||
| def test_clojure_pat_values_consistent_with_formula(self, clojure_blob, dataset_name): | ||
| """Sanity check: Clojure's p-test values match the documented formula.""" | ||
|
|
@@ -1155,14 +1164,13 @@ def test_z_thresholds_are_one_tailed(self): | |
| check.almost_equal(Z_95, 1.6449, abs=0.001, | ||
| msg=f"Z_95={Z_95}, expected 1.6449 (one-tailed)") | ||
|
|
||
| def test_clojure_prop_test_formula(self): | ||
| """Verify Clojure's proportion test formula: 2*sqrt(n+1)*((succ+1)/(n+1) - 0.5).""" | ||
| def test_prop_test_matches_clojure_formula_synthetic(self): | ||
| """prop_test(succ, n) should produce 2*sqrt(n+1)*((succ+1)/(n+1) - 0.5).""" | ||
| # Small n: 5 successes out of 8 trials | ||
| succ, n = 5, 8 | ||
| result = 2 * math.sqrt(n + 1) * ((succ + 1) / (n + 1) - 0.5) | ||
| # Manual: 2 * 3 * (6/9 - 0.5) = 6 * 0.1667 = 1.0 | ||
| expected = 2 * 3.0 * (6.0 / 9.0 - 0.5) | ||
| assert abs(result - expected) < 1e-10 | ||
| expected = 2 * 3.0 * (6.0 / 9.0 - 0.5) # = 1.0 | ||
| result = prop_test(succ, n) | ||
| assert abs(result - expected) < 1e-10, f"prop_test({succ}, {n})={result}, expected {expected}" | ||
|
|
||
| def test_clojure_repness_metric_product(self): | ||
| """Verify Clojure's repness metric is a product: ra * rat * pa * pat.""" | ||
|
|
@@ -1177,3 +1185,51 @@ def test_clojure_repful_uses_rat_vs_rdt(self): | |
|
|
||
| # rat < rdt → disagree | ||
| assert (0.5 < 1.5) # rat=0.5, rdt=1.5 → disagree | ||
|
|
||
|
|
||
| # ============================================================================ | ||
| # Blob Injection Tests — Compare Python functions against real Clojure values | ||
| # ============================================================================ | ||
| # | ||
| # These tests extract inputs from the Clojure math blob, feed them to Python | ||
| # functions, and compare outputs to the Clojure blob's values. This is the | ||
| # only non-tautological way to verify correctness: formula-only tests just | ||
| # re-implement our reading of the Clojure source and can't catch misreadings. | ||
| # | ||
| # Since Python and Clojure may produce different clusters (different k), we | ||
| # inject Clojure's own group memberships and vote counts from the blob, | ||
| # isolating each computation stage from upstream divergence. | ||
| # ============================================================================ | ||
|
|
||
| @pytest.mark.clojure_comparison | ||
| class TestD5BlobInjection: | ||
| """D5: Verify prop_test against real Clojure blob p-test values. | ||
|
|
||
| For each repness entry in the blob, extract n-success and n-trials, | ||
| feed to Python's prop_test(), compare to blob's p-test. | ||
| """ | ||
|
|
||
| def test_prop_test_matches_blob_p_test(self, clojure_blob, dataset_name): | ||
| """prop_test(n_success, n_trials) should match blob's p-test for every repness entry.""" | ||
| repness = clojure_blob.get('repness', {}) | ||
| if not repness: | ||
| pytest.skip(f"No repness in Clojure blob for {dataset_name}") | ||
|
|
||
| mismatches = [] | ||
| total = 0 | ||
| for gid, entries in repness.items(): | ||
| for entry in entries: | ||
| n_success = entry['n-success'] | ||
| n_trials = entry['n-trials'] | ||
| expected_p_test = entry['p-test'] | ||
| actual = prop_test(n_success, n_trials) | ||
| total += 1 | ||
| if abs(actual - expected_p_test) > 1e-4: | ||
| mismatches.append( | ||
| f"group={gid} tid={entry['tid']}: " | ||
| f"prop_test({n_success}, {n_trials})={actual:.6f}, " | ||
| f"blob p-test={expected_p_test:.6f}") | ||
|
Comment on lines +1218 to +1231
|
||
|
|
||
| assert not mismatches, ( | ||
| f"[{dataset_name}] {len(mismatches)}/{total} p-test mismatches:\n" | ||
| + "\n".join(mismatches[:10])) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This table row marks D5 “DONE” but still shows the PR number column as `—`, while earlier in the doc you added the mapping `PR 4 (D5) | #2448`. Please replace `—` with `#2448` here to keep the plan’s tracking consistent.