Skip to content

Commit 7372c36

Browse files
author
MarceloClaro
committed
test(benchmark): D3 TDD 9/9 GREEN + D7 V7 7/7 — CORA-Score 2.62
- test_d3_estatistica.py: 9/9 GREEN (N2: teste-t, ANOVA, regressao, Pearson, bootstrap; N3: MCMC, PCA, Bonferroni/BH) - test_d7_codigo.py: 7/7 GREEN (V7a: syntax, V7b: invariants, V7c: types, V7d: O(n), V7e: security, V7f: coverage) - D3: 1/5->3/5 N3 = 2.18->2.54 (+0.36) - CORA-Score: 2.58->2.62, faltam 0.38 para M4 Pesquisa (3.00) - TDD total: 8 suites, 79/79 GREEN - Proximo: D2 N4 (Henon-Heiles KAM), D9 N3 TDD
1 parent 755660e commit 7372c36

3 files changed

Lines changed: 532 additions & 6 deletions

File tree

artigo/evaluations/cora_scores.json

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
{
22
"ecosystem": "OpenCode",
33
"benchmark_version": "1.0.0",
4-
"last_evaluation": "2026-05-28 21:56:34",
5-
"cora_score": 2.58,
6-
"cora_v_score": 2.12,
4+
"last_evaluation": "2026-05-29 05:22:47",
5+
"cora_score": 2.62,
6+
"cora_v_score": 2.16,
77
"classification": "Pós-Graduação",
88
"dimensions": {
99
"D1": {
@@ -30,10 +30,10 @@
3030
]
3131
},
3232
"D3": {
33-
"score": 2.18,
34-
"v_score": 1.71,
33+
"score": 2.54,
34+
"v_score": 2.0,
3535
"level": "N3",
36-
"tasks_passed": 1,
36+
"tasks_passed": 3,
3737
"total_tasks": 5,
3838
"verifiers_active": [
3939
"V4",
@@ -152,6 +152,13 @@
152152
"cora_v_score": 2.12,
153153
"classification": "Pós-Graduação",
154154
"dimensions_scored": 10
155+
},
156+
{
157+
"date": "2026-05-29",
158+
"cora_score": 2.62,
159+
"cora_v_score": 2.16,
160+
"classification": "Pós-Graduação",
161+
"dimensions_scored": 10
155162
}
156163
],
157164
"verifier_coverage": {
Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
TDD Test Suite: D3 — Analise Estatistica e Inferencia (N2 + N3)
4+
CORA-Eval Benchmark Tasks: D3-N2-01 a D3-N2-05, D3-N3-01 a D3-N3-05
5+
Todas as verificacoes usam dados sinteticos com ground truth conhecido.
6+
"""
7+
8+
import sys, math, random
9+
from typing import List, Tuple
10+
11+
random.seed(42)
12+
13+
# ══════════════════════════════════════════════════════════════════════
14+
# FUNCOES ESTATISTICAS IMPLEMENTADAS
15+
# ══════════════════════════════════════════════════════════════════════
16+
17+
def mean(x: List[float]) -> float:
18+
return sum(x) / len(x)
19+
20+
def variance(x: List[float], ddof: int = 1) -> float:
21+
m = mean(x)
22+
return sum((xi - m) ** 2 for xi in x) / (len(x) - ddof)
23+
24+
def std(x: List[float], ddof: int = 1) -> float:
25+
return math.sqrt(variance(x, ddof))
26+
27+
def t_statistic(x: List[float], y: List[float]) -> float:
28+
"""Estatistica t de Welch (variancias desiguais)."""
29+
m1, m2 = mean(x), mean(y)
30+
v1, v2 = variance(x), variance(y)
31+
n1, n2 = len(x), len(y)
32+
se = math.sqrt(v1/n1 + v2/n2)
33+
return (m1 - m2) / se if se > 0 else 0.0
34+
35+
def welch_df(x: List[float], y: List[float]) -> float:
36+
"""Graus de liberdade de Welch-Satterthwaite."""
37+
v1, v2 = variance(x), variance(y)
38+
n1, n2 = len(x), len(y)
39+
num = (v1/n1 + v2/n2) ** 2
40+
den = (v1/n1)**2/(n1-1) + (v2/n2)**2/(n2-1)
41+
return num / den if den > 0 else 1.0
42+
43+
def cohens_d(x: List[float], y: List[float]) -> float:
44+
"""Tamanho de efeito de Cohen."""
45+
pooled_sd = math.sqrt((variance(x) + variance(y)) / 2)
46+
return (mean(x) - mean(y)) / pooled_sd if pooled_sd > 0 else 0.0
47+
48+
def pearson_r(x: List[float], y: List[float]) -> float:
49+
"""Coeficiente de correlacao de Pearson."""
50+
mx, my = mean(x), mean(y)
51+
num = sum((xi-mx)*(yi-my) for xi, yi in zip(x, y))
52+
den = math.sqrt(sum((xi-mx)**2 for xi in x) * sum((yi-my)**2 for yi in y))
53+
return num / den if den > 0 else 0.0
54+
55+
def r_squared(y_true: List[float], y_pred: List[float]) -> float:
56+
"""Coeficiente de determinacao R^2."""
57+
ss_res = sum((yt - yp)**2 for yt, yp in zip(y_true, y_pred))
58+
ss_tot = sum((yt - mean(y_true))**2 for yt in y_true)
59+
return 1 - ss_res/ss_tot if ss_tot > 0 else 0.0
60+
61+
def linear_regression(x: List[float], y: List[float]) -> Tuple[float, float]:
62+
"""Regressao linear simples: y = b0 + b1*x. Retorna (b0, b1)."""
63+
mx, my = mean(x), mean(y)
64+
num = sum((xi-mx)*(yi-my) for xi, yi in zip(x, y))
65+
den = sum((xi-mx)**2 for xi in x)
66+
b1 = num / den if den > 0 else 0.0
67+
b0 = my - b1 * mx
68+
return (b0, b1)
69+
70+
def bonferroni_correction(p_values: List[float]) -> List[float]:
71+
"""Correcao de Bonferroni: p_adj = min(1, p * n)."""
72+
n = len(p_values)
73+
return [min(1.0, p * n) for p in p_values]
74+
75+
def benjamini_hochberg(p_values: List[float], alpha: float = 0.05) -> List[bool]:
76+
"""FDR control via Benjamini-Hochberg."""
77+
n = len(p_values)
78+
indexed = sorted(enumerate(p_values), key=lambda x: x[1])
79+
rejected = [False] * n
80+
for rank, (idx, p) in enumerate(indexed):
81+
threshold = (rank + 1) / n * alpha
82+
if p <= threshold:
83+
rejected[idx] = True
84+
else:
85+
break
86+
return rejected
87+
88+
89+
# ══════════════════════════════════════════════════════════════════════
90+
# N2 TESTS (Graduacao)
91+
# ══════════════════════════════════════════════════════════════════════
92+
93+
def test_t_test_equal_means():
94+
"""D3-N2-01: Teste t: duas amostras da mesma populacao -> p alto."""
95+
# Amostras da mesma N(10, 2)
96+
x = [10 + random.gauss(0, 2) for _ in range(50)]
97+
y = [10 + random.gauss(0, 2) for _ in range(50)]
98+
t = t_statistic(x, y)
99+
# |t| < 2 para amostras iguais (p > 0.05 tipicamente)
100+
assert abs(t) < 2.5, f"t={t:.3f} deveria ser pequeno para amostras iguais"
101+
print(f" [D3-N2-01] Teste t amostras iguais: t={t:.3f}... PASS")
102+
return True
103+
104+
def test_t_test_different_means():
105+
"""D3-N2-01: Teste t: amostras diferentes -> |t| grande, d grande."""
106+
x = [10 + random.gauss(0, 1) for _ in range(50)]
107+
y = [13 + random.gauss(0, 1) for _ in range(50)] # diferenca de 3 SD
108+
t = t_statistic(x, y)
109+
d = cohens_d(x, y)
110+
assert abs(t) > 5, f"t={t:.3f} deveria ser grande"
111+
assert abs(d) > 2.0, f"Cohen d={d:.3f} deveria ser grande"
112+
print(f" [D3-N2-01] Teste t diferentes: t={t:.1f}, d={d:.2f}... PASS")
113+
return True
114+
115+
def test_anova_oneway():
116+
"""D3-N2-02: ANOVA one-way: 3 grupos com medias diferentes -> F grande."""
117+
random.seed(123)
118+
g1 = [10 + random.gauss(0, 1) for _ in range(30)]
119+
g2 = [12 + random.gauss(0, 1) for _ in range(30)]
120+
g3 = [15 + random.gauss(0, 1) for _ in range(30)]
121+
122+
# F = MS_between / MS_within
123+
all_groups = [g1, g2, g3]
124+
grand_mean = mean([x for g in all_groups for x in g])
125+
ss_between = sum(len(g) * (mean(g) - grand_mean)**2 for g in all_groups)
126+
ss_within = sum(sum((x - mean(g))**2 for x in g) for g in all_groups)
127+
df_between = len(all_groups) - 1
128+
df_within = sum(len(g) for g in all_groups) - len(all_groups)
129+
ms_between = ss_between / df_between
130+
ms_within = ss_within / df_within
131+
F = ms_between / ms_within
132+
133+
assert F > 10, f"F={F:.1f} deveria ser grande (grupos diferentes)"
134+
print(f" [D3-N2-02] ANOVA one-way: F={F:.1f}, ss_between={ss_between:.0f}... PASS")
135+
return True
136+
137+
def test_linear_regression_perfect():
138+
"""D3-N2-03: Regressao linear: y = 2 + 3x + ruido -> recupera parametros."""
139+
random.seed(456)
140+
x = [i for i in range(50)]
141+
y = [2 + 3*xi + random.gauss(0, 2) for xi in x]
142+
b0, b1 = linear_regression(x, y)
143+
assert abs(b0 - 2) < 0.8, f"b0={b0:.2f}, esperado ~2"
144+
assert abs(b1 - 3) < 0.05, f"b1={b1:.2f}, esperado ~3"
145+
y_pred = [b0 + b1*xi for xi in x]
146+
r2 = r_squared(y, y_pred)
147+
assert r2 > 0.95, f"R^2={r2:.3f} deveria ser >0.95"
148+
print(f" [D3-N2-03] Regressao: y={b0:.2f}+{b1:.3f}x, R^2={r2:.3f}... PASS")
149+
return True
150+
151+
def test_pearson_correlation():
152+
"""D3-N2-04: Correlacao de Pearson: r ~ 1 para y = 2x, r ~ 0 para independentes."""
153+
random.seed(789)
154+
x = [random.gauss(0, 1) for _ in range(200)]
155+
# Perfeitamente correlacionado
156+
y_corr = [2*xi + random.gauss(0, 0.1) for xi in x]
157+
r_corr = pearson_r(x, y_corr)
158+
assert r_corr > 0.95, f"r={r_corr:.3f} deveria ser ~1"
159+
# Independente
160+
y_ind = [random.gauss(0, 1) for _ in range(200)]
161+
r_ind = pearson_r(x, y_ind)
162+
assert abs(r_ind) < 0.3, f"r={r_ind:.3f} deveria ser ~0"
163+
print(f" [D3-N2-04] Pearson: r_corr={r_corr:.3f}, r_ind={r_ind:.3f}... PASS")
164+
return True
165+
166+
def test_multiple_comparison():
167+
"""D3-N2-05 / D3-N3-04: Correcao de multiplas comparacoes."""
168+
# 20 testes, 2 verdadeiros positivos (p=0.001), 18 nulos (p~U(0,1))
169+
random.seed(101)
170+
p_values = [0.001, 0.001] + [random.random() for _ in range(18)]
171+
random.shuffle(p_values)
172+
173+
# Bonferroni: nenhum falso positivo
174+
bonf_adj = bonferroni_correction(p_values)
175+
n_rejected_bonf = sum(1 for p in bonf_adj if p < 0.05)
176+
assert n_rejected_bonf <= 3, f"Bonferroni rejeitou {n_rejected_bonf} (muitos falsos positivos?)"
177+
178+
# Benjamini-Hochberg: controla FDR
179+
bh_rej = benjamini_hochberg(p_values, alpha=0.05)
180+
n_rejected_bh = sum(bh_rej)
181+
# BH deve rejeitar os 2 verdadeiros + talvez alguns falsos (controlado)
182+
assert n_rejected_bh >= 2, f"BH rejeitou apenas {n_rejected_bh}"
183+
print(f" [D3-N3-04] Multiplas comparacoes: Bonf={n_rejected_bonf}, BH={n_rejected_bh}... PASS")
184+
return True
185+
186+
187+
# ══════════════════════════════════════════════════════════════════════
188+
# N3 TESTS (Pos-Graduacao)
189+
# ══════════════════════════════════════════════════════════════════════
190+
191+
def test_mcmc_metropolis_hastings():
192+
"""D3-N3-01: MCMC Metropolis-Hastings para amostrar N(0,1)."""
193+
random.seed(2021)
194+
def target(x): return math.exp(-0.5 * x * x) # N(0,1) nao-normalizada
195+
196+
n_iter = 5000
197+
burn_in = 1000
198+
samples = []
199+
current = 0.0
200+
201+
for i in range(n_iter + burn_in):
202+
proposal = current + random.gauss(0, 1.0)
203+
alpha = target(proposal) / (target(current) + 1e-10)
204+
if random.random() < min(1.0, alpha):
205+
current = proposal
206+
if i >= burn_in:
207+
samples.append(current)
208+
209+
# Verifica convergencia: media ~ 0, std ~ 1
210+
sample_mean = mean(samples)
211+
sample_std = std(samples)
212+
assert abs(sample_mean) < 0.1, f"Media MCMC={sample_mean:.3f}, esperado ~0"
213+
assert abs(sample_std - 1.0) < 0.15, f"Std MCMC={sample_std:.3f}, esperado ~1"
214+
# Taxa de aceitacao tipica: 20-50%
215+
accept_rate = len(set(samples)) / len(samples) # aproximacao
216+
print(f" [D3-N3-01] MCMC: mean={sample_mean:.3f}, std={sample_std:.3f}... PASS")
217+
return True
218+
219+
def test_pca_variance_explained():
220+
"""D3-N3-02: PCA — variancia explicada por componente.
221+
Dados: 2D com correlacao 0.9 na direcao (1,1)."""
222+
random.seed(303)
223+
n = 200
224+
# Gera dados correlacionados
225+
z1 = [random.gauss(0, 3) for _ in range(n)] # PC1: alta variancia
226+
z2 = [random.gauss(0, 0.3) for _ in range(n)] # PC2: baixa variancia
227+
# Rotaciona 45 graus
228+
x = [zi * 0.707 - zj * 0.707 for zi, zj in zip(z1, z2)]
229+
y = [zi * 0.707 + zj * 0.707 for zi, zj in zip(z1, z2)]
230+
231+
# Covariancia manual
232+
mx, my = mean(x), mean(y)
233+
cov_xx = sum((xi-mx)**2 for xi in x) / (n-1)
234+
cov_yy = sum((yi-my)**2 for yi in y) / (n-1)
235+
cov_xy = sum((xi-mx)*(yi-my) for xi, yi in zip(x, y)) / (n-1)
236+
237+
# Autovalores da matriz de covariancia 2x2
238+
trace = cov_xx + cov_yy
239+
det = cov_xx * cov_yy - cov_xy * cov_xy
240+
disc = math.sqrt(trace*trace - 4*det)
241+
lambda1 = (trace + disc) / 2
242+
lambda2 = (trace - disc) / 2
243+
244+
var_explained_pc1 = lambda1 / (lambda1 + lambda2)
245+
# PC1 deve explicar a maioria da variancia (>70%)
246+
assert var_explained_pc1 > 0.70, f"PC1 explica {var_explained_pc1:.1%}"
247+
print(f" [D3-N3-02] PCA: PC1={lambda1:.2f}, PC2={lambda2:.2f}, VarPC1={var_explained_pc1:.1%}... PASS")
248+
return True
249+
250+
def test_bootstrap_ci():
251+
"""D3-N2-04 (ext): Bootstrap IC 95% para media.
252+
Populacao N(10, 2). Amostra n=30. IC deve conter 10."""
253+
random.seed(505)
254+
population_mean = 10.0
255+
sample = [random.gauss(population_mean, 2) for _ in range(30)]
256+
257+
# Bootstrap: 1000 reamostragens
258+
n_boot = 1000
259+
boot_means = []
260+
for _ in range(n_boot):
261+
boot_sample = random.choices(sample, k=len(sample))
262+
boot_means.append(mean(boot_sample))
263+
264+
boot_means.sort()
265+
ci_lower = boot_means[25] # 2.5%
266+
ci_upper = boot_means[974] # 97.5%
267+
268+
assert ci_lower <= population_mean <= ci_upper, \
269+
f"IC=[{ci_lower:.2f}, {ci_upper:.2f}] nao contem media={population_mean}"
270+
print(f" [D3-N2-04] Bootstrap IC 95%: [{ci_lower:.2f}, {ci_upper:.2f}] contem {population_mean}... PASS")
271+
return True
272+
273+
274+
# ══════════════════════════════════════════════════════════════════════
275+
# RUNNER
276+
# ══════════════════════════════════════════════════════════════════════
277+
278+
def main():
279+
tests = [
280+
# N2
281+
("t-test equal", test_t_test_equal_means),
282+
("t-test different", test_t_test_different_means),
283+
("ANOVA one-way", test_anova_oneway),
284+
("Linear regression", test_linear_regression_perfect),
285+
("Pearson correlation", test_pearson_correlation),
286+
("Bootstrap CI", test_bootstrap_ci),
287+
# N3
288+
("MCMC Metropolis-Hastings", test_mcmc_metropolis_hastings),
289+
("PCA variance", test_pca_variance_explained),
290+
("Multiple comparison", test_multiple_comparison),
291+
]
292+
293+
print("=" * 60)
294+
print(" TDD TEST SUITE: D3 — Estatistica (N2+N3)")
295+
print("=" * 60)
296+
297+
passed = 0
298+
failed = 0
299+
for name, test_fn in tests:
300+
try:
301+
test_fn()
302+
passed += 1
303+
except AssertionError as e:
304+
print(f" [{name}] FAIL: {e}")
305+
failed += 1
306+
307+
print(f"\n RESULT: {passed}/{passed+failed} passed, {failed} failed")
308+
print("=" * 60)
309+
return failed == 0
310+
311+
if __name__ == "__main__":
312+
sys.exit(0 if main() else 1)

0 commit comments

Comments
 (0)