Skip to content

Commit 7617737

Browse files
fix: replace numpy/scipy with pure Python in experiment analytics (#6830)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ede739b commit 7617737

4 files changed

Lines changed: 410 additions & 388 deletions

File tree

api/app_analytics/experiments.py

Lines changed: 133 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818
GET /api/v1/environments/{env}/experiments/results/?feature=checkout_button
1919
"""
2020

21+
import math
22+
import random
2123
from typing import Any
2224

23-
import numpy as np
2425
from common.environments.permissions import VIEW_ENVIRONMENT
2526
from django.db.models import Count, Q
2627
from drf_spectacular.utils import OpenApiParameter, extend_schema
@@ -29,7 +30,6 @@
2930
from rest_framework.permissions import IsAuthenticated
3031
from rest_framework.request import Request
3132
from rest_framework.response import Response
32-
from scipy import stats as scipy_stats # type: ignore[import-untyped]
3333

3434
from environments.identities.traits.models import Trait
3535
from environments.models import Environment
@@ -146,43 +146,34 @@ def _calculate_two_variant_stats(
146146
"sample_size_warning": "One or more variants have zero samples",
147147
}
148148

149-
# Build contingency table
150-
table = np.array(
149+
# Build contingency table, replacing zeros with 0.5 to avoid log(0)
150+
table = [
151151
[
152-
[
153-
results[a]["conversions"],
154-
total_a - results[a]["conversions"],
155-
],
156-
[
157-
results[b]["conversions"],
158-
total_b - results[b]["conversions"],
159-
],
160-
]
161-
)
152+
max(results[a]["conversions"], 0.5),
153+
max(total_a - results[a]["conversions"], 0.5),
154+
],
155+
[
156+
max(results[b]["conversions"], 0.5),
157+
max(total_b - results[b]["conversions"], 0.5),
158+
],
159+
]
162160

163161
# G-test (log-likelihood ratio test)
164-
# Replace zeros with small values to avoid division errors
165-
table_safe = np.where(table == 0, 0.5, table)
166-
try:
167-
_, p_value, _, _ = scipy_stats.chi2_contingency(
168-
table_safe, correction=True, lambda_="log-likelihood"
169-
)
170-
except ValueError:
171-
p_value = 1.0
162+
p_value = _g_test(table)
172163

173164
# Bayesian "Chance to Win" using Monte Carlo simulation
174-
samples = 50000
175-
samples_a = np.random.beta(
176-
results[a]["conversions"] + 1,
177-
total_a - results[a]["conversions"] + 1,
178-
samples,
179-
)
180-
samples_b = np.random.beta(
181-
results[b]["conversions"] + 1,
182-
total_b - results[b]["conversions"] + 1,
183-
samples,
184-
)
185-
chance_a_wins = float((samples_a > samples_b).mean())
165+
n_samples = 10000
166+
a_wins = 0
167+
alpha_a = results[a]["conversions"] + 1
168+
beta_a = total_a - results[a]["conversions"] + 1
169+
alpha_b = results[b]["conversions"] + 1
170+
beta_b = total_b - results[b]["conversions"] + 1
171+
for _ in range(n_samples):
172+
sa = random.betavariate(alpha_a, beta_a)
173+
sb = random.betavariate(alpha_b, beta_b)
174+
if sa > sb:
175+
a_wins += 1
176+
chance_a_wins = a_wins / n_samples
186177

187178
# Determine winner and calculate lift (winner vs loser)
188179
if results[a]["rate"] > results[b]["rate"]:
@@ -239,41 +230,36 @@ def _calculate_multi_variant_stats(
239230
"sample_size_warning": f"Variant '{v}' has zero samples",
240231
}
241232

242-
# Build contingency table for all variants
243-
table = np.array(
233+
# Build contingency table, replacing zeros with 0.5 to avoid log(0)
234+
table = [
244235
[
245-
[results[v]["conversions"], results[v]["total"] - results[v]["conversions"]]
246-
for v in variants
236+
max(results[v]["conversions"], 0.5),
237+
max(results[v]["total"] - results[v]["conversions"], 0.5),
247238
]
248-
)
239+
for v in variants
240+
]
249241

250242
# G-test
251-
table_safe = np.where(table == 0, 0.5, table)
252-
try:
253-
_, p_value, _, _ = scipy_stats.chi2_contingency(
254-
table_safe, correction=True, lambda_="log-likelihood"
255-
)
256-
except ValueError:
257-
p_value = 1.0
243+
p_value = _g_test(table)
258244

259-
# Bayesian chance to win for each variant
260-
samples = 50000
261-
variant_samples = {}
262-
for v in variants:
263-
variant_samples[v] = np.random.beta(
245+
# Bayesian chance to win for each variant via Monte Carlo
246+
n_samples = 10000
247+
params = [
248+
(
264249
results[v]["conversions"] + 1,
265250
results[v]["total"] - results[v]["conversions"] + 1,
266-
samples,
267251
)
268-
269-
# Calculate chance each variant is the best
270-
chance_to_win = {}
271-
for v in variants:
272-
wins = np.ones(samples, dtype=bool)
273-
for other in variants:
274-
if other != v:
275-
wins &= variant_samples[v] > variant_samples[other]
276-
chance_to_win[v] = round(float(wins.mean()), 3)
252+
for v in variants
253+
]
254+
win_counts = [0] * len(variants)
255+
for _ in range(n_samples):
256+
draws = [random.betavariate(a, b) for a, b in params]
257+
best_idx = max(range(len(draws)), key=lambda i: draws[i])
258+
win_counts[best_idx] += 1
259+
260+
chance_to_win = {
261+
v: round(win_counts[i] / n_samples, 3) for i, v in enumerate(variants)
262+
}
277263

278264
# Find the best performing variant
279265
best_variant = max(variants, key=lambda v: results[v]["rate"])
@@ -296,6 +282,92 @@ def _calculate_multi_variant_stats(
296282
}
297283

298284

285+
def _g_test(table: list[list[float]]) -> float:
286+
"""
287+
Perform a G-test (log-likelihood ratio test) on a contingency table.
288+
289+
Returns the p-value under the chi-squared distribution.
290+
"""
291+
rows = len(table)
292+
cols = len(table[0])
293+
294+
row_totals = [sum(row) for row in table]
295+
col_totals = [sum(table[r][c] for r in range(rows)) for c in range(cols)]
296+
grand_total = sum(row_totals)
297+
298+
if grand_total == 0: # pragma: no cover
299+
return 1.0
300+
301+
g = 0.0
302+
for r in range(rows):
303+
for c in range(cols):
304+
observed = table[r][c]
305+
expected = row_totals[r] * col_totals[c] / grand_total
306+
if observed > 0 and expected > 0:
307+
g += observed * math.log(observed / expected)
308+
g *= 2
309+
310+
df = (rows - 1) * (cols - 1)
311+
if df == 0: # pragma: no cover
312+
return 1.0
313+
314+
return _chi2_sf(g, df)
315+
316+
317+
def _chi2_sf(x: float, df: int) -> float:
318+
"""Survival function (1 - CDF) for the chi-squared distribution."""
319+
if x <= 0:
320+
return 1.0
321+
return _upper_gamma_reg(df / 2.0, x / 2.0)
322+
323+
324+
def _upper_gamma_reg(a: float, x: float) -> float:
325+
"""Regularised upper incomplete gamma function Q(a, x)."""
326+
if x <= 0: # pragma: no cover
327+
return 1.0
328+
if x < a + 1:
329+
return 1.0 - _lower_gamma_series(a, x)
330+
return _upper_gamma_cf(a, x)
331+
332+
333+
def _lower_gamma_series(a: float, x: float) -> float:
334+
"""Regularised lower incomplete gamma P(a, x) via series expansion."""
335+
ap = a
336+
total = 1.0 / a
337+
term = 1.0 / a
338+
for _ in range(300):
339+
ap += 1
340+
term *= x / ap
341+
total += term
342+
if abs(term) < abs(total) * 1e-15:
343+
break
344+
return total * math.exp(-x + a * math.log(x) - math.lgamma(a))
345+
346+
347+
def _upper_gamma_cf(a: float, x: float) -> float:
348+
"""Regularised upper incomplete gamma Q(a, x) via continued fraction."""
349+
tiny = 1e-30
350+
b = x + 1.0 - a
351+
c = 1.0 / tiny
352+
d = 1.0 / b
353+
h = d
354+
for i in range(1, 300):
355+
an = -i * (i - a)
356+
b += 2.0
357+
d = an * d + b
358+
if abs(d) < tiny: # pragma: no cover
359+
d = tiny
360+
c = b + an / c
361+
if abs(c) < tiny: # pragma: no cover
362+
c = tiny
363+
d = 1.0 / d
364+
delta = d * c
365+
h *= delta
366+
if abs(delta - 1.0) < 1e-15:
367+
break
368+
return h * math.exp(-x + a * math.log(x) - math.lgamma(a))
369+
370+
299371
@extend_schema(
300372
parameters=[
301373
OpenApiParameter(

0 commit comments

Comments
 (0)