Skip to content

Commit 7fc5349

Browse files
committed
feat: bootstrap
1 parent 261c89c commit 7fc5349

5 files changed

Lines changed: 788 additions & 0 deletions

File tree

examples/example_bootstrap.ipynb

Lines changed: 417 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""
2+
Statistical inference utilities.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
__author__ = "Artem Romanyuk"
8+
__copyright__ = "Copyright (c) 2025 PySATL project"
9+
__license__ = "SPDX-License-Identifier: MIT"
10+
11+
from pysatl_core.inference.bootstrap import (
12+
Bootstrap,
13+
BootstrapResult,
14+
ClassicalResampling,
15+
ResamplingMethod,
16+
SmoothResampling,
17+
StatisticalFunctional,
18+
)
19+
20+
__all__ = [
21+
"Bootstrap",
22+
"BootstrapResult",
23+
"ClassicalResampling",
24+
"ResamplingMethod",
25+
"SmoothResampling",
26+
"StatisticalFunctional",
27+
]
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""
2+
Bootstrap inference for statistical functionals.
3+
4+
Supports classical resampling (sampling with replacement from the empirical
5+
distribution) and smooth resampling (sampling from a KDE-fitted continuous
6+
approximation).
7+
"""
8+
9+
from __future__ import annotations
10+
11+
__author__ = "Artem Romanyuk"
12+
__copyright__ = "Copyright (c) 2025 PySATL project"
13+
__license__ = "SPDX-License-Identifier: MIT"
14+
15+
from dataclasses import dataclass
16+
from typing import Protocol
17+
18+
import numpy as np
19+
from numpy.random import Generator
20+
from numpy.typing import NDArray
21+
22+
from pysatl_core.distributions.empirical.distribution import EmpiricalDistribution, EmpiricalMethod, ScipyGaussianKde
23+
24+
25+
class StatisticalFunctional(Protocol):
26+
"""A function that maps a sample to a scalar summary."""
27+
28+
def __call__(self, sample: NDArray[np.float64]) -> float: ...
29+
30+
31+
class ResamplingMethod(Protocol):
32+
"""Strategy for generating a single bootstrap resample from data."""
33+
34+
def resample(self, data: NDArray[np.float64], size: int) -> NDArray[np.float64]: ...
35+
36+
37+
class ClassicalResampling:
38+
"""Sample with replacement from the original data (discrete empirical distribution)."""
39+
40+
def __init__(self, rng: Generator | None = None) -> None:
41+
self._rng = rng if rng is not None else np.random.default_rng()
42+
43+
def resample(self, data: NDArray[np.float64], size: int) -> NDArray[np.float64]:
44+
return self._rng.choice(data, size=size, replace=True)
45+
46+
47+
class SmoothResampling:
48+
"""Sample from a KDE-fitted continuous approximation of the empirical distribution.
49+
50+
The fitted distribution is cached after the first resample call and reused
51+
across bootstrap iterations for the same data array.
52+
"""
53+
54+
def __init__(self, method: EmpiricalMethod = ScipyGaussianKde()) -> None:
55+
self._method = method
56+
self._distr: EmpiricalDistribution | None = None
57+
58+
def resample(self, data: NDArray[np.float64], size: int) -> NDArray[np.float64]:
59+
if self._distr is None or self._distr.data is not data:
60+
self._distr = EmpiricalDistribution(data, method=self._method)
61+
return self._distr.sample(size)
62+
63+
64+
@dataclass
65+
class BootstrapResult:
66+
"""Outcome of a bootstrap run for a single statistical functional.
67+
68+
Parameters
69+
----------
70+
observed:
71+
Value of the functional on the original data.
72+
replicates:
73+
Array of shape ``(B,)`` holding the functional value on each
74+
bootstrap resample.
75+
"""
76+
77+
observed: float
78+
replicates: NDArray[np.float64]
79+
80+
def standard_error(self) -> float:
81+
"""Standard deviation of the bootstrap replicates."""
82+
return float(self.replicates.std())
83+
84+
def bias(self) -> float:
85+
"""Estimated bias: mean of replicates minus the observed value."""
86+
return float(self.replicates.mean() - self.observed)
87+
88+
def confidence_interval(self, level: float = 0.95) -> tuple[float, float]:
89+
"""Percentile bootstrap confidence interval.
90+
91+
Parameters
92+
----------
93+
level:
94+
Coverage level in the open interval (0, 1). Default is 0.95.
95+
96+
Returns
97+
-------
98+
(lower, upper) bounds of the interval.
99+
100+
TODO: implement BCa (bias-corrected and accelerated) and Normal
101+
approximation methods for better accuracy in skewed or small-sample
102+
settings.
103+
"""
104+
alpha = 1.0 - level
105+
lo, hi = np.percentile(self.replicates, [100 * alpha / 2, 100 * (1 - alpha / 2)])
106+
return float(lo), float(hi)
107+
108+
109+
class Bootstrap:
110+
"""Bootstrap procedure for estimating the sampling distribution of a functional.
111+
112+
Parameters
113+
----------
114+
data:
115+
One-dimensional array of observed values.
116+
B:
117+
Number of bootstrap resamples. Default is 1000.
118+
method:
119+
Strategy for generating each resample. Defaults to
120+
:class:`ClassicalResampling` (sampling with replacement).
121+
rng:
122+
NumPy random generator. A fresh generator is created when ``None``.
123+
124+
Examples
125+
--------
126+
>>> import numpy as np
127+
>>> rng = np.random.default_rng(0)
128+
>>> data = rng.normal(0, 1, 200)
129+
>>> result = Bootstrap(data, B=500, rng=rng).run(np.mean)
130+
>>> lo, hi = result.confidence_interval()
131+
"""
132+
133+
def __init__(
134+
self,
135+
data: NDArray[np.float64],
136+
B: int = 1000,
137+
method: ResamplingMethod | None = None,
138+
rng: Generator | None = None,
139+
) -> None:
140+
self._data = np.asarray(data, dtype=float)
141+
self._B = B
142+
self._rng = rng if rng is not None else np.random.default_rng()
143+
self._method = method if method is not None else ClassicalResampling(rng=self._rng)
144+
145+
def run(self, functional: StatisticalFunctional) -> BootstrapResult:
146+
"""Run the bootstrap and return a :class:`BootstrapResult`.
147+
148+
Parameters
149+
----------
150+
functional:
151+
A callable that accepts a sample array and returns a scalar.
152+
"""
153+
observed = float(functional(self._data))
154+
n = len(self._data)
155+
replicates = np.array(
156+
[
157+
float(functional(self._method.resample(self._data, n)))
158+
for _ in range(self._B)
159+
]
160+
)
161+
return BootstrapResult(observed=observed, replicates=replicates)
162+
163+
164+
__all__ = [
165+
"Bootstrap",
166+
"BootstrapResult",
167+
"ClassicalResampling",
168+
"ResamplingMethod",
169+
"SmoothResampling",
170+
"StatisticalFunctional",
171+
]

tests/unit/inference/__init__.py

Whitespace-only changes.
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""
2+
Unit tests for Bootstrap, BootstrapResult, ClassicalResampling, SmoothResampling.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
__author__ = "Artem Romanyuk"
8+
__copyright__ = "Copyright (c) 2025 PySATL project"
9+
__license__ = "SPDX-License-Identifier: MIT"
10+
11+
import numpy as np
12+
import pytest
13+
from numpy.typing import NDArray
14+
15+
from pysatl_core.inference import (
16+
Bootstrap,
17+
BootstrapResult,
18+
ClassicalResampling,
19+
SmoothResampling,
20+
)
21+
22+
23+
@pytest.fixture
24+
def rng() -> np.random.Generator:
25+
return np.random.default_rng(42)
26+
27+
28+
@pytest.fixture
29+
def data(rng: np.random.Generator) -> NDArray[np.float64]:
30+
return rng.normal(0.0, 1.0, 100)
31+
32+
33+
class TestClassicalResampling:
34+
def test_returns_correct_size(self, data: NDArray[np.float64]) -> None:
35+
result = ClassicalResampling().resample(data, len(data))
36+
assert result.shape == (len(data),)
37+
38+
def test_custom_size(self, data: NDArray[np.float64]) -> None:
39+
result = ClassicalResampling().resample(data, 50)
40+
assert result.shape == (50,)
41+
42+
def test_samples_only_from_original_data(self, data: NDArray[np.float64]) -> None:
43+
result = ClassicalResampling().resample(data, len(data))
44+
assert all(v in data for v in result)
45+
46+
def test_can_repeat_elements(self) -> None:
47+
tiny = np.array([1.0, 2.0])
48+
method = ClassicalResampling()
49+
seen_repeats = False
50+
for _ in range(50):
51+
sample = method.resample(tiny, 10)
52+
if len(set(sample.tolist())) < len(sample):
53+
seen_repeats = True
54+
break
55+
assert seen_repeats
56+
57+
def test_reproducible_with_same_seed(self, data: NDArray[np.float64]) -> None:
58+
r1 = ClassicalResampling(rng=np.random.default_rng(0)).resample(data, len(data))
59+
r2 = ClassicalResampling(rng=np.random.default_rng(0)).resample(data, len(data))
60+
assert np.array_equal(r1, r2)
61+
62+
def test_different_seeds_give_different_results(self, data: NDArray[np.float64]) -> None:
63+
r1 = ClassicalResampling(rng=np.random.default_rng(0)).resample(data, len(data))
64+
r2 = ClassicalResampling(rng=np.random.default_rng(1)).resample(data, len(data))
65+
assert not np.array_equal(r1, r2)
66+
67+
68+
class TestSmoothResampling:
69+
def test_returns_correct_size(self, data: NDArray[np.float64]) -> None:
70+
result = SmoothResampling().resample(data, len(data))
71+
assert result.shape == (len(data),)
72+
73+
def test_custom_size(self, data: NDArray[np.float64]) -> None:
74+
result = SmoothResampling().resample(data, 30)
75+
assert result.shape == (30,)
76+
77+
def test_caches_distribution_for_same_data(self, data: NDArray[np.float64]) -> None:
78+
method = SmoothResampling()
79+
method.resample(data, 10)
80+
distr_first = method._distr
81+
method.resample(data, 10)
82+
assert method._distr is distr_first
83+
84+
def test_refits_when_data_changes(self, data: NDArray[np.float64]) -> None:
85+
method = SmoothResampling()
86+
method.resample(data, 10)
87+
other = data * 2.0
88+
method.resample(other, 10)
89+
assert method._distr is not None
90+
assert method._distr.data is other
91+
92+
93+
class TestBootstrapResult:
94+
@pytest.fixture
95+
def result(self) -> BootstrapResult:
96+
replicates = np.random.default_rng(0).normal(3.0, 0.5, 1000)
97+
return BootstrapResult(observed=3.1, replicates=replicates)
98+
99+
def test_standard_error_matches_std(self, result: BootstrapResult) -> None:
100+
assert result.standard_error() == pytest.approx(result.replicates.std())
101+
102+
def test_bias_formula(self, result: BootstrapResult) -> None:
103+
expected = float(result.replicates.mean()) - result.observed
104+
assert result.bias() == pytest.approx(expected)
105+
106+
def test_confidence_interval_lower_less_than_upper(self, result: BootstrapResult) -> None:
107+
lo, hi = result.confidence_interval()
108+
assert lo < hi
109+
110+
def test_confidence_interval_matches_percentiles(self, result: BootstrapResult) -> None:
111+
lo, hi = result.confidence_interval(level=0.95)
112+
assert lo == pytest.approx(float(np.percentile(result.replicates, 2.5)))
113+
assert hi == pytest.approx(float(np.percentile(result.replicates, 97.5)))
114+
115+
def test_confidence_interval_custom_level(self, result: BootstrapResult) -> None:
116+
lo, hi = result.confidence_interval(level=0.90)
117+
assert lo == pytest.approx(float(np.percentile(result.replicates, 5.0)))
118+
assert hi == pytest.approx(float(np.percentile(result.replicates, 95.0)))
119+
120+
def test_bias_zero_when_replicates_centered_on_observed(self) -> None:
121+
result = BootstrapResult(observed=2.0, replicates=np.array([1.0, 2.0, 3.0]))
122+
assert result.bias() == pytest.approx(0.0)
123+
124+
125+
class TestBootstrap:
126+
def test_run_returns_bootstrap_result(self, data: NDArray[np.float64], rng: np.random.Generator) -> None:
127+
assert isinstance(Bootstrap(data, B=100, rng=rng).run(np.mean), BootstrapResult)
128+
129+
def test_observed_is_functional_on_original_data(
130+
self, data: NDArray[np.float64], rng: np.random.Generator
131+
) -> None:
132+
result = Bootstrap(data, B=100, rng=rng).run(np.mean)
133+
assert result.observed == pytest.approx(float(np.mean(data)))
134+
135+
def test_replicates_shape(self, data: NDArray[np.float64], rng: np.random.Generator) -> None:
136+
result = Bootstrap(data, B=250, rng=rng).run(np.mean)
137+
assert result.replicates.shape == (250,)
138+
139+
def test_works_with_std(self, data: NDArray[np.float64], rng: np.random.Generator) -> None:
140+
result = Bootstrap(data, B=100, rng=rng).run(np.std)
141+
assert result.observed == pytest.approx(float(np.std(data)))
142+
143+
def test_works_with_median(self, data: NDArray[np.float64], rng: np.random.Generator) -> None:
144+
result = Bootstrap(data, B=100, rng=rng).run(np.median)
145+
assert result.observed == pytest.approx(float(np.median(data)))
146+
147+
def test_works_with_custom_functional(
148+
self, data: NDArray[np.float64], rng: np.random.Generator
149+
) -> None:
150+
p90 = lambda x: float(np.percentile(x, 90))
151+
result = Bootstrap(data, B=100, rng=rng).run(p90)
152+
assert result.observed == pytest.approx(float(np.percentile(data, 90)))
153+
154+
def test_reproducible_with_same_seed(self, data: NDArray[np.float64]) -> None:
155+
r1 = Bootstrap(data, B=200, rng=np.random.default_rng(7)).run(np.mean)
156+
r2 = Bootstrap(data, B=200, rng=np.random.default_rng(7)).run(np.mean)
157+
assert np.array_equal(r1.replicates, r2.replicates)
158+
159+
def test_default_method_uses_bootstrap_rng(self, data: NDArray[np.float64]) -> None:
160+
r1 = Bootstrap(data, B=50, rng=np.random.default_rng(99)).run(np.mean)
161+
r2 = Bootstrap(data, B=50, rng=np.random.default_rng(99)).run(np.mean)
162+
assert np.array_equal(r1.replicates, r2.replicates)
163+
164+
def test_smooth_resampling_produces_correct_shape(self, data: NDArray[np.float64]) -> None:
165+
result = Bootstrap(data, B=50, method=SmoothResampling()).run(np.mean)
166+
assert result.replicates.shape == (50,)
167+
168+
def test_smooth_resampling_observed_equals_classical(
169+
self, data: NDArray[np.float64], rng: np.random.Generator
170+
) -> None:
171+
r_classical = Bootstrap(data, B=50, rng=rng).run(np.mean)
172+
r_smooth = Bootstrap(data, B=50, method=SmoothResampling()).run(np.mean)
173+
assert r_classical.observed == pytest.approx(r_smooth.observed)

0 commit comments

Comments
 (0)