Skip to content

Commit 4d331b9

Browse files
committed
feat: empirical distribution
1 parent 866ef2c commit 4d331b9

9 files changed

Lines changed: 1062 additions & 0 deletions

File tree

src/pysatl_core/distributions/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@
2020
FitterMethod,
2121
)
2222
from .distribution import Distribution
23+
from .empirical import (
24+
EmpiricalComputationStrategy,
25+
EmpiricalDistribution,
26+
EmpiricalMethod,
27+
FittedEmpirical,
28+
ScipyGaussianKde,
29+
)
2330
from .registry import *
2431
from .registry import __all__ as _registry_all
2532
from .strategies import (
@@ -39,6 +46,12 @@
3946
"EvaluatorMethod",
4047
# distribution
4148
"Distribution",
49+
# empirical
50+
"EmpiricalComputationStrategy",
51+
"EmpiricalDistribution",
52+
"EmpiricalMethod",
53+
"FittedEmpirical",
54+
"ScipyGaussianKde",
4255
# strategies
4356
"ComputationStrategy",
4457
"DefaultComputationStrategy",
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
"""
2+
Empirical distribution and its computation strategy.
3+
"""
4+
5+
__author__ = "Artem Romanyuk"
6+
__copyright__ = "Copyright (c) 2025 PySATL project"
7+
__license__ = "SPDX-License-Identifier: MIT"
8+
9+
from pysatl_core.distributions.empirical.distribution import (
10+
EmpiricalDistribution,
11+
EmpiricalMethod,
12+
FittedEmpirical,
13+
ScipyGaussianKde,
14+
)
15+
from pysatl_core.distributions.empirical.strategy import EmpiricalComputationStrategy
16+
17+
__all__ = [
18+
"EmpiricalComputationStrategy",
19+
"EmpiricalDistribution",
20+
"EmpiricalMethod",
21+
"FittedEmpirical",
22+
"ScipyGaussianKde",
23+
]
Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
"""
2+
Empirical Distribution
3+
4+
Wraps a density estimator (e.g. KDE) built from observed data into a
5+
``Distribution`` that integrates with the characteristic graph.
6+
Providing only a PDF analytical computation is enough — the graph
7+
derives CDF and PPF automatically via numerical fitters.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
__author__ = "Artem Romanyuk"
13+
__copyright__ = "Copyright (c) 2025 PySATL project"
14+
__license__ = "SPDX-License-Identifier: MIT"
15+
16+
from dataclasses import dataclass
17+
from typing import TYPE_CHECKING, Any, Literal, Protocol, cast
18+
19+
import numpy as np
20+
from numpy.typing import NDArray
21+
22+
from pysatl_core.distributions.computations.computation import AnalyticalComputation
23+
from pysatl_core.distributions.distribution import _KEEP, Distribution
24+
from pysatl_core.distributions.empirical.strategy import EmpiricalComputationStrategy
25+
from pysatl_core.distributions.strategies import ComputationStrategy, SamplingStrategy
26+
from pysatl_core.sampling.unuran.core.unuran_sampling_strategy import DefaultUnuranSamplingStrategy
27+
from pysatl_core.types import CharacteristicName, ComputationFunc, UnivariateContinuous
28+
29+
if TYPE_CHECKING:
30+
from pysatl_core.distributions.support import Support
31+
32+
33+
class FittedEmpirical(Protocol):
34+
"""A fitted empirical density estimator that can evaluate PDF and CDF."""
35+
36+
def pdf(self, x: NDArray[np.float64]) -> NDArray[np.float64]:
37+
"""Evaluate the probability density function at points *x*."""
38+
...
39+
40+
def cdf(self, x: NDArray[np.float64]) -> NDArray[np.float64]:
41+
"""Evaluate the cumulative distribution function at points *x*."""
42+
...
43+
44+
45+
class EmpiricalMethod(Protocol):
46+
"""Strategy for constructing a :class:`FittedEmpirical` from a data sample."""
47+
48+
def fit(self, sample: NDArray[np.float64]) -> FittedEmpirical:
49+
"""Fit the estimator to *sample* and return an evaluable estimator."""
50+
...
51+
52+
53+
@dataclass(frozen=True)
54+
class ScipyGaussianKde:
55+
"""
56+
Gaussian KDE via :func:`scipy.stats.gaussian_kde`.
57+
58+
Parameters
59+
----------
60+
bandwidth : float or {"scott", "silverman"}, default "scott"
61+
Bandwidth selection method or explicit scalar value.
62+
"""
63+
64+
bandwidth: float | Literal["scott", "silverman"] = "scott"
65+
66+
def fit(self, sample: NDArray[np.float64]) -> FittedEmpirical:
67+
from scipy.stats import gaussian_kde
68+
69+
return _ScipyFittedKde(gaussian_kde(sample, bw_method=self.bandwidth))
70+
71+
72+
class _ScipyFittedKde:
73+
"""Thin wrapper that adapts ``scipy.stats.gaussian_kde`` to ``FittedEmpirical``."""
74+
75+
def __init__(self, kde: Any) -> None:
76+
self._kde = kde
77+
78+
def pdf(self, x: NDArray[np.float64]) -> NDArray[np.float64]:
79+
scalar_input = np.ndim(x) == 0
80+
x_arr = np.atleast_1d(np.asarray(x, dtype=float))
81+
finite = np.isfinite(x_arr)
82+
result = np.zeros_like(x_arr)
83+
if finite.any():
84+
result[finite] = self._kde.pdf(x_arr[finite])
85+
return result[0] if scalar_input else result
86+
87+
def cdf(self, x: NDArray[np.float64]) -> NDArray[np.float64]:
88+
scalar_input = np.ndim(x) == 0
89+
x_arr = np.atleast_1d(np.asarray(x, dtype=float))
90+
result = np.where(x_arr == np.inf, 1.0, np.where(x_arr == -np.inf, 0.0, np.nan))
91+
finite = np.isfinite(x_arr)
92+
if finite.any():
93+
result[finite] = np.array(
94+
[self._kde.integrate_box_1d(-np.inf, xi) for xi in x_arr[finite]],
95+
dtype=float,
96+
)
97+
return result[0] if scalar_input else result
98+
99+
100+
class EmpiricalDistribution(Distribution):
101+
"""
102+
A continuous univariate distribution built from an empirical density
103+
estimator (KDE by default) fitted to observed data.
104+
105+
The PDF is provided analytically via the estimator; CDF and PPF
106+
are derived automatically by the characteristic graph (numerical
107+
integration and root-finding respectively).
108+
109+
Parameters
110+
----------
111+
sample : NDArray[np.float64]
112+
One-dimensional array of observed values used to fit the estimator.
113+
method : EmpiricalMethod, default ScipyGaussianKde()
114+
Strategy used to construct the empirical density estimator.
115+
support : Support or None, default None
116+
Explicit support for the distribution. ``None`` leaves the support
117+
unrestricted, which is the natural choice for Gaussian KDE.
118+
sampling_strategy : SamplingStrategy or None, default None
119+
Overrides the default inverse-transform sampling strategy.
120+
computation_strategy : ComputationStrategy or None, default None
121+
Overrides the default graph-based computation strategy.
122+
123+
Examples
124+
--------
125+
>>> import numpy as np
126+
>>> rng = np.random.default_rng(0)
127+
>>> sample = rng.normal(0, 1, 500)
128+
>>> distr = EmpiricalDistribution(sample)
129+
>>> distr.calculate_characteristic("pdf", np.array([0.0]))
130+
array([0.39...])
131+
"""
132+
133+
def __init__(
134+
self,
135+
sample: NDArray[np.float64],
136+
method: EmpiricalMethod = ScipyGaussianKde(),
137+
support: Support | None = None,
138+
sampling_strategy: SamplingStrategy | None = None,
139+
computation_strategy: ComputationStrategy | None = None,
140+
) -> None:
141+
self._sample = np.asarray(sample, dtype=float)
142+
self._method = method
143+
self._estimator = method.fit(self._sample)
144+
145+
# TODO: DefaultUnuranSamplingStrategy fallback calls DefaultSamplingUnivariateStrategy
146+
# which passes the full U array to graph-derived PPF (scalar-only) and will fail.
147+
# Fix by making the fallback use a scalar loop instead of ppf(U).
148+
super().__init__(
149+
distribution_type=UnivariateContinuous,
150+
analytical_computations=self._build_analytical_computations(),
151+
support=support,
152+
sampling_strategy=sampling_strategy or DefaultUnuranSamplingStrategy(),
153+
computation_strategy=computation_strategy or EmpiricalComputationStrategy(),
154+
)
155+
156+
def _pdf(self, x: NDArray[np.float64], **_options: Any) -> NDArray[np.float64]:
157+
"""Indirection: always reads the current ``_estimator``.
158+
159+
Used as the analytical PDF func so swapping ``_estimator`` (via
160+
:meth:`set_method`) takes effect without rebuilding analytical
161+
computations or graph loop edges.
162+
"""
163+
return self._estimator.pdf(x)
164+
165+
def _cdf(self, x: NDArray[np.float64], **_options: Any) -> NDArray[np.float64]:
166+
"""Indirection: always reads the current ``_estimator``. See :meth:`_pdf`."""
167+
return self._estimator.cdf(x)
168+
169+
def _build_analytical_computations(
170+
self,
171+
) -> dict[str, AnalyticalComputation[Any, Any]]:
172+
"""Single source of truth for the analytical PDF/CDF entries."""
173+
result: dict[str, AnalyticalComputation[Any, Any]] = {
174+
CharacteristicName.PDF: AnalyticalComputation(
175+
target=CharacteristicName.PDF,
176+
func=cast(ComputationFunc[Any, Any], self._pdf),
177+
),
178+
CharacteristicName.CDF: AnalyticalComputation(
179+
target=CharacteristicName.CDF,
180+
func=cast(ComputationFunc[Any, Any], self._cdf),
181+
),
182+
}
183+
return result
184+
185+
@property
186+
def data(self) -> NDArray[np.float64]:
187+
"""The original data sample used to fit this distribution."""
188+
return self._sample
189+
190+
@property
191+
def method(self) -> EmpiricalMethod:
192+
"""The empirical method currently configured on this distribution."""
193+
return self._method
194+
195+
def with_method(self, method: EmpiricalMethod) -> EmpiricalDistribution:
196+
"""
197+
Return a clone of this distribution with a different empirical method.
198+
199+
The clone refits the new method on the same sample (shared array, no copy).
200+
Strategies are deep-copied like in any other ``with_*`` clone, so the new
201+
instance starts with empty caches and a fresh sampler — independent of
202+
anything the original may have memoised.
203+
204+
Use this in preference to :meth:`set_method` when you want to compare
205+
methods side-by-side or keep the original distribution intact.
206+
"""
207+
return self._clone_with_strategies(method=method)
208+
209+
def set_method(self, method: EmpiricalMethod) -> None:
210+
"""
211+
Replace the empirical method in place.
212+
213+
Refits ``method`` on the original sample and rebinds the underlying
214+
estimator. The graph's analytical loops do not need to be rebuilt:
215+
:meth:`_pdf` and :meth:`_cdf` always read the current estimator,
216+
and :class:`EmpiricalComputationStrategy` clears its fitted-method
217+
cache automatically when it notices the estimator object has changed.
218+
219+
Sampling strategies that hold cached state (notably
220+
:class:`DefaultUnuranSamplingStrategy`, whose generator is built once
221+
on the PDF at the time of first sample) are reset via their
222+
``invalidate()`` method when present. Strategies without
223+
``invalidate`` are left untouched — the assumption is that they hold
224+
no per-distribution state.
225+
226+
Notes
227+
-----
228+
Any external code that holds a direct reference to internal sampler
229+
state (e.g. a value previously read from
230+
``distr.sampling_strategy._sampler``) keeps that reference alive and
231+
will continue to sample from the *previous* distribution. Re-acquire
232+
such references after calling :meth:`set_method`.
233+
234+
For side-by-side comparison of methods, prefer :meth:`with_method`.
235+
"""
236+
self._method = method
237+
self._estimator = method.fit(self._sample)
238+
# Computation cache: auto-invalidated on next query via estimator-id
239+
# tracking in EmpiricalComputationStrategy. Custom strategies that
240+
# implement an invalidate() hook get a chance to reset, too.
241+
getattr(self._computation_strategy, "invalidate", lambda: None)()
242+
# Sampling cache: must be reset explicitly — UNURAN's C-side init
243+
# captured the old PDF and cannot be patched in place.
244+
getattr(self._sampling_strategy, "invalidate", lambda: None)()
245+
246+
def _clone_with_strategies(
247+
self,
248+
*,
249+
sampling_strategy: SamplingStrategy | None | object = _KEEP,
250+
computation_strategy: ComputationStrategy | None | object = _KEEP,
251+
method: EmpiricalMethod | object = _KEEP,
252+
) -> EmpiricalDistribution:
253+
clone = object.__new__(EmpiricalDistribution)
254+
clone._sample = self._sample
255+
if method is _KEEP:
256+
clone._method = self._method
257+
clone._estimator = self._estimator
258+
else:
259+
new_method = cast(EmpiricalMethod, method)
260+
clone._method = new_method
261+
clone._estimator = new_method.fit(self._sample)
262+
Distribution.__init__(
263+
clone,
264+
distribution_type=UnivariateContinuous,
265+
analytical_computations=clone._build_analytical_computations(),
266+
support=self.support,
267+
sampling_strategy=self._new_sampling_strategy(sampling_strategy=sampling_strategy),
268+
computation_strategy=self._new_computation_strategy(
269+
computation_strategy=computation_strategy
270+
),
271+
)
272+
return clone
273+
274+
275+
__all__ = [
276+
"EmpiricalDistribution",
277+
"EmpiricalMethod",
278+
"FittedEmpirical",
279+
"ScipyGaussianKde",
280+
]

0 commit comments

Comments
 (0)