Skip to content

Commit 7bfb3ce

Browse files
committed
Adding kdeRandom() method
1 parent a807f35 commit 7bfb3ce

4 files changed

Lines changed: 330 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## 1.2.3 - WIP
44
- Adding `kde()` method for Kernel Density Estimation — returns a closure that estimates PDF or CDF from sample data, supporting 9 kernel functions with aliases
5+
- Adding `kdeRandom()` method for random sampling from a Kernel Density Estimate — returns a closure that generates random floats from the KDE distribution
56

67
## 1.2.2 - 2026-02-21
78
- Adding `method` parameter to `quantiles()` supporting `'exclusive'` (default) and `'inclusive'` interpolation methods

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ The various mathematical statistics are listed below:
8383
| `covariance()` | the sample covariance of two inputs |
8484
| `linearRegression()` | return the slope and intercept of simple linear regression parameters estimated using ordinary least squares (supports `proportional: true` for regression through the origin) |
8585
| `kde()` | kernel density estimation — returns a closure that estimates the probability density (or CDF) at any point |
86+
| `kdeRandom()` | random sampling from a kernel density estimate — returns a closure that generates random floats from the KDE distribution |
8687

8788
#### Stat::mean( array $data )
8889
Return the sample arithmetic mean of the array _$data_.
@@ -429,6 +430,29 @@ $F(2.5);
429430
// estimated CDF at x = 2.5 (probability that a value is <= 2.5)
430431
```
431432

433+
#### Stat::kdeRandom ( array $data , float $h , string $kernel = 'normal' , ?int $seed = null )
434+
Generate random samples from a Kernel Density Estimate.
435+
Returns a `Closure` that, when called, produces a random float drawn from the KDE distribution defined by the data and bandwidth.
436+
437+
Supported kernels: `normal` (alias `gauss`), `logistic`, `sigmoid`, `rectangular` (alias `uniform`), `triangular`, `parabolic` (alias `epanechnikov`), `quartic` (alias `biweight`), `triweight`, `cosine`.
438+
439+
```php
440+
$data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2];
441+
$rand = Stat::kdeRandom($data, h: 1.5, seed: 8675309);
442+
$samples = [];
443+
for ($i = 0; $i < 10; $i++) {
444+
$samples[] = round($rand(), 1);
445+
}
446+
// [2.5, 3.3, -1.8, 7.3, -2.1, 4.6, 4.4, 5.9, -3.2, -1.6]
447+
```
448+
449+
Using a different kernel:
450+
451+
```php
452+
$rand = Stat::kdeRandom($data, h: 1.5, kernel: 'triangular', seed: 42);
453+
$rand();
454+
```
455+
432456
### Freq class
433457
With *Statistics* package you can calculate frequency table.
434458
A frequency table lists the frequency of various outcomes in a sample.

src/Stat.php

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -880,6 +880,185 @@ public static function kde(
880880
};
881881
}
882882

883+
/**
884+
* Generate random samples from a Kernel Density Estimate.
885+
*
886+
* Returns a Closure that, when called, produces a random float drawn
887+
* from the KDE distribution defined by the data and bandwidth.
888+
*
889+
* @param array<int|float> $data sample data
890+
* @param float $h bandwidth (smoothing parameter), must be > 0
891+
* @param string $kernel kernel name or alias
892+
* @param int|null $seed optional seed for reproducibility
893+
* @return \Closure a callable that returns a random float from the KDE
894+
*
895+
* @throws InvalidDataInputException if data is empty, bandwidth <= 0, or kernel is invalid
896+
*/
897+
public static function kdeRandom(
898+
array $data,
899+
float $h,
900+
string $kernel = 'normal',
901+
?int $seed = null,
902+
): \Closure {
903+
if ($data === []) {
904+
throw new InvalidDataInputException("The data must not be empty.");
905+
}
906+
if ($h <= 0) {
907+
throw new InvalidDataInputException("Bandwidth h must be positive.");
908+
}
909+
910+
$aliases = [
911+
'gauss' => 'normal',
912+
'uniform' => 'rectangular',
913+
'epanechnikov' => 'parabolic',
914+
'biweight' => 'quartic',
915+
];
916+
$kernel = strtolower($kernel);
917+
if (isset($aliases[$kernel])) {
918+
$kernel = $aliases[$kernel];
919+
}
920+
921+
// Acklam rational approximation for standard normal inverse CDF
922+
$normalInvCdf = static function (float $p): float {
923+
$a = [
924+
-3.969683028665376e+01,
925+
2.209460984245205e+02,
926+
-2.759285104469687e+02,
927+
1.383577518672690e+02,
928+
-3.066479806614716e+01,
929+
2.506628277459239e+00,
930+
];
931+
$b = [
932+
-5.447609879822406e+01,
933+
1.615858368580409e+02,
934+
-1.556989798598866e+02,
935+
6.680131188771972e+01,
936+
-1.328068155288572e+01,
937+
];
938+
$c = [
939+
-7.784894002430293e-03,
940+
-3.223964580411365e-01,
941+
-2.400758277161838e+00,
942+
-2.549732539343734e+00,
943+
4.374664141464968e+00,
944+
2.938163982698783e+00,
945+
];
946+
$d = [
947+
7.784695709041462e-03,
948+
3.224671290700398e-01,
949+
2.445134137142996e+00,
950+
3.754408661907416e+00,
951+
];
952+
953+
$pLow = 0.02425;
954+
$pHigh = 1.0 - $pLow;
955+
if ($p < $pLow) {
956+
$q = sqrt(-2.0 * log($p));
957+
return ((((($c[0] * $q + $c[1]) * $q + $c[2]) * $q + $c[3]) * $q + $c[4]) * $q + $c[5])
958+
/ (((($d[0] * $q + $d[1]) * $q + $d[2]) * $q + $d[3]) * $q + 1.0);
959+
}
960+
961+
if ($p <= $pHigh) {
962+
$q = $p - 0.5;
963+
$r = $q * $q;
964+
return ((((($a[0] * $r + $a[1]) * $r + $a[2]) * $r + $a[3]) * $r + $a[4]) * $r + $a[5]) * $q
965+
/ ((((($b[0] * $r + $b[1]) * $r + $b[2]) * $r + $b[3]) * $r + $b[4]) * $r + 1.0);
966+
}
967+
$q = sqrt(-2.0 * log(1.0 - $p));
968+
return -((((($c[0] * $q + $c[1]) * $q + $c[2]) * $q + $c[3]) * $q + $c[4]) * $q + $c[5])
969+
/ (((($d[0] * $q + $d[1]) * $q + $d[2]) * $q + $d[3]) * $q + 1.0);
970+
};
971+
972+
// Newton-Raphson solver for kernels without closed-form inverse CDF
973+
$newtonRaphson = static function (float $p, callable $cdf, callable $pdf, float $x0): float {
974+
$x = $x0;
975+
for ($i = 0; $i < 100; $i++) {
976+
$err = $cdf($x) - $p;
977+
if (abs($err) <= 1e-12) {
978+
break;
979+
}
980+
$x -= $err / $pdf($x);
981+
}
982+
return $x;
983+
};
984+
985+
// Quartic CDF and PDF for Newton-Raphson
986+
$quarticCdf = static fn(float $t): float
987+
=> $t <= -1.0 ? 0.0 : ($t >= 1.0 ? 1.0 : (15.0 * $t - 10.0 * $t ** 3 + 3.0 * $t ** 5) / 16.0 + 0.5);
988+
$quarticPdf = static fn(float $t): float
989+
=> ($t < -1.0 || $t > 1.0) ? 0.0 : (15.0 / 16.0) * (1.0 - $t * $t) ** 2;
990+
991+
// Triweight CDF and PDF for Newton-Raphson
992+
$triweightCdf = static fn(float $t): float
993+
=> $t <= -1.0 ? 0.0 : ($t >= 1.0 ? 1.0 : (35.0 * $t - 35.0 * $t ** 3 + 21.0 * $t ** 5 - 5.0 * $t ** 7) / 32.0 + 0.5);
994+
$triweightPdf = static fn(float $t): float
995+
=> ($t < -1.0 || $t > 1.0) ? 0.0 : (35.0 / 32.0) * (1.0 - $t * $t) ** 3;
996+
997+
$invcdfMap = [
998+
'normal' => $normalInvCdf,
999+
'logistic' => static fn(float $p): float => log($p / (1.0 - $p)),
1000+
'sigmoid' => static fn(float $p): float => log(tan($p * M_PI / 2.0)),
1001+
'rectangular' => static fn(float $p): float => 2.0 * $p - 1.0,
1002+
'triangular' => static fn(float $p): float
1003+
=> $p < 0.5 ? sqrt(2.0 * $p) - 1.0 : 1.0 - sqrt(2.0 - 2.0 * $p),
1004+
'parabolic' => static fn(float $p): float
1005+
=> 2.0 * cos((acos(2.0 * $p - 1.0) + M_PI) / 3.0),
1006+
'quartic' => static function (float $p) use ($newtonRaphson, $quarticCdf, $quarticPdf): float {
1007+
if ($p <= 0.5) {
1008+
$sign = 1.0;
1009+
} else {
1010+
$sign = -1.0;
1011+
$p = 1.0 - $p;
1012+
}
1013+
if ($p < 0.0106) {
1014+
$x = (2.0 * $p) ** 0.3838 - 1.0;
1015+
} else {
1016+
$x = (2.0 * $p) ** 0.4258865685331 - 1.0;
1017+
if ($p < 0.499) {
1018+
$x += 0.026818732 * sin(7.101753784 * $p + 2.73230839482953);
1019+
}
1020+
}
1021+
$x *= $sign;
1022+
return $newtonRaphson($sign === 1.0 ? $p : 1.0 - $p, $quarticCdf, $quarticPdf, $x);
1023+
},
1024+
'triweight' => static function (float $p) use ($newtonRaphson, $triweightCdf, $triweightPdf): float {
1025+
if ($p <= 0.5) {
1026+
$sign = 1.0;
1027+
} else {
1028+
$sign = -1.0;
1029+
$p = 1.0 - $p;
1030+
}
1031+
$x = (2.0 * $p) ** 0.3400218741872791 - 1.0;
1032+
if ($p > 0.00001 && $p < 0.499) {
1033+
$x -= 0.033 * sin(1.07 * 2.0 * M_PI * ($p - 0.035));
1034+
}
1035+
$x *= $sign;
1036+
return $newtonRaphson($sign === 1.0 ? $p : 1.0 - $p, $triweightCdf, $triweightPdf, $x);
1037+
},
1038+
'cosine' => static fn(float $p): float => (2.0 / M_PI) * asin(2.0 * $p - 1.0),
1039+
];
1040+
1041+
if (! isset($invcdfMap[$kernel])) {
1042+
$valid = implode(', ', array_merge(array_keys($invcdfMap), array_keys($aliases)));
1043+
throw new InvalidDataInputException(
1044+
"Unknown kernel '{$kernel}'. Valid kernels: {$valid}.",
1045+
);
1046+
}
1047+
1048+
$invcdf = $invcdfMap[$kernel];
1049+
$n = count($data);
1050+
1051+
if ($seed !== null) {
1052+
mt_srand($seed);
1053+
}
1054+
1055+
return static function () use ($data, $n, $h, $invcdf): float {
1056+
$i = mt_rand(0, $n - 1);
1057+
$u = mt_rand(1, mt_getrandmax()) / mt_getrandmax();
1058+
return $data[$i] + $h * $invcdf($u);
1059+
};
1060+
}
1061+
8831062
/**
8841063
* @param array<int|float> $x
8851064
* @param array<int|float> $y

tests/StatTest.php

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,4 +685,130 @@ public function test_kde_invalid_kernel(): void
685685
$this->expectException(InvalidDataInputException::class);
686686
Stat::kde([1.0, 2.0], 1.0, 'invalid_kernel');
687687
}
688+
689+
public function test_kde_random_returns_callable(): void
690+
{
691+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
692+
$sampler = Stat::kdeRandom($data, 1.0);
693+
$this->assertIsCallable($sampler);
694+
695+
$value = $sampler();
696+
$this->assertIsFloat($value);
697+
}
698+
699+
public function test_kde_random_all_kernels(): void
700+
{
701+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
702+
$kernels = [
703+
'normal', 'logistic', 'sigmoid',
704+
'rectangular', 'triangular', 'parabolic',
705+
'quartic', 'triweight', 'cosine',
706+
];
707+
708+
foreach ($kernels as $kernel) {
709+
$sampler = Stat::kdeRandom($data, 1.0, $kernel, seed: 42);
710+
$this->assertIsCallable($sampler, "Kernel '$kernel' should return a callable");
711+
$value = $sampler();
712+
$this->assertIsFloat($value, "Kernel '$kernel' should return a float");
713+
}
714+
}
715+
716+
public function test_kde_random_seed_reproducibility(): void
717+
{
718+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
719+
720+
$sampler1 = Stat::kdeRandom($data, 1.0, 'normal', seed: 123);
721+
$values1 = [];
722+
for ($i = 0; $i < 10; $i++) {
723+
$values1[] = $sampler1();
724+
}
725+
726+
$sampler2 = Stat::kdeRandom($data, 1.0, 'normal', seed: 123);
727+
$values2 = [];
728+
for ($i = 0; $i < 10; $i++) {
729+
$values2[] = $sampler2();
730+
}
731+
732+
$this->assertSame($values1, $values2);
733+
}
734+
735+
public function test_kde_random_aliases(): void
736+
{
737+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
738+
$aliasPairs = [
739+
['gauss', 'normal'],
740+
['uniform', 'rectangular'],
741+
['epanechnikov', 'parabolic'],
742+
['biweight', 'quartic'],
743+
];
744+
745+
foreach ($aliasPairs as [$alias, $canonical]) {
746+
$sampler1 = Stat::kdeRandom($data, 1.0, $alias, seed: 42);
747+
$values1 = [];
748+
for ($i = 0; $i < 5; $i++) {
749+
$values1[] = $sampler1();
750+
}
751+
752+
$sampler2 = Stat::kdeRandom($data, 1.0, $canonical, seed: 42);
753+
$values2 = [];
754+
for ($i = 0; $i < 5; $i++) {
755+
$values2[] = $sampler2();
756+
}
757+
758+
$this->assertSame(
759+
$values1,
760+
$values2,
761+
"Alias '$alias' should produce same results as '$canonical'",
762+
);
763+
}
764+
}
765+
766+
public function test_kde_random_known_output(): void
767+
{
768+
$data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2];
769+
$rand = Stat::kdeRandom($data, 1.5, seed: 8675309);
770+
$results = [];
771+
for ($i = 0; $i < 10; $i++) {
772+
$results[] = round($rand(), 1);
773+
}
774+
775+
$this->assertSame(
776+
[2.5, 3.3, -1.8, 7.3, -2.1, 4.6, 4.4, 5.9, -3.2, -1.6],
777+
$results,
778+
);
779+
}
780+
781+
public function test_kde_random_statistical_properties(): void
782+
{
783+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
784+
$dataMean = 3.0;
785+
$sampler = Stat::kdeRandom($data, 0.5, 'normal', seed: 42);
786+
787+
$sum = 0.0;
788+
$n = 10000;
789+
for ($i = 0; $i < $n; $i++) {
790+
$sum += $sampler();
791+
}
792+
$sampleMean = $sum / $n;
793+
794+
$this->assertEqualsWithDelta($dataMean, $sampleMean, 0.15);
795+
}
796+
797+
public function test_kde_random_empty_data(): void
798+
{
799+
$this->expectException(InvalidDataInputException::class);
800+
Stat::kdeRandom([], 1.0);
801+
}
802+
803+
public function test_kde_random_invalid_bandwidth(): void
804+
{
805+
$this->expectException(InvalidDataInputException::class);
806+
Stat::kdeRandom([1.0, 2.0], 0.0);
807+
}
808+
809+
public function test_kde_random_invalid_kernel(): void
810+
{
811+
$this->expectException(InvalidDataInputException::class);
812+
Stat::kdeRandom([1.0, 2.0], 1.0, 'invalid_kernel');
813+
}
688814
}

0 commit comments

Comments
 (0)