Skip to content

Commit a807f35

Browse files
committed
Adding kde() method for Kernel Density Estimation
1 parent 66d91a5 commit a807f35

4 files changed

Lines changed: 307 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## 1.2.3 - WIP
4+
- Adding `kde()` method for Kernel Density Estimation — returns a closure that estimates PDF or CDF from sample data, supporting 9 kernel functions with aliases
5+
36
## 1.2.2 - 2026-02-21
47
- Adding `method` parameter to `quantiles()` supporting `'exclusive'` (default) and `'inclusive'` interpolation methods
58
- Adding `medianGrouped()` method for estimating the median of grouped/binned continuous data using interpolation

README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ The various mathematical statistics are listed below:
8282
| `correlation()` | Pearson’s or Spearman’s rank correlation coefficient for two inputs |
8383
| `covariance()` | the sample covariance of two inputs |
8484
| `linearRegression()` | return the slope and intercept of simple linear regression parameters estimated using ordinary least squares (supports `proportional: true` for regression through the origin) |
85+
| `kde()` | kernel density estimation — returns a closure that estimates the probability density (or CDF) at any point |
8586

8687
#### Stat::mean( array $data )
8788
Return the sample arithmetic mean of the array _$data_.
@@ -400,6 +401,34 @@ list($slope, $intercept) = Stat::linearRegression(
400401
// $intercept = 0.0
401402
```
402403

404+
#### Stat::kde ( array $data , float $h , string $kernel = 'normal' , bool $cumulative = false )
405+
Create a continuous probability density function (or cumulative distribution function) from discrete sample data using Kernel Density Estimation.
406+
Returns a `Closure` that can be called with any point to estimate the density (or CDF value).
407+
408+
Supported kernels: `normal` (alias `gauss`), `logistic`, `sigmoid`, `rectangular` (alias `uniform`), `triangular`, `parabolic` (alias `epanechnikov`), `quartic` (alias `biweight`), `triweight`, `cosine`.
409+
410+
```php
411+
$data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2];
412+
$f = Stat::kde($data, h: 1.5);
413+
$f(2.5);
414+
// estimated density at x = 2.5
415+
```
416+
417+
Using a different kernel:
418+
419+
```php
420+
$f = Stat::kde($data, h: 1.5, kernel: 'triangular');
421+
$f(2.5);
422+
```
423+
424+
Cumulative distribution function:
425+
426+
```php
427+
$F = Stat::kde($data, h: 1.5, cumulative: true);
428+
$F(2.5);
429+
// estimated CDF at x = 2.5 (probability that a value is <= 2.5)
430+
```
431+
403432
### Freq class
404433
With *Statistics* package you can calculate frequency table.
405434
A frequency table lists the frequency of various outcomes in a sample.

src/Stat.php

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,171 @@ private static function ranks(array $data): array
715715
return $ranks;
716716
}
717717

718+
/**
719+
* Create a continuous probability density function or cumulative distribution
720+
* function from discrete sample data using Kernel Density Estimation.
721+
*
722+
* Returns a Closure that estimates the density (or CDF) at any given point.
723+
*
724+
* @param array<int|float> $data sample data
725+
* @param float $h bandwidth (smoothing parameter), must be > 0
726+
* @param string $kernel kernel name (normal, logistic, sigmoid, rectangular, triangular, parabolic, quartic, triweight, cosine) or alias
727+
* @param bool $cumulative if true, return CDF estimator; otherwise PDF estimator
728+
* @return \Closure a callable that takes a float and returns the estimated density or CDF value
729+
*
730+
* @throws InvalidDataInputException if data is empty, bandwidth <= 0, or kernel is invalid
731+
*/
732+
public static function kde(
733+
array $data,
734+
float $h,
735+
string $kernel = 'normal',
736+
bool $cumulative = false,
737+
): \Closure {
738+
if ($data === []) {
739+
throw new InvalidDataInputException("The data must not be empty.");
740+
}
741+
if ($h <= 0) {
742+
throw new InvalidDataInputException("Bandwidth h must be positive.");
743+
}
744+
745+
$aliases = [
746+
'gauss' => 'normal',
747+
'uniform' => 'rectangular',
748+
'epanechnikov' => 'parabolic',
749+
'biweight' => 'quartic',
750+
];
751+
$kernel = strtolower($kernel);
752+
if (isset($aliases[$kernel])) {
753+
$kernel = $aliases[$kernel];
754+
}
755+
756+
$sqrt2pi = sqrt(2.0 * M_PI);
757+
758+
// Standard normal CDF using Abramowitz & Stegun approximation (7.1.26)
759+
$normalCdf = static function (float $t) use ($sqrt2pi): float {
760+
$negative = $t < 0;
761+
$t = abs($t);
762+
$b1 = 0.319381530;
763+
$b2 = -0.356563782;
764+
$b3 = 1.781477937;
765+
$b4 = -1.821255978;
766+
$b5 = 1.330274429;
767+
$p = 0.2316419;
768+
$k = 1.0 / (1.0 + $p * $t);
769+
$pdf = exp(-$t * $t / 2.0) / $sqrt2pi;
770+
$cdf = 1.0 - $pdf * $k * ($b1 + $k * ($b2 + $k * ($b3 + $k * ($b4 + $k * $b5))));
771+
772+
return $negative ? 1.0 - $cdf : $cdf;
773+
};
774+
775+
$kernels = [
776+
'normal' => [
777+
'pdf' => static fn(float $t): float => exp(-$t * $t / 2.0) / $sqrt2pi,
778+
'cdf' => $normalCdf,
779+
'support' => null,
780+
],
781+
'logistic' => [
782+
'pdf' => static fn(float $t): float => 0.5 / (1.0 + cosh($t)),
783+
'cdf' => static fn(float $t): float => 1.0 / (1.0 + exp(-$t)),
784+
'support' => null,
785+
],
786+
'sigmoid' => [
787+
'pdf' => static fn(float $t): float => (1.0 / M_PI) / cosh($t),
788+
'cdf' => static fn(float $t): float => (2.0 / M_PI) * atan(exp($t)),
789+
'support' => null,
790+
],
791+
'rectangular' => [
792+
'pdf' => static fn(float $t): float => 0.5,
793+
'cdf' => static fn(float $t): float => 0.5 * $t + 0.5,
794+
'support' => 1.0,
795+
],
796+
'triangular' => [
797+
'pdf' => static fn(float $t): float => 1.0 - abs($t),
798+
'cdf' => static fn(float $t): float => $t >= 0
799+
? 1.0 - (1.0 - $t) * (1.0 - $t) / 2.0
800+
: (1.0 + $t) * (1.0 + $t) / 2.0,
801+
'support' => 1.0,
802+
],
803+
'parabolic' => [
804+
'pdf' => static fn(float $t): float => 0.75 * (1.0 - $t * $t),
805+
'cdf' => static fn(float $t): float => -0.25 * $t * $t * $t + 0.75 * $t + 0.5,
806+
'support' => 1.0,
807+
],
808+
'quartic' => [
809+
'pdf' => static fn(float $t): float => (15.0 / 16.0) * (1.0 - $t * $t) ** 2,
810+
'cdf' => static fn(float $t): float => (15.0 * $t - 10.0 * $t ** 3 + 3.0 * $t ** 5) / 16.0 + 0.5,
811+
'support' => 1.0,
812+
],
813+
'triweight' => [
814+
'pdf' => static fn(float $t): float => (35.0 / 32.0) * (1.0 - $t * $t) ** 3,
815+
'cdf' => static fn(float $t): float => (35.0 * $t - 35.0 * $t ** 3 + 21.0 * $t ** 5 - 5.0 * $t ** 7) / 32.0 + 0.5,
816+
'support' => 1.0,
817+
],
818+
'cosine' => [
819+
'pdf' => static fn(float $t): float => (M_PI / 4.0) * cos(M_PI * $t / 2.0),
820+
'cdf' => static fn(float $t): float => 0.5 * sin(M_PI * $t / 2.0) + 0.5,
821+
'support' => 1.0,
822+
],
823+
];
824+
825+
if (! isset($kernels[$kernel])) {
826+
$valid = implode(', ', array_merge(array_keys($kernels), array_keys($aliases)));
827+
throw new InvalidDataInputException(
828+
"Unknown kernel '{$kernel}'. Valid kernels: {$valid}.",
829+
);
830+
}
831+
832+
$kernelDef = $kernels[$kernel];
833+
$support = $kernelDef['support'];
834+
$fn = $cumulative ? $kernelDef['cdf'] : $kernelDef['pdf'];
835+
836+
$sorted = $data;
837+
sort($sorted);
838+
$n = count($sorted);
839+
840+
if ($cumulative) {
841+
return static function (float $x) use ($sorted, $n, $h, $fn, $support): float {
842+
$sum = 0.0;
843+
if ($support !== null) {
844+
$lo = self::bisectLeft($sorted, $x - $h * $support);
845+
$hi = self::bisectRight($sorted, $x + $h * $support);
846+
for ($i = $lo; $i < $hi; $i++) {
847+
$t = ($x - $sorted[$i]) / $h;
848+
$sum += $fn($t);
849+
}
850+
// Points entirely to the left contribute 1.0 each
851+
$sum += $lo;
852+
} else {
853+
for ($i = 0; $i < $n; $i++) {
854+
$t = ($x - $sorted[$i]) / $h;
855+
$sum += $fn($t);
856+
}
857+
}
858+
859+
return $sum / $n;
860+
};
861+
}
862+
863+
return static function (float $x) use ($sorted, $n, $h, $fn, $support): float {
864+
$sum = 0.0;
865+
if ($support !== null) {
866+
$lo = self::bisectLeft($sorted, $x - $h * $support);
867+
$hi = self::bisectRight($sorted, $x + $h * $support);
868+
for ($i = $lo; $i < $hi; $i++) {
869+
$t = ($x - $sorted[$i]) / $h;
870+
$sum += $fn($t);
871+
}
872+
} else {
873+
for ($i = 0; $i < $n; $i++) {
874+
$t = ($x - $sorted[$i]) / $h;
875+
$sum += $fn($t);
876+
}
877+
}
878+
879+
return $sum / ($n * $h);
880+
};
881+
}
882+
718883
/**
719884
* @param array<int|float> $x
720885
* @param array<int|float> $y

tests/StatTest.php

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,4 +575,114 @@ public function test_proportional_linear_regression_with_all_zeros_x(): void
575575
$this->expectException(InvalidDataInputException::class);
576576
Stat::linearRegression([0, 0, 0, 0, 0], [1, 2, 3, 4, 5], proportional: true);
577577
}
578+
579+
public function test_kde_normal(): void
580+
{
581+
$data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2];
582+
$f = Stat::kde($data, 1.5);
583+
$this->assertIsCallable($f);
584+
585+
$density = $f(2.5);
586+
$this->assertIsFloat($density);
587+
$this->assertGreaterThan(0, $density);
588+
589+
// Verify against manually computed value
590+
// f(2.5) = (1/(6*1.5)) * sum of K((2.5 - xi)/1.5)
591+
$n = count($data);
592+
$h = 1.5;
593+
$sum = 0.0;
594+
$sqrt2pi = sqrt(2.0 * M_PI);
595+
foreach ($data as $xi) {
596+
$t = (2.5 - $xi) / $h;
597+
$sum += exp(-$t * $t / 2.0) / $sqrt2pi;
598+
}
599+
$expected = $sum / ($n * $h);
600+
$this->assertEqualsWithDelta($expected, $density, 1e-10);
601+
}
602+
603+
public function test_kde_all_kernels(): void
604+
{
605+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
606+
$kernels = [
607+
'normal', 'logistic', 'sigmoid',
608+
'rectangular', 'triangular', 'parabolic',
609+
'quartic', 'triweight', 'cosine',
610+
];
611+
612+
foreach ($kernels as $kernel) {
613+
$f = Stat::kde($data, 1.0, $kernel);
614+
$this->assertIsCallable($f, "Kernel '$kernel' should return a callable");
615+
$density = $f(3.0);
616+
$this->assertGreaterThanOrEqual(0, $density, "Kernel '$kernel' density should be >= 0");
617+
}
618+
}
619+
620+
public function test_kde_cumulative(): void
621+
{
622+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
623+
$F = Stat::kde($data, 1.0, 'normal', cumulative: true);
624+
$this->assertIsCallable($F);
625+
626+
// CDF should be monotonically non-decreasing
627+
$prev = $F(-100.0);
628+
foreach ([-10.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 100.0] as $x) {
629+
$current = $F($x);
630+
$this->assertGreaterThanOrEqual($prev, $current, "CDF should be non-decreasing at x=$x");
631+
$prev = $current;
632+
}
633+
634+
// Approaches 0 far left and 1 far right
635+
$this->assertEqualsWithDelta(0.0, $F(-100.0), 0.01);
636+
$this->assertEqualsWithDelta(1.0, $F(100.0), 0.01);
637+
}
638+
639+
public function test_kde_aliases(): void
640+
{
641+
$data = [1.0, 2.0, 3.0, 4.0, 5.0];
642+
$x = 3.0;
643+
644+
// gauss == normal
645+
$f1 = Stat::kde($data, 1.0, 'gauss');
646+
$f2 = Stat::kde($data, 1.0, 'normal');
647+
$this->assertEqualsWithDelta($f1($x), $f2($x), 1e-15);
648+
649+
// uniform == rectangular
650+
$f1 = Stat::kde($data, 1.0, 'uniform');
651+
$f2 = Stat::kde($data, 1.0, 'rectangular');
652+
$this->assertEqualsWithDelta($f1($x), $f2($x), 1e-15);
653+
654+
// epanechnikov == parabolic
655+
$f1 = Stat::kde($data, 1.0, 'epanechnikov');
656+
$f2 = Stat::kde($data, 1.0, 'parabolic');
657+
$this->assertEqualsWithDelta($f1($x), $f2($x), 1e-15);
658+
659+
// biweight == quartic
660+
$f1 = Stat::kde($data, 1.0, 'biweight');
661+
$f2 = Stat::kde($data, 1.0, 'quartic');
662+
$this->assertEqualsWithDelta($f1($x), $f2($x), 1e-15);
663+
}
664+
665+
public function test_kde_empty_data(): void
666+
{
667+
$this->expectException(InvalidDataInputException::class);
668+
Stat::kde([], 1.0);
669+
}
670+
671+
public function test_kde_invalid_bandwidth(): void
672+
{
673+
$this->expectException(InvalidDataInputException::class);
674+
Stat::kde([1.0, 2.0], 0.0);
675+
}
676+
677+
public function test_kde_invalid_bandwidth_negative(): void
678+
{
679+
$this->expectException(InvalidDataInputException::class);
680+
Stat::kde([1.0, 2.0], -1.0);
681+
}
682+
683+
public function test_kde_invalid_kernel(): void
684+
{
685+
$this->expectException(InvalidDataInputException::class);
686+
Stat::kde([1.0, 2.0], 1.0, 'invalid_kernel');
687+
}
578688
}

0 commit comments

Comments
 (0)