Skip to content

Commit db84181

Browse files
committed
Adding zscores() and outliers() methods
1 parent 18a339a commit db84181

6 files changed

Lines changed: 203 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
- Adding `sem()` method for standard error of the mean
1010
- Adding `meanAbsoluteDeviation()` method for mean absolute deviation — average distance from the mean
1111
- Adding `medianAbsoluteDeviation()` method for median absolute deviation — robust dispersion measure resistant to outliers
12+
- Adding `zscores()` method for computing z-scores of each value in a dataset
13+
- Adding `outliers()` method for z-score based outlier detection with configurable threshold
1214

1315
## 1.2.5 - 2026-02-22
1416
- Adding `kurtosis()` method for excess kurtosis

README.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ The various mathematical statistics are listed below:
8787
| `pskewness()` | population (biased) skewness |
8888
| `kurtosis()` | excess kurtosis (sample formula, 0 for normal distribution) |
8989
| `coefficientOfVariation()` | coefficient of variation (CV%), relative dispersion as percentage |
90+
| `zscores()` | z-scores for each value — how many standard deviations from the mean |
91+
| `outliers()` | outlier detection based on z-score threshold |
9092
| `geometricMean()` | geometric mean |
9193
| `harmonicMean()` | harmonic mean |
9294
| `correlation()` | Pearson’s or Spearman’s rank correlation coefficient for two inputs |
@@ -482,6 +484,36 @@ $cv = Stat::coefficientOfVariation([10, 20, 30, 40, 50], population: true);
482484
// ~47.14 (population)
483485
```
484486

487+
#### Stat::zscores( array $data, ?int $round = null )
488+
Return the z-score for each value in the dataset. A z-score indicates how many standard deviations a value is from the mean. Z-scores are useful for standardizing data, comparing values from different distributions, and identifying outliers.
489+
490+
The z-scores of any dataset sum to zero (up to floating-point rounding), and values whose z-score lies beyond ±2 or ±3 are typically considered unusual or outliers.
491+
492+
Requires at least 2 data points and non-zero standard deviation.
493+
494+
```php
495+
use HiFolks\Statistics\Stat;
496+
$zscores = Stat::zscores([2, 4, 4, 4, 5, 5, 7, 9]);
497+
// array of z-scores, one per value
498+
499+
$zscores = Stat::zscores([2, 4, 4, 4, 5, 5, 7, 9], 2);
500+
// z-scores rounded to 2 decimal places
501+
```
502+
503+
#### Stat::outliers( array $data, float $threshold = 3.0 )
504+
Return values from the dataset that are outliers based on z-score threshold. A value is considered an outlier if its absolute z-score exceeds the threshold.
505+
506+
The default threshold of 3.0 is a widely used convention — in a normal distribution, about 99.7% of values fall within 3 standard deviations of the mean, so values beyond that are rare. Use a lower threshold (e.g. 2.0) to flag more values as outliers, or a higher one to flag only the most extreme values.
507+
508+
```php
509+
use HiFolks\Statistics\Stat;
510+
$outliers = Stat::outliers([1, 2, 3, 4, 5, 6, 7, 8, 9, 100]);
511+
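// [] — with only 10 values the z-score of 100 is ~2.84, below the default threshold of 3.0; pass a lower threshold (e.g. 2.5) to get [100]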
512+
513+
$outliers = Stat::outliers([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1.0);
514+
// values more than 1 stdev from the mean
515+
```
516+
485517
#### Stat::covariance ( array $x , array $y )
486518
Covariance, static method, returns the sample covariance of two inputs *$x* and *$y*.
487519
Covariance is a measure of the joint variability of two inputs.

src/Stat.php

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,62 @@ public static function medianAbsoluteDeviation(array $data, ?int $round = null):
716716
return Math::round((float) self::median($deviations), $round);
717717
}
718718

719+
/**
 * Return the z-score of each value in the dataset.
 * A z-score indicates how many standard deviations a value is from the mean.
 *
 * Formula: zi = (xi - mean) / stdev
 *
 * The result is a list re-indexed from 0, in the same order as $data;
 * the keys of $data are not preserved.
 *
 * @param array<int|float> $data
 * @param int|null $round precision forwarded to Math::round() for each z-score
 *                        (number of decimal places; null, the default, presumably means no rounding)
 * @return array<float> the z-scores
 *
 * @throws InvalidDataInputException if the standard deviation is zero; also expected from
 *                                   mean()/stdev() when fewer than 2 values are given
 *                                   (not checked explicitly here)
 */
public static function zscores(array $data, ?int $round = null): array
{
    $mean = self::mean($data);
    $stdev = self::stdev($data);

    // Strict float comparison instead of a loose `== 0`: this guards the division below.
    if ((float) $stdev === 0.0) {
        throw new InvalidDataInputException(
            "Z-scores are undefined when all values are identical (standard deviation is zero).",
        );
    }

    $zscores = [];
    foreach ($data as $value) {
        $zscores[] = Math::round(($value - $mean) / $stdev, $round);
    }

    return $zscores;
}
748+
749+
/**
 * Return the values of the dataset that are outliers according to a z-score threshold.
 * A value is an outlier when the absolute value of its z-score is strictly greater
 * than the threshold.
 *
 * The default threshold of 3.0 is a common convention (values more than 3 standard
 * deviations from the mean).
 *
 * The result is a list re-indexed from 0; the keys of $data are not preserved.
 *
 * @param array<int|float> $data
 * @param float $threshold absolute z-score threshold (default 3.0)
 * @return array<int|float> the outlier values, in their original order
 *
 * @throws InvalidDataInputException propagated from zscores() (e.g. when stdev is zero)
 */
public static function outliers(array $data, float $threshold = 3.0): array
{
    // zscores() returns a list indexed 0..n-1. Re-index the input the same way so
    // positions line up even when $data has string or non-sequential keys
    // (for example after array_filter()).
    $values = array_values($data);
    $zscores = self::zscores($values);

    $outliers = [];
    foreach ($values as $i => $value) {
        if (abs($zscores[$i]) > $threshold) {
            $outliers[] = $value;
        }
    }

    return $outliers;
}
774+
719775
/**
720776
* Return the variance from the numeric data.
721777
*

src/Statistics.php

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,32 @@ public function medianAbsoluteDeviation(?int $round = null): float
297297
return Stat::medianAbsoluteDeviation($this->numericalArray(), $round);
298298
}
299299

300+
/**
 * Z-scores of the numerical values held by this instance.
 *
 * @param int|null $round precision forwarded unchanged to Stat::zscores()
 * @return array<float>
 *
 * @see Stat::zscores()
 */
public function zscores(?int $round = null): array
{
    $values = $this->numericalArray();

    return Stat::zscores($values, $round);
}
312+
313+
/**
 * Outlier values of this instance's numerical data, selected by z-score threshold.
 *
 * @param float $threshold absolute z-score threshold (default 3.0), forwarded to Stat::outliers()
 * @return array<int|float>
 *
 * @see Stat::outliers()
 */
public function outliers(float $threshold = 3.0): array
{
    $values = $this->numericalArray();

    return Stat::outliers($values, $threshold);
}
325+
300326
/**
301327
* Return the variance from the numeric data
302328
*

tests/StatTest.php

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,4 +1347,77 @@ public function test_median_absolute_deviation_empty_throws(): void
13471347
$this->expectException(InvalidDataInputException::class);
13481348
Stat::medianAbsoluteDeviation([]);
13491349
}
1350+
1351+
// --- zscores ---
1352+
1353+
public function test_zscores(): void
{
    $data = [2, 4, 4, 4, 5, 5, 7, 9];
    $mean = (float) Stat::mean($data);
    $stdev = Stat::stdev($data);

    $actual = Stat::zscores($data);

    // One z-score per input value, each equal to (x - mean) / stdev.
    $this->assertCount(count($data), $actual);
    foreach ($data as $index => $value) {
        $expected = ($value - $mean) / $stdev;
        $this->assertEqualsWithDelta($expected, $actual[$index], 1e-10);
    }
}
1365+
1366+
public function test_zscores_sum_to_zero(): void
{
    // Deviations from the mean cancel out, so the z-scores must sum to ~0.
    $total = array_sum(Stat::zscores(range(1, 10)));

    $this->assertEqualsWithDelta(0.0, $total, 1e-10);
}
1372+
1373+
public function test_zscores_with_rounding(): void
{
    $rounded = Stat::zscores([2, 4, 4, 4, 5, 5, 7, 9], 2);

    foreach ($rounded as $zscore) {
        // A value already rounded to 2 decimals is unchanged by rounding it again.
        $this->assertEquals(round($zscore, 2), $zscore);
    }
}
1381+
1382+
public function test_zscores_identical_values_throws(): void
{
    // Four identical values: the standard deviation is zero.
    $identical = array_fill(0, 4, 5);

    $this->expectException(InvalidDataInputException::class);
    Stat::zscores($identical);
}
1387+
1388+
public function test_zscores_too_few_data_throws(): void
{
    // A single data point is not enough to compute z-scores.
    $single = [5];

    $this->expectException(InvalidDataInputException::class);
    Stat::zscores($single);
}
1393+
1394+
// --- outliers ---
1395+
1396+
public function test_outliers_detects_extreme_values(): void
{
    $data = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 100];
    $outliers = Stat::outliers($data);

    // Pin the exact result: only 100 is flagged, and none of the 10s.
    // assertContains alone would still pass if ordinary values were returned too.
    $this->assertSame([100], $outliers);
}
1402+
1403+
public function test_outliers_no_outliers(): void
{
    // Evenly spread values: nothing exceeds the default threshold.
    $found = Stat::outliers(range(1, 10));

    $this->assertEmpty($found);
}
1409+
1410+
public function test_outliers_custom_threshold(): void
{
    $data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    // With a threshold of 1.0, only values more than one standard deviation from the
    // mean (5.5) are flagged. The standard deviation is ~3.03, so that is 1, 2, 9 and 10.
    $outliers = Stat::outliers($data, 1.0);

    // Pin the exact set instead of merely checking that something was returned.
    $this->assertSame([1, 2, 9, 10], $outliers);
}
1417+
1418+
public function test_outliers_identical_values_throws(): void
{
    // Four identical values: the standard deviation is zero.
    $identical = array_fill(0, 4, 5);

    $this->expectException(InvalidDataInputException::class);
    Stat::outliers($identical);
}
13501423
}

tests/StatisticTest.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,4 +316,18 @@ public function test_median_absolute_deviation(): void
316316
$s = Statistics::make([1, 2, 3, 4, 5]);
317317
$this->assertEqualsWithDelta(1.0, $s->medianAbsoluteDeviation(), 1e-10);
318318
}
319+
320+
public function test_zscores(): void
{
    $zscores = Statistics::make([2, 4, 4, 4, 5, 5, 7, 9])->zscores();

    // One z-score per input value, summing to ~0.
    $this->assertCount(8, $zscores);
    $this->assertEqualsWithDelta(0.0, array_sum($zscores), 1e-10);
}
327+
328+
public function test_outliers(): void
{
    $data = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 100];
    $found = Statistics::make($data)->outliers();

    // The single extreme value must be reported with the default threshold.
    $this->assertContains(100, $found);
}
319333
}

0 commit comments

Comments
 (0)