From 4826e13c6d2b2ba71e9f45c823602447c144abd8 Mon Sep 17 00:00:00 2001 From: Roberto Butti Date: Sat, 21 Feb 2026 08:23:45 +0100 Subject: [PATCH 1/2] Adding Spearman rank correlation --- CHANGELOG.md | 1 + README.md | 16 ++++++++-- TODO.md | 10 ++----- src/Stat.php | 47 ++++++++++++++++++++++++++++- tests/StatTest.php | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 137 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a008b4..bf41ec0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 1.2.2 - WIP - Adding `medianGrouped()` method for estimating the median of grouped/binned continuous data using interpolation +- Adding Spearman rank correlation via `method` parameter in `correlation()` (`method='ranked'`) ## 1.2.1 - 2026-02-20 diff --git a/README.md b/README.md index ffbba20..05896b5 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ The various mathematical statistics are listed below: | `variance()` | variance for a sample | | `geometricMean()` | geometric mean | | `harmonicMean()` | harmonic mean | -| `correlation()` | the Pearson’s correlation coefficient for two inputs | +| `correlation()` | Pearson’s or Spearman’s rank correlation coefficient for two inputs | | `covariance()` | the sample covariance of two inputs | | `linearRegression()` | return the slope and intercept of simple linear regression parameters estimated using ordinary least squares | @@ -316,9 +316,11 @@ $covariance = Stat::covariance( // -7.5 ``` -#### Stat::correlation ( array $x , array $y ) +#### Stat::correlation ( array $x , array $y, string $method = 'linear' ) Return the Pearson’s correlation coefficient for two inputs. Pearson’s correlation coefficient r takes values between -1 and +1. It measures the strength and direction of the linear relationship, where +1 means very strong, positive linear relationship, -1 very strong, negative linear relationship, and 0 no linear relationship. 
+Use `$method = 'ranked'` for Spearman’s rank correlation, which measures monotonic relationships (not just linear). Spearman’s correlation is computed by applying Pearson’s formula to the ranks of the data. + ```php $correlation = Stat::correlation( [1, 2, 3, 4, 5, 6, 7, 8, 9], @@ -335,6 +337,16 @@ $correlation = Stat::correlation( // -1.0 ``` +Spearman’s rank correlation (non-linear but monotonic relationship): +```php +$correlation = Stat::correlation( + [1, 2, 3, 4, 5], + [1, 4, 9, 16, 25], + 'ranked' +); +// 1.0 +``` + #### Stat::linearRegression ( array $x , array $y ) Return the slope and intercept of simple linear regression parameters estimated using ordinary least squares. Simple linear regression describes the relationship between an independent variable *$x* and a dependent variable *$y* in terms of a linear function. diff --git a/TODO.md b/TODO.md index 7fbcb05..5d11c9e 100644 --- a/TODO.md +++ b/TODO.md @@ -1,9 +1,6 @@ Missing Functions - Python Function: median_grouped(data, interval) - Description: Median of grouped/binned continuous data - Status: Missing - ──────────────────────────────────────── + Python Function: kde(data, h, kernel) Description: Kernel Density Estimation Status: Missing @@ -14,10 +11,7 @@ Missing Parameters/Variants - Feature: correlation() with method='ranked' - Python: Supports both Pearson and Spearman rank correlation - This Package: Only Pearson - ──────────────────────────────────────── + Feature: linear_regression() with proportional=True Python: Supports proportional regression (intercept forced to 0) This Package: No proportional option diff --git a/src/Stat.php b/src/Stat.php index 9fdae35..1388231 100644 --- a/src/Stat.php +++ b/src/Stat.php @@ -614,8 +614,14 @@ public static function covariance(array $x, array $y): false|float * or if the length of arrays are < 2, or if the 2 input arrays has not numeric elements, * or if the elements of the array are constants */ - public static function correlation(array $x, array 
$y): false|float + public static function correlation(array $x, array $y, string $method = 'linear'): false|float { + if ($method !== 'linear' && $method !== 'ranked') { + throw new InvalidDataInputException( + "Correlation method must be 'linear' or 'ranked'.", + ); + } + $countX = count($x); $countY = count($y); if ($countX !== $countY) { @@ -628,6 +634,12 @@ public static function correlation(array $x, array $y): false|float "Correlation requires at least two data points.", ); } + + if ($method === 'ranked') { + $x = self::ranks($x); + $y = self::ranks($y); + } + $meanX = self::mean($x); $meanY = self::mean($y); $a = 0; @@ -651,6 +663,39 @@ public static function correlation(array $x, array $y): false|float return $a / $b; } + /** + * Assign average ranks to data values (handles ties by averaging). + * + * @param array $data + * @return array + */ + private static function ranks(array $data): array + { + $n = count($data); + $indexed = []; + for ($i = 0; $i < $n; $i++) { + $indexed[] = [$data[$i], $i]; + } + + usort($indexed, fn ($a, $b) => $a[0] <=> $b[0]); + + $ranks = array_fill(0, $n, 0.0); + $i = 0; + while ($i < $n) { + $j = $i; + while ($j < $n && $indexed[$j][0] === $indexed[$i][0]) { + $j++; + } + $averageRank = ($i + 1 + $j) / 2.0; + for ($k = $i; $k < $j; $k++) { + $ranks[$indexed[$k][1]] = $averageRank; + } + $i = $j; + } + + return $ranks; + } + /** * @param array $x * @param array $y diff --git a/tests/StatTest.php b/tests/StatTest.php index 594cd95..cca52c2 100644 --- a/tests/StatTest.php +++ b/tests/StatTest.php @@ -361,6 +361,80 @@ public function test_calculates_correlation(): void $this->assertEquals(0.71, $correlation); } + public function test_calculates_spearman_correlation(): void + { + // Monotonic relationship: ranks are perfectly correlated + $correlation = Stat::correlation( + [1, 2, 3, 4, 5], + [2, 4, 6, 8, 10], + 'ranked', + ); + $this->assertIsFloat($correlation); + $this->assertEqualsWithDelta(1.0, $correlation, 1e-9); + + // 
Inverse monotonic relationship + $correlation = Stat::correlation( + [1, 2, 3, 4, 5], + [10, 8, 6, 4, 2], + 'ranked', + ); + $this->assertIsFloat($correlation); + $this->assertEqualsWithDelta(-1.0, $correlation, 1e-9); + + // Non-linear but monotonic: Spearman = 1, Pearson < 1 + $correlation = Stat::correlation( + [1, 2, 3, 4, 5], + [1, 4, 9, 16, 25], + 'ranked', + ); + $this->assertIsFloat($correlation); + $this->assertEqualsWithDelta(1.0, $correlation, 1e-9); + } + + public function test_calculates_spearman_correlation_planets(): void + { + // Python docs example: planetary orbital periods and distances from the sun + $orbitalPeriod = [88, 225, 365, 687, 4331, 10_756, 30_687, 60_190]; + $distFromSun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500]; + + // Perfect monotonic relationship → Spearman = 1.0 + $correlation = Stat::correlation($orbitalPeriod, $distFromSun, 'ranked'); + $this->assertEqualsWithDelta(1.0, $correlation, 1e-9); + + // Linear (Pearson) correlation is imperfect + $correlation = Stat::correlation($orbitalPeriod, $distFromSun); + $this->assertEquals(0.9882, round($correlation, 4)); + + // Kepler's third law: linear correlation between + // the square of the period and the cube of the distance + $periodSquared = array_map(fn ($p) => $p * $p, $orbitalPeriod); + $distCubed = array_map(fn ($d) => $d * $d * $d, $distFromSun); + $correlation = Stat::correlation($periodSquared, $distCubed); + $this->assertEquals(1.0, round($correlation, 4)); + } + + public function test_calculates_spearman_correlation_with_ties(): void + { + // Ties should receive average ranks + $correlation = Stat::correlation( + [1, 2, 2, 3], + [10, 20, 20, 30], + 'ranked', + ); + $this->assertIsFloat($correlation); + $this->assertEqualsWithDelta(1.0, $correlation, 1e-9); + } + + public function test_calculates_correlation_invalid_method(): void + { + $this->expectException(InvalidDataInputException::class); + Stat::correlation( + [1, 2, 3], + [4, 5, 6], + 'invalid', + ); + } + 
public function test_calculates_correlation_wrong_usage_different_lengths(): void { $this->expectException(InvalidDataInputException::class); From a8cbfd8c608b1530bccd5b82ea5ba621bbc7640f Mon Sep 17 00:00:00 2001 From: Roberto Butti Date: Sat, 21 Feb 2026 08:26:33 +0100 Subject: [PATCH 2/2] Fine tuning parameter types in tests --- src/Stat.php | 2 +- tests/StatTest.php | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Stat.php b/src/Stat.php index 1388231..c98943a 100644 --- a/src/Stat.php +++ b/src/Stat.php @@ -677,7 +677,7 @@ private static function ranks(array $data): array $indexed[] = [$data[$i], $i]; } - usort($indexed, fn ($a, $b) => $a[0] <=> $b[0]); + usort($indexed, fn(array $a, array $b): int => $a[0] <=> $b[0]); $ranks = array_fill(0, $n, 0.0); $i = 0; diff --git a/tests/StatTest.php b/tests/StatTest.php index cca52c2..ccb3b37 100644 --- a/tests/StatTest.php +++ b/tests/StatTest.php @@ -403,13 +403,15 @@ public function test_calculates_spearman_correlation_planets(): void // Linear (Pearson) correlation is imperfect $correlation = Stat::correlation($orbitalPeriod, $distFromSun); + $this->assertIsFloat($correlation); $this->assertEquals(0.9882, round($correlation, 4)); // Kepler's third law: linear correlation between // the square of the period and the cube of the distance - $periodSquared = array_map(fn ($p) => $p * $p, $orbitalPeriod); - $distCubed = array_map(fn ($d) => $d * $d * $d, $distFromSun); + $periodSquared = array_map(fn(int $p): int => $p * $p, $orbitalPeriod); + $distCubed = array_map(fn(int $d): int => $d * $d * $d, $distFromSun); $correlation = Stat::correlation($periodSquared, $distCubed); + $this->assertIsFloat($correlation); $this->assertEquals(1.0, round($correlation, 4)); }