diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b9cb02..2a008b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## 1.2.2 - WIP +- Adding `medianGrouped()` method for estimating the median of grouped/binned continuous data using interpolation + + ## 1.2.1 - 2026-02-20 - Adding `invCdf()` method to normal distribution - Adding `getVariance()` method to normal distribution (sigma squared) diff --git a/README.md b/README.md index 0025c18..ffbba20 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ The various mathematical statistics are listed below: | `median()` | median or "middle value" of data | | `medianLow()` | low median of data | | `medianHigh()` | high median of data | +| `medianGrouped()` | median of grouped data, using interpolation | | `mode()` | single mode (most common value) of discrete or nominal data | | `multimode()` | list of modes (most common values) of discrete or nominal data | | `quantiles()` | cut points dividing the range of a probability distribution into continuous intervals with equal probabilities | @@ -192,6 +193,35 @@ $median = Stat::medianHigh([1, 3, 5, 7]); // 5 ``` +#### Stat::medianGrouped( array $data, float $interval = 1.0 ) +Estimate the median for numeric data that has been grouped or binned around the midpoints of consecutive, fixed-width intervals. +The `$interval` parameter specifies the width of each bin (default `1.0`). This function uses interpolation within the median interval, assuming values are evenly distributed across each bin. + +```php +use HiFolks\Statistics\Stat; +$median = Stat::medianGrouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5]); +// 3.7 +$median = Stat::medianGrouped([1, 3, 3, 5, 7]); +// 3.25 +$median = Stat::medianGrouped([1, 3, 3, 5, 7], 2); +// 3.5 +``` + +For example, demographic data summarized into ten-year age groups: +```php +use HiFolks\Statistics\Stat; +// 172 people aged 20-30, 484 aged 30-40, 387 aged 40-50, etc. 
+$data = array_merge( + array_fill(0, 172, 25), + array_fill(0, 484, 35), + array_fill(0, 387, 45), + array_fill(0, 22, 55), + array_fill(0, 6, 65), +); +round(Stat::medianGrouped($data, 10), 1); +// 37.5 +``` + #### Stat::quantiles( array $data, $n=4, $round=null ) Divide data into n continuous intervals with equal probability. Returns a list of n - 1 cut points separating the intervals. Set n to 4 for quartiles (the default). Set n to 10 for deciles. Set n to 100 for percentiles which gives the 99 cut points that separate data into 100 equal-sized groups. diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..7fbcb05 --- /dev/null +++ b/TODO.md @@ -0,0 +1,115 @@ + Missing Functions + + Python Function: median_grouped(data, interval) + Description: Median of grouped/binned continuous data + Status: Implemented as Stat::medianGrouped() + ──────────────────────────────────────── + Python Function: kde(data, h, kernel) + Description: Kernel Density Estimation + Status: Missing + ──────────────────────────────────────── + Python Function: kde_random(data, h, kernel) + Description: Random sampling from KDE + Status: Missing + + Missing Parameters/Variants + + Feature: correlation() with method='ranked' + Python: Supports both Pearson and Spearman rank correlation + This Package: Only Pearson + ──────────────────────────────────────── + Feature: linear_regression() with proportional=True + Python: Supports proportional regression (intercept forced to 0) + This Package: No proportional option + ──────────────────────────────────────── + Feature: variance(data, xbar) / pvariance(data, mu) + Python: Can pass pre-computed mean to avoid recalculation + This Package: No pre-computed mean parameter + ──────────────────────────────────────── + Feature: quantiles() with method='inclusive' + Python: Supports both exclusive and inclusive methods + This Package: No method parameter + + Summary + + The package is actually very close to full parity with Python's statistics + module. 
The gaps are: + + 1. median_grouped - interpolation-based median for grouped/binned data (now implemented as Stat::medianGrouped()) + 2. kde / kde_random - Kernel Density Estimation (added in Python 3.13, + relatively new) + 3. Spearman rank correlation - via method parameter on correlation() + 4. Proportional linear regression - forcing intercept through origin + 5. Minor parameter additions (xbar/mu on variance/stdev, method on quantiles) + + Items 3 and 4 would be the most practical remaining additions to reach near-complete + parity with Python's statistics module. The KDE functions (2) are newer and + more niche. + + + + + Currently Implemented (for reference) + + Central tendency, variance/stdev, median variants, mode/multimode, + geometric/harmonic mean, quantiles, covariance, correlation, linear + regression, normal distribution (PDF, CDF, inverse CDF, z-score), frequency + tables. + + --- + Missing Functions + + Descriptive Statistics + + - Trimmed/Truncated mean - mean after removing outliers (top/bottom x%) + - Weighted median - median with weights (like fmean supports weights, but + median doesn't) + - Skewness - measure of asymmetry of the distribution + - Kurtosis - measure of "tailedness" of the distribution + - Standard error of the mean (SEM) + - Coefficient of variation (CV) - stdev / mean, useful for comparing + variability across datasets + - Mean absolute deviation (MAD) + - Percentile - arbitrary percentile (e.g., 90th percentile) — quantiles() + exists but a direct percentile($data, $p) would be convenient + + Correlation & Regression + + - Spearman rank correlation - non-parametric correlation + - Kendall tau correlation - another rank-based correlation + - Multiple/polynomial regression + - R-squared (coefficient of determination) + + Hypothesis Testing + + - T-test (one-sample, two-sample, paired) + - Chi-squared test + - Z-test + - P-value calculation + - Confidence intervals + + Other Distributions (beyond Normal) + + - Student's t-distribution + - Chi-squared distribution + - Binomial 
/**
 * Estimate the median of grouped (binned) numeric data.
 *
 * Every value in $data is treated as the midpoint of a bin that is
 * $interval wide. The median is interpolated inside the bin holding the
 * middle observation, assuming values are spread evenly across that bin:
 *
 *     L + interval * (n / 2 - cf) / f
 *
 * where:
 * - L  is the lower limit of the median bin (midpoint - interval / 2)
 * - n  is the number of observations
 * - cf is the number of observations that sort below the median bin
 * - f  is the number of observations inside the median bin
 *
 * @param array $data numeric observations, in any order
 * @param float $interval the width of each bin
 * @return float the estimated median for grouped data
 *
 * @throws InvalidDataInputException if the data is empty, or if the value
 *                                   at the middle position is not a number
 */
public static function medianGrouped(array $data, float $interval = 1.0): float
{
    $n = count($data);
    if ($n === 0) {
        throw new InvalidDataInputException("The data must not be empty.");
    }

    // sort() also reindexes, so sparse or string-keyed input is handled.
    sort($data);

    // The value at the middle position is the midpoint of the median bin.
    $middle = $data[intdiv($n, 2)];

    // A (float) cast never fails: "abc" silently becomes 0.0, and NaN never
    // compares equal to anything. In both cases no element matches the
    // target, $f ends up 0 and the formula below would raise a bare
    // DivisionByZeroError. Reject such input with the package exception.
    if (!is_numeric($middle) || is_nan((float) $middle)) {
        throw new InvalidDataInputException("The data must contain only numeric values.");
    }
    $x = (float) $middle;

    // All occurrences of $x sit in the half-open index range [$i, $j).
    $i = self::bisectLeft($data, $x);
    $j = self::bisectRight($data, $x, $i);

    // Lower limit of the median interval
    $L = $x - $interval / 2.0;
    // Cumulative frequency of the preceding intervals
    $cf = $i;
    // Number of elements in the median interval
    $f = $j - $i;

    return $L + $interval * ($n / 2.0 - $cf) / $f;
}

/**
 * Binary search: find the leftmost index at which $target can be inserted
 * into $data while keeping it sorted (i.e. before any equal elements).
 *
 * @param array $data sorted, zero-indexed array
 * @param float $target value to locate
 * @return int insertion index in the range 0..count($data)
 */
private static function bisectLeft(array $data, float $target): int
{
    $lo = 0;
    $hi = count($data);
    while ($lo < $hi) {
        $mid = intdiv($lo + $hi, 2);
        if ($data[$mid] < $target) {
            $lo = $mid + 1;
        } else {
            $hi = $mid;
        }
    }

    return $lo;
}

/**
 * Binary search: find the rightmost index at which $target can be inserted
 * into $data while keeping it sorted (i.e. after any equal elements).
 *
 * @param array $data sorted, zero-indexed array
 * @param float $target value to locate
 * @param int $lo index to start searching from; elements before it are skipped
 * @return int insertion index in the range $lo..count($data)
 */
private static function bisectRight(array $data, float $target, int $lo = 0): int
{
    $hi = count($data);
    while ($lo < $hi) {
        $mid = intdiv($lo + $hi, 2);
        if ($data[$mid] <= $target) {
            $lo = $mid + 1;
        } else {
            $hi = $mid;
        }
    }

    return $lo;
}
/**
 * medianGrouped() reproduces the reference values published for Python's
 * statistics.median_grouped(), plus the single-element case.
 */
public function test_calculates_median_grouped(): void
{
    // Demographics example from the Python docs: people per ten-year age
    // group, each group represented by its midpoint.
    $ageGroups = [];
    foreach ([[172, 25], [484, 35], [387, 45], [22, 55], [6, 65]] as [$people, $midpoint]) {
        $ageGroups = array_merge($ageGroups, array_fill(0, $people, $midpoint));
    }

    // Each case: [expected median, arguments for medianGrouped(), rounding precision or null]
    $cases = [
        // median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5]) == 3.7
        [3.7, [[1, 2, 2, 3, 4, 4, 4, 4, 4, 5]], null],
        // median_grouped([52, 52, 53, 54]) == 52.5
        [52.5, [[52, 52, 53, 54]], null],
        // median_grouped([1, 3, 3, 5, 7]) == 3.25
        [3.25, [[1, 3, 3, 5, 7]], null],
        // median_grouped([1, 3, 3, 5, 7], interval=2) == 3.5
        [3.5, [[1, 3, 3, 5, 7], 2], null],
        // Demographics with interval=10, rounded to one decimal
        [37.5, [$ageGroups, 10], 1],
        // Single element: L = 1 - 0.5 = 0.5, result = 0.5 + 1*(0.5-0)/1 = 1.0
        [1.0, [[1]], null],
    ];

    foreach ($cases as [$expected, $arguments, $precision]) {
        $actual = Stat::medianGrouped(...$arguments);
        if ($precision !== null) {
            $actual = round($actual, $precision);
        }
        $this->assertEquals($expected, $actual);
    }
}

/**
 * An empty data set has no median, so medianGrouped() must throw.
 */
public function test_calculates_median_grouped_with_empty_array(): void
{
    $this->expectException(InvalidDataInputException::class);
    Stat::medianGrouped([]);
}