Skip to content

Commit 3b709c0

Browse files
committed
Simplify the determination of outliers & pass on sorted?:
There could be an argument made that if we have few outliers, reversing the lists twice could be faster than passing through the entire list once with 2 conditions. We can probably optimize & benchmark on this later.
1 parent d769e4e commit 3b709c0

3 files changed

Lines changed: 77 additions & 33 deletions

File tree

lib/statistex.ex

Lines changed: 50 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ defmodule Statistex do
1515
alias Statistex.{Mode, Percentile}
1616
require Integer
1717

18+
import Statistex.Helper, only: [maybe_sort: 2]
19+
1820
defstruct [
1921
:total,
2022
:average,
@@ -88,6 +90,7 @@ defmodule Statistex do
8890
@empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number."
8991

9092
@first_quartile 25
93+
@median_percentile 50
9194
@third_quartile 75
9295
# https://en.wikipedia.org/wiki/Interquartile_range#Outliers
9396
# https://builtin.com/articles/1-5-iqr-rule
@@ -167,17 +170,21 @@ defmodule Statistex do
167170
end
168171

169172
def statistics(samples, configuration) do
170-
samples = Enum.sort(samples)
173+
sorted_samples = Enum.sort(samples)
171174

172175
# these statistics are required to do the outlier calculations
173176
%{minimum: minimum, maximum: maximum, percentiles: percentiles} =
174-
base_statistics(samples, configuration)
177+
base_statistics(sorted_samples, configuration)
175178

176179
outlier_bounds =
177-
do_outlier_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum)
180+
do_outlier_bounds(sorted_samples,
181+
percentiles: percentiles,
182+
minimum: minimum,
183+
maximum: maximum
184+
)
178185

179186
# make sure rest remains sorted so that it can be reused for the results without sorting again
180-
{outliers, rest} = do_outliers(samples, outlier_bounds: outlier_bounds)
187+
{outliers, rest} = do_outliers(sorted_samples, outlier_bounds: outlier_bounds)
181188

182189
if exclude_outliers?(configuration) and Enum.any?(outliers) do
183190
# figure out how to avoid double sorting
@@ -188,15 +195,22 @@ defmodule Statistex do
188195

189196
create_full_statistics(rest, minimum, maximum, percentiles, outliers, outlier_bounds)
190197
else
191-
create_full_statistics(samples, minimum, maximum, percentiles, outliers, outlier_bounds)
198+
create_full_statistics(
199+
sorted_samples,
200+
minimum,
201+
maximum,
202+
percentiles,
203+
outliers,
204+
outlier_bounds
205+
)
192206
end
193207
end
194208

195-
defp base_statistics(samples, configuration) do
196-
minimum = hd(samples)
197-
maximum = List.last(samples)
209+
defp base_statistics(sorted_samples, configuration) do
210+
minimum = hd(sorted_samples)
211+
maximum = List.last(sorted_samples)
198212

199-
percentiles = calculate_percentiles(samples, configuration)
213+
percentiles = calculate_percentiles(sorted_samples, configuration)
200214

201215
%{minimum: minimum, maximum: maximum, percentiles: percentiles}
202216
end
@@ -459,23 +473,26 @@ defmodule Statistex do
459473
end
460474
end
461475

462-
@median_percentile 50
463-
defp calculate_percentiles(samples, configuration) do
476+
defp calculate_percentiles(sorted_samples, configuration) do
464477
percentiles_configuration = Keyword.get(configuration, :percentiles, [])
465478

466479
# median_percentile is manually added so that it can be used directly by median
467480
percentiles_configuration =
468-
Enum.uniq([25, @median_percentile, 75 | percentiles_configuration])
481+
Enum.uniq([
482+
@first_quartile,
483+
@median_percentile,
484+
@third_quartile | percentiles_configuration
485+
])
469486

470-
Percentile.percentiles(samples, percentiles_configuration)
487+
Percentile.percentiles(sorted_samples, percentiles_configuration, sorted: true)
471488
end
472489

473490
@doc """
474491
Calculates the value at the `percentile_rank`-th percentile.
475492
476493
Think of this as the
477494
value below which `percentile_rank` percent of the samples lie. For example,
478-
if `Statistex.percentile(samples, 99)` == 123.45,
495+
if `Statistex.percentile(samples, 99) == 123.45`,
479496
99% of samples are less than 123.45.
480497
481498
Passing a number for `percentile_rank` calculates a single percentile.
@@ -517,9 +534,8 @@ defmodule Statistex do
517534
"""
518535
@spec percentiles(samples, number | [number(), ...]) ::
519536
percentiles()
520-
def percentiles(samples, percentiles) do
521-
samples |> Enum.sort() |> Percentile.percentiles(percentiles)
522-
end
537+
defdelegate percentiles(samples, percentiles, options), to: Percentile
538+
defdelegate percentiles(samples, percentiles), to: Percentile
523539

524540
@doc """
525541
A map showing which sample occurs how often in the samples.
@@ -631,6 +647,9 @@ defmodule Statistex do
631647
iex> Statistex.outlier_bounds([3, 4, 5])
632648
{0.0, 8.0}
633649
650+
iex> Statistex.outlier_bounds([4, 5, 3])
651+
{0.0, 8.0}
652+
634653
iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])
635654
{22.5, 66.5}
636655
@@ -640,19 +659,21 @@ defmodule Statistex do
640659
@spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number}
641660
def outlier_bounds(samples, options \\ [])
642661
def outlier_bounds([], _), do: raise(ArgumentError, @empty_list_error_message)
643-
def outlier_bounds(samples, options), do: samples |> Enum.sort() |> do_outlier_bounds(options)
662+
def outlier_bounds(samples, options), do: do_outlier_bounds(samples, options)
644663

645664
defp do_outlier_bounds(samples, options) do
665+
# TODO: double-check whether we need both `Keyword.get_lazy/3` calls (here and in do_outliers)
646666
percentiles =
647667
Keyword.get_lazy(options, :percentiles, fn ->
648-
Percentile.percentiles(samples, [@first_quartile, @third_quartile])
668+
Percentile.percentiles(samples, [@first_quartile, @third_quartile], options)
649669
end)
650670

651671
q1 = get_percentile(samples, @first_quartile, percentiles)
652672
q3 = get_percentile(samples, @third_quartile, percentiles)
653673
iqr = q3 - q1
674+
outlier_tolerance = iqr * @iqr_factor
654675

655-
{q1 - iqr * @iqr_factor, q3 + iqr * @iqr_factor}
676+
{q1 - outlier_tolerance, q3 + outlier_tolerance}
656677
end
657678

658679
@doc """
@@ -671,21 +692,21 @@ defmodule Statistex do
671692
"""
672693
@spec outliers(samples, keyword) :: samples | []
673694
def outliers(samples, options \\ []) do
674-
{outliers, _rest} = samples |> Enum.sort() |> do_outliers(options)
695+
sorted_samples = maybe_sort(samples, options)
696+
697+
# maybe allow folks to get the same
698+
{outliers, _rest} = do_outliers(sorted_samples, options)
675699

676700
outliers
677701
end
678702

679-
defp do_outliers(samples, options) do
703+
defp do_outliers(sorted_samples, options) do
680704
{lower_bound, upper_bound} =
681-
Keyword.get_lazy(options, :outlier_bounds, fn -> do_outlier_bounds(samples, options) end)
682-
683-
{min, rest} = Enum.split_while(samples, fn sample -> sample < lower_bound end)
684-
685-
{max, rest} =
686-
rest |> Enum.reverse() |> Enum.split_while(fn sample -> sample > upper_bound end)
705+
Keyword.get_lazy(options, :outlier_bounds, fn ->
706+
do_outlier_bounds(sorted_samples, options)
707+
end)
687708

688-
{min ++ max, rest}
709+
Enum.split_with(sorted_samples, fn sample -> sample < lower_bound || sample > upper_bound end)
689710
end
690711

691712
defp get_percentile(samples, percentile, percentiles) do

lib/statistex/helper.ex

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
defmodule Statistex.Helper do
2+
@moduledoc false
3+
# Everyone loves helper modules... ok ok, no. But I needed/wanted this function,
4+
# but didn't wanna put it on the main module.
5+
6+
# With the design goal that we don't want to needlessly do operations, esp. big ones
7+
# like sorting, we need an optional `sorted?` argument in a bunch of places.
8+
# This unifies the handling of that.
9+
def maybe_sort(samples, options) do
10+
sorted? = Access.get(options, :sorted?, false)
11+
12+
if sorted? do
13+
samples
14+
else
15+
Enum.sort(samples)
16+
end
17+
end
18+
end

lib/statistex/percentile.ex

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,27 @@
11
defmodule Statistex.Percentile do
22
@moduledoc false
33

4-
@spec percentiles(Statistex.samples(), number | [number, ...]) ::
4+
import Statistex.Helper, only: [maybe_sort: 2]
5+
6+
@spec percentiles(Statistex.samples(), number | [number, ...], keyword()) ::
57
Statistex.percentiles()
6-
def percentiles([], _) do
8+
def percentiles(samples, percentiles, options \\ [])
9+
10+
def percentiles([], _, _) do
711
raise(
812
ArgumentError,
913
"Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number."
1014
)
1115
end
1216

13-
def percentiles(samples, percentile_ranks) do
17+
def percentiles(samples, percentile_ranks, options) do
1418
number_of_samples = length(samples)
19+
sorted_samples = maybe_sort(samples, options)
1520

1621
percentile_ranks
1722
|> List.wrap()
1823
|> Enum.reduce(%{}, fn percentile_rank, acc ->
19-
perc = percentile(samples, number_of_samples, percentile_rank)
24+
perc = percentile(sorted_samples, number_of_samples, percentile_rank)
2025
Map.put(acc, percentile_rank, perc)
2126
end)
2227
end

0 commit comments

Comments
 (0)