Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions janitor/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from .filter import filter_column_isin, filter_date, filter_on, filter_string
from .find_replace import find_replace
from .flag_nulls import flag_nulls
from .flag_outliers import flag_outliers
from .get_dupes import get_dupes
from .get_one_to_one import get_one_to_one
from .groupby_agg import groupby_agg
Expand Down Expand Up @@ -171,6 +172,7 @@
"filter_string",
"find_replace",
"flag_nulls",
"flag_outliers",
"get_dupes",
"get_one_to_one",
"get_join_indices",
Expand Down
123 changes: 123 additions & 0 deletions janitor/functions/flag_outliers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Implementation source for `flag_outliers`."""

from typing import Hashable, Iterable, Literal, Optional, Union

import pandas as pd
import pandas_flavor as pf


def _check_column(
    df: pd.DataFrame,
    column_names: Union[Iterable, str],
    present: bool = True,
):
    """Validate that columns are (or are not) part of a DataFrame.

    Args:
        df: DataFrame whose columns are inspected.
        column_names: A single column label or an iterable of labels. A
            string, or any non-iterable value, is treated as one label.
        present: When ``True``, every label must exist in ``df.columns``;
            when ``False``, every label must be absent from it.

    Raises:
        ValueError: On the first label that violates the expectation.
    """
    # Strings are iterable, but they name exactly one column, so they are
    # wrapped just like non-iterable labels.
    is_single_label = isinstance(column_names, str) or not isinstance(
        column_names, Iterable
    )
    labels = [column_names] if is_single_label else column_names

    for label in labels:
        found = label in df.columns
        if present and not found:
            raise ValueError(f"{label} not present in dataframe columns!")
        if found and not present:
            raise ValueError(f"{label} already present in dataframe columns!")


@pf.register_dataframe_method
def flag_outliers(
    df: pd.DataFrame,
    column_name: str,
    method: Literal["iqr", "zscore"] = "iqr",
    threshold: float = 1.5,
    flag_column_name: Optional[Hashable] = None,
) -> pd.DataFrame:
    """Append a boolean column marking outliers in a numeric column.

    Two detection rules are available:

    - ``iqr``: a value is an outlier when it lies below
        ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR``.
    - ``zscore``: a value is an outlier when the absolute value of
        ``(value - mean) / std`` is greater than ``threshold``. If the
        standard deviation is zero, no value is flagged.

    The default ``threshold`` of ``1.5`` is used for both methods; pass an
    explicit cutoff (``3.0`` is a common choice) when using ``zscore``.

    The input DataFrame is left untouched; a copy carrying the new column
    is returned.

    Examples:
        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({"values": [10, 12, 11, 13, 100, 9, 11]})
        >>> df.flag_outliers(column_name="values")
           values  values_outlier_flag
        0      10                False
        1      12                False
        2      11                False
        3      13                False
        4     100                 True
        5       9                False
        6      11                False

    Args:
        df: Input pandas DataFrame.
        column_name: Name of the numeric column to inspect.
        method: Detection rule, either ``"iqr"`` (default) or
            ``"zscore"``.
        threshold: IQR multiplier for ``"iqr"`` or Z-score cutoff for
            ``"zscore"``. Must be greater than zero. Defaults to ``1.5``.
        flag_column_name: Label of the new boolean column. When omitted,
            ``"<column_name>_outlier_flag"`` is used.

    Raises:
        ValueError: If ``column_name`` is not present in the DataFrame.
        ValueError: If ``flag_column_name`` is already present in the
            DataFrame.
        ValueError: If ``method`` is not one of ``"iqr"`` or ``"zscore"``.
        ValueError: If ``threshold`` is not a positive number.
        TypeError: If the specified column is not numeric.

    Returns:
        A copy of the input DataFrame with the boolean flag column added.

    <!--
    # noqa: DAR402
    -->
    """
    # Validation runs in a fixed order: source column, target column,
    # method, threshold, and finally the dtype of the source column.
    _check_column(df, [column_name])

    if flag_column_name is None:
        flag_column_name = f"{column_name}_outlier_flag"
    _check_column(df, [flag_column_name], present=False)

    if method not in ("iqr", "zscore"):
        raise ValueError(
            f"Invalid method '{method}'. Choose either 'iqr' or 'zscore'."
        )

    if threshold <= 0:
        raise ValueError(
            f"threshold must be a positive number, got {threshold}."
        )

    values = df[column_name]
    if not pd.api.types.is_numeric_dtype(values):
        raise TypeError(
            f"Column '{column_name}' must be numeric to detect outliers."
        )

    if method == "zscore":
        center = values.mean()
        spread = values.std()
        if spread == 0:
            # A constant column has no spread, so dividing by it is
            # meaningless; report every row as a non-outlier instead.
            mask = pd.Series([False] * len(df), index=df.index)
        else:
            z_scores = (values - center) / spread
            mask = z_scores.abs() > threshold
    else:
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        fence_width = threshold * (third_quartile - first_quartile)
        too_low = values < first_quartile - fence_width
        too_high = values > third_quartile + fence_width
        mask = too_low | too_high

    # Work on a copy so the caller's DataFrame is never modified.
    flagged = df.copy()
    flagged[flag_column_name] = mask
    return flagged
106 changes: 106 additions & 0 deletions tests/functions/test_flag_outliers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Tests for `flag_outliers` function."""
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal
from janitor.functions import flag_outliers


@pytest.mark.functions
def test_iqr_flags_outlier():
    """The default IQR method adds the flag column and marks the extreme row."""
    frame = pd.DataFrame({"values": [10, 12, 11, 13, 100, 9, 11]})
    flagged = frame.flag_outliers(column_name="values")
    assert "values_outlier_flag" in flagged.columns
    flags = flagged["values_outlier_flag"]
    # Row 4 holds the value 100; row 0 holds an ordinary value.
    assert flags.iloc[4]
    assert not flags.iloc[0]


@pytest.mark.functions
def test_zscore_flags_outlier():
    """The Z-score method with a cutoff of 2.0 marks the extreme row."""
    frame = pd.DataFrame({"values": [10, 12, 11, 13, 100, 9, 11]})
    flagged = frame.flag_outliers(
        column_name="values",
        method="zscore",
        threshold=2.0,
    )
    assert "values_outlier_flag" in flagged.columns
    # Row 4 holds the value 100.
    assert flagged["values_outlier_flag"].iloc[4]


@pytest.mark.functions
def test_no_outliers_all_false():
    """Tightly clustered data yields a flag column with no flagged rows."""
    frame = pd.DataFrame({"values": [10, 11, 12, 11, 10, 12, 11]})
    flagged = frame.flag_outliers(column_name="values")
    flagged_count = flagged["values_outlier_flag"].sum()
    assert flagged_count == 0


@pytest.mark.functions
def test_custom_flag_column_name():
    """A caller-supplied flag column name replaces the default name."""
    frame = pd.DataFrame({"values": [10, 12, 11, 100]})
    flagged = frame.flag_outliers(
        column_name="values",
        flag_column_name="is_outlier",
    )
    column_labels = flagged.columns
    assert "is_outlier" in column_labels
    assert "values_outlier_flag" not in column_labels


@pytest.mark.functions
def test_does_not_mutate_original():
    """Calling flag_outliers leaves the input DataFrame unchanged."""
    frame = pd.DataFrame({"values": [10, 12, 11, 100]})
    snapshot = frame.copy()
    # The return value is deliberately discarded; only the input matters here.
    frame.flag_outliers(column_name="values")
    assert_frame_equal(frame, snapshot)


@pytest.mark.functions
def test_non_method_functional():
    """flag_outliers also works when called as a plain function."""
    frame = pd.DataFrame({"values": [10, 12, 11, 100]})
    flagged = flag_outliers(frame, column_name="values")
    assert "values_outlier_flag" in flagged.columns


@pytest.mark.functions
def test_fail_invalid_method():
    """An unrecognised method name is rejected with ValueError."""
    frame = pd.DataFrame({"values": [10, 12, 11, 100]})
    unknown_method = "invalid"
    with pytest.raises(ValueError):
        frame.flag_outliers(column_name="values", method=unknown_method)


@pytest.mark.functions
def test_fail_negative_threshold():
    """Checks that ValueError is raised for any non-positive threshold.

    Both a negative value and the zero boundary are exercised, because the
    implementation rejects ``threshold <= 0``. The error message is matched
    so that an unrelated ValueError cannot make this test pass.
    """
    df = pd.DataFrame({"values": [10, 12, 11, 100]})
    for bad_threshold in (-1.0, 0):
        with pytest.raises(
            ValueError, match="threshold must be a positive number"
        ):
            df.flag_outliers(column_name="values", threshold=bad_threshold)


@pytest.mark.functions
def test_fail_non_numeric_column():
    """A column of strings is rejected with TypeError."""
    frame = pd.DataFrame({"names": ["alice", "bob", "charlie"]})
    text_column = "names"
    with pytest.raises(TypeError):
        frame.flag_outliers(column_name=text_column)


@pytest.mark.functions
def test_fail_column_not_in_df():
    """Asking for a column the DataFrame lacks raises ValueError."""
    frame = pd.DataFrame({"values": [10, 12, 11, 100]})
    missing_column = "nonexistent"
    with pytest.raises(ValueError):
        frame.flag_outliers(column_name=missing_column)


@pytest.mark.functions
def test_fail_flag_column_already_exists():
    """A pre-existing column with the default flag name raises ValueError."""
    frame = pd.DataFrame(
        {
            "values": [10, 12, 11, 100],
            # Same label the function would generate by default.
            "values_outlier_flag": [0, 0, 0, 0],
        }
    )
    with pytest.raises(ValueError):
        frame.flag_outliers(column_name="values")


@pytest.mark.functions
def test_zscore_constant_column():
    """With the Z-score method, a constant column flags no rows."""
    frame = pd.DataFrame({"values": [5, 5, 5, 5, 5]})
    flagged = frame.flag_outliers(column_name="values", method="zscore")
    flagged_count = flagged["values_outlier_flag"].sum()
    assert flagged_count == 0
Loading