Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ The library's building blocks are `ErrorMechanism`s, `ErrorType`s, and `ErrorMod
- An `ErrorType` describes _how_ the value is wrong: a typo, an outlier, a category swap, and so on. Read the documentation for a [full list of supported error types](https://tab-err.readthedocs.io/latest/api/tab_err/error_type/index.html).
- An `ErrorModel` is a set of mechanisms and types to perturb existing data with realistic errors. It is shareable as metadata.

`tab_err` is supported by a `pandas` backend.
`tab_err` supports pandas and Polars backends, and has experimental support for cuDF, Modin, and PyArrow.

## Examples

Expand Down
321 changes: 165 additions & 156 deletions examples/Error_Types.ipynb
Comment thread
chandlerNick marked this conversation as resolved.

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ classifiers = [

requires-python = ">=3.10,<3.15"
dependencies = [
"numpy>=2.2.6,<2.5.0",
"pandas>=2.3.3,<2.4.0",
"narwhals>=1.30.0",
"numpy>=1.24.0",
]

[project.urls]
Expand All @@ -41,6 +41,9 @@ ci = [
"tomli>=2.2.1",
]
dev = [
"pandas>=2.0.0",
"polars>=1.0.0",
"pyarrow>=14.0.0", # Required for polars.to_pandas()
"pytest>=8.3.5,<9.0.0",
"ruff>=0.9.8",
"pre-commit>=4.0.1,<5.0.0",
Expand Down Expand Up @@ -98,3 +101,8 @@ convention = "google"

[tool.ty.environment]
python-version = "3.10"

[tool.pytest.ini_options]
filterwarnings = [
"ignore::UserWarning"
]
11 changes: 6 additions & 5 deletions tab_err/_error_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from tab_err.api import low_level

if TYPE_CHECKING:
import pandas as pd
from narwhals.typing import IntoDataFrame

from tab_err import ErrorMechanism, ErrorType

Expand All @@ -25,17 +25,18 @@ class ErrorModel:
error_type: ErrorType
error_rate: float

def apply(self: ErrorModel, data: pd.DataFrame, column: str | int) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Applies the defined ErrorModel to the given column of a pandas DataFrame.
def apply(self: ErrorModel, data: IntoDataFrame, column: str | int) -> tuple[IntoDataFrame, IntoDataFrame]:
"""Applies the defined ErrorModel to the given column of a DataFrame.

Args:
data (pd.DataFrame): The pandas DataFrame to create errors in.
data (IntoDataFrame): The DataFrame to create errors in. Supports pandas, Polars, and other narwhals-compatible backends.
column (str | int): The column to create errors in.

Returns:
tuple[pd.DataFrame, pd.DataFrame]:
tuple[IntoDataFrame, IntoDataFrame]:
- The first element is a copy of 'data' with errors.
- The second element is the associated error mask.
Both are returned in the same format as the input data.
"""
data_with_errors, error_mask = low_level.create_errors(
data=data, column=column, error_rate=self.error_rate, error_mechanism=self.error_mechanism, error_type=self.error_type
Expand Down
111 changes: 97 additions & 14 deletions tab_err/_utils.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,30 @@
from __future__ import annotations

import random
from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
import pandas as pd
from collections.abc import Sequence

from narwhals.typing import IntoDType

import random
import warnings
from typing import Any

def set_column(data: pd.DataFrame, column: int | str, series: pd.Series) -> None:
import narwhals as nw
import numpy as np


def set_column(data: nw.DataFrame, column: int | str, series: nw.Series) -> nw.DataFrame:
"""Replaces a column in the given DataFrame with the given Series.

Mutates data and changes the dtype of the original data to that of the series,
which, depending on the error type, might change.
Returns a new DataFrame with the column replaced.
"""
col = data.columns[column] if isinstance(column, int) else column
data[col] = data[col].astype(series.dtype)
data[col] = series
col_name = get_column_str(data, column)
return data.with_columns(series.alias(col_name))


def get_column_str(data: pd.DataFrame, column: int | str) -> str:
def get_column_str(data: nw.DataFrame, column: int | str) -> str:
"""Return column's name of the given DataFrame, where column can be defined as name or index."""
if isinstance(column, int):
col = data.columns[column]
Expand All @@ -33,7 +37,7 @@ def get_column_str(data: pd.DataFrame, column: int | str) -> str:
return col


def get_column(data: pd.DataFrame, column: int | str) -> pd.Series:
def get_column(data: nw.DataFrame, column: int | str) -> nw.Series:
"""Selects a column from the given DataFrame and returns it as a Series."""
return data[get_column_str(data, column)]

Expand All @@ -56,8 +60,87 @@ def check_error_rate(error_rate: float) -> None:
raise ValueError(msg)


def check_data_emptiness(data: pd.DataFrame) -> None:
def check_data_emptiness(data: nw.DataFrame) -> None:
"""Check that the dataset is not empty, raise a ValueError otherwise."""
if data.empty:
if data.is_empty():
msg = "The dataframe is empty, cannot introduce errors."
raise ValueError(msg)


def is_string_dtype(series: nw.Series) -> bool:
    """Report whether ``series`` holds string-like data.

    A series counts as string-like when its dtype is a member of
    ``{nw.String, nw.Object}``.
    """
    string_like_dtypes = {nw.String, nw.Object}
    return series.dtype in string_like_dtypes


def is_numeric_dtype(series: nw.Series) -> bool:
    """Report whether the dtype of ``series`` identifies itself as numeric."""
    dtype = series.dtype
    return dtype.is_numeric()


def is_integer_dtype(series: nw.Series) -> bool:
    """Report whether the dtype of ``series`` identifies itself as an integer type."""
    dtype = series.dtype
    return dtype.is_integer()


def is_datetime_dtype(series: nw.Series) -> bool:
    """Report whether the dtype of ``series`` compares equal to ``nw.Datetime``."""
    dtype = series.dtype
    return dtype == nw.Datetime


def select_string_columns(data: nw.DataFrame) -> list[str | int]:
    """Return, in column order, the names of the columns accepted by ``is_string_dtype``."""
    selected: list[str | int] = []
    for name in data.columns:
        if is_string_dtype(data[name]):
            selected.append(name)
    return selected


def select_numeric_columns(data: nw.DataFrame) -> list[str | int]:
    """Return, in column order, the names of the columns accepted by ``is_numeric_dtype``."""
    selected: list[str | int] = []
    for name in data.columns:
        if is_numeric_dtype(data[name]):
            selected.append(name)
    return selected


def select_datetime_columns(data: nw.DataFrame) -> list[str | int]:
    """Return, in column order, the names of the columns accepted by ``is_datetime_dtype``."""
    selected: list[str | int] = []
    for name in data.columns:
        if is_datetime_dtype(data[name]):
            selected.append(name)
    return selected


def select_numeric_or_datetime_columns(data: nw.DataFrame) -> list[str | int]:
    """Return, in column order, the names of the columns that are numeric or datetime.

    A column is kept when ``is_numeric_dtype`` accepts it; ``is_datetime_dtype``
    is only consulted for columns that are not numeric.
    """
    selected: list[str | int] = []
    for name in data.columns:
        if is_numeric_dtype(data[name]):
            selected.append(name)
        elif is_datetime_dtype(data[name]):
            selected.append(name)
    return selected


def create_empty_boolean_mask(data: nw.DataFrame) -> nw.DataFrame:
    """Build an all-``False`` frame with the same columns and length as ``data``.

    The frame is created through ``nw.from_dict``, using the native namespace
    of ``data`` as the backend.
    """
    all_false = [False] * len(data)
    # Every column maps to the same list object, exactly as dict.fromkeys would do.
    mask_columns = {name: all_false for name in data.columns}
    native_namespace = nw.get_native_namespace(data)
    return nw.from_dict(mask_columns, backend=native_namespace)


def cast_series_like(series: nw.Series, like: nw.Series, column: int | str) -> nw.Series:
    """Return ``series`` converted to the dtype of ``like``, falling back to ``series`` itself.

    Args:
        series: The series to convert.
        like: The series whose dtype is the conversion target.
        column: Column identifier; only used in the warning message.

    Returns:
        ``series`` unchanged when the dtypes already compare equal or when the
        cast raises (a warning is emitted in the latter case); otherwise the
        result of ``series.cast``.
    """
    target_dtype: IntoDType = like.dtype
    already_matching = series.dtype == target_dtype
    if already_matching:
        return series

    try:
        converted = series.cast(target_dtype)
    except Exception as exc:  # noqa: BLE001
        # Best effort: any failure of the cast keeps the series as it was.
        warnings.warn(
            f"Failed to cast column {column} to dtype {like.dtype}: {exc}. Keeping inferred dtype.",
            stacklevel=2,
        )
        return series
    return converted


def _values_to_list(values: Sequence[Any] | np.ndarray) -> list[Any]:
"""Normalize values into a list for nw.new_series."""
if isinstance(values, np.ndarray):
return values.tolist()
return list(values)


def new_series_like(data: nw.DataFrame, column: int | str, values: Sequence[Any] | np.ndarray) -> nw.Series:
    """Build a series for ``column`` from ``values`` and cast it to that column's current dtype.

    Args:
        data: The frame that supplies the column name, the reference dtype and the backend.
        column: Column name or positional index.
        values: The values for the new series.

    Returns:
        The new series after passing through ``cast_series_like``.
    """
    name = get_column_str(data, column)
    reference = get_column(data, column)
    fresh = nw.new_series(
        name,
        _values_to_list(values),
        backend=nw.get_native_namespace(data),
    )
    return cast_series_like(fresh, reference, column)
57 changes: 31 additions & 26 deletions tab_err/api/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import warnings
from typing import TYPE_CHECKING

import pandas as pd
import narwhals as nw

from tab_err import ErrorMechanism, ErrorType, error_mechanism, error_type
from tab_err._error_model import ErrorModel
from tab_err._utils import check_data_emptiness, check_error_rate, seed_randomness_and_get_generator
from tab_err.api import MidLevelConfig, mid_level

if TYPE_CHECKING:
from narwhals.typing import IntoDataFrame
from numpy.random import Generator


Expand Down Expand Up @@ -39,15 +40,15 @@ def _are_same_error_mechanism(error_mechanism1: ErrorMechanism, error_mechanism2


def _build_column_type_dictionary(
data: pd.DataFrame,
data: nw.DataFrame,
random_generator: Generator,
error_types_to_include: list[ErrorType] | None = None,
error_types_to_exclude: list[ErrorType] | None = None,
) -> dict[int | str, list[ErrorType]]:
"""Creates a dictionary mapping from column names to the list of valid error types to apply to that column.

Args:
data (pd.DataFrame): The pandas DataFrame to create errors in.
data (nw.DataFrame): The DataFrame to create errors in.
random_generator (Generator): Random Generator.
error_types_to_include (list[ErrorType] | None, optional): A list of the error types to be included when building error models. Defaults to None.
error_types_to_exclude (list[ErrorType] | None, optional): A list of the error types to be excluded when building error models. Defaults to None.
Expand Down Expand Up @@ -92,7 +93,7 @@ def _build_column_type_dictionary(
# else: do nothing because the default behavior uses all error types

if len(error_types_applied) == 0:
msg = "The list of error types to be applied cannot have length 0. Use the default or resturcture your input."
msg = "The list of error types to be applied cannot have length 0. Use the default or restructure your input."
raise ValueError(msg)

return {
Expand All @@ -101,15 +102,15 @@ def _build_column_type_dictionary(


def _build_column_mechanism_dictionary(
data: pd.DataFrame,
data: nw.DataFrame,
random_generator: Generator,
error_mechanisms_to_include: list[ErrorMechanism] | None = None,
error_mechanisms_to_exclude: list[ErrorMechanism] | None = None,
) -> dict[int | str, list[ErrorMechanism]]:
"""Builds a dictionary mapping from column names to the list of valid error mechanisms to apply to that column.

Args:
data (pd.DataFrame): The pandas DataFrame to create errors in.
data (nw.DataFrame): The DataFrame to create errors in.
random_generator (Generator): Random Generator.
error_mechanisms_to_include (list[ErrorMechanism] | None, optional): The error mechanisms (EAR, ECAR, ENAR) to include from the dictionary.
Defaults to None.
Expand All @@ -123,7 +124,7 @@ def _build_column_mechanism_dictionary(
msg = "Possible conflict in error mechanisms to apply. Set at least on of: error_mechanisms_to_exclude or error_mechanisms_to_include to None."
raise ValueError(msg)

columns_mechanisms = {}
columns_mechanisms: dict[int | str, list[ErrorMechanism]] = {}

if error_mechanisms_to_include is not None and error_mechanisms_to_exclude is None: # Include specified
if not all(issubclass(type(cls), ErrorMechanism) for cls in error_mechanisms_to_include): # Check input
Expand Down Expand Up @@ -164,19 +165,19 @@ def _build_column_mechanism_dictionary(


def _build_column_number_of_models_dictionary(
data: pd.DataFrame, column_types: dict[int | str, list[ErrorType]], column_mechanisms: dict[int | str, list[ErrorMechanism]]
data: nw.DataFrame, column_types: dict[int | str, list[ErrorType]], column_mechanisms: dict[int | str, list[ErrorMechanism]]
) -> dict[int | str, int]:
"""Builds a dictionary mapping from column names to the number of error models to apply to that column.

Args:
data (pd.DataFrame): The pandas DataFrame to create errors in.
data (nw.DataFrame): The DataFrame to create errors in.
column_types (dict[int | str, list[ErrorType]]): A dictionary mapping from column names to the list of valid error types to apply to that column.
column_mechanisms (dict[int | str, list[ErrorMechanism]]): A dictionary mapping from column names to the list of valid error mechanisms to apply.

Returns:
dict[int | str, int]: A dictionary mapping from column names to the number of error models to apply to that column.
"""
column_num_models = {}
column_num_models: dict[int | str, int] = {}

for column in data.columns:
column_num_models[column] = len(column_types[column]) * len(column_mechanisms[column])
Expand All @@ -189,19 +190,19 @@ def _build_column_number_of_models_dictionary(


def create_errors( # noqa: PLR0913
data: pd.DataFrame,
data: IntoDataFrame,
error_rate: float,
n_error_models_per_column: int = 1,
error_types_to_include: list[ErrorType] | None = None,
error_types_to_exclude: list[ErrorType] | None = None,
error_mechanisms_to_include: list[ErrorMechanism] | None = None,
error_mechanisms_to_exclude: list[ErrorMechanism] | None = None,
seed: int | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
) -> tuple[IntoDataFrame, IntoDataFrame]:
"""Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate.

Args:
data (pd.DataFrame): The pandas DataFrame to create errors in.
data (IntoDataFrame): The DataFrame to create errors in. Supports pandas, Polars, and (experimental) other narwhals-compatible backends.
error_rate (float): The maximum error rate to be introduced to each column in the DataFrame.
n_error_models_per_column (int, optional): The number of valid error models to apply to each column. Defaults to 1.
error_types_to_include (list[ErrorType] | None, optional): A list of the error types to be included when building error models. Defaults to None.
Expand All @@ -215,39 +216,43 @@ def create_errors( # noqa: PLR0913
seed (int | None, optional): Random seed. Defaults to None.

Returns:
tuple[pd.DataFrame, pd.DataFrame]:
tuple[IntoDataFrame, IntoDataFrame]:
- The first element is a copy of 'data' with errors.
- The second element is the associated error mask.
Both are returned in the same format as the input data.
"""
# Wrap native DataFrame to narwhals
data_nw = nw.from_native(data, eager_only=True)

random_generator = seed_randomness_and_get_generator(seed=seed)
# Input Checking
check_error_rate(error_rate)
check_data_emptiness(data)
check_data_emptiness(data_nw)

# Set Up Data
data_copy = data.copy()
error_mask = pd.DataFrame(data=False, index=data.index, columns=data.columns)
data_copy = data_nw.clone()

# Build Dictionaries
col_type = _build_column_type_dictionary(
data=data, random_generator=random_generator, error_types_to_include=error_types_to_include, error_types_to_exclude=error_types_to_exclude
data=data_nw, random_generator=random_generator, error_types_to_include=error_types_to_include, error_types_to_exclude=error_types_to_exclude
)
col_mechanisms = _build_column_mechanism_dictionary(
data=data,
data=data_nw,
random_generator=random_generator,
error_mechanisms_to_include=error_mechanisms_to_include,
error_mechanisms_to_exclude=error_mechanisms_to_exclude,
)
col_num_models = _build_column_number_of_models_dictionary(data=data, column_types=col_type, column_mechanisms=col_mechanisms)
col_num_models = _build_column_number_of_models_dictionary(data=data_nw, column_types=col_type, column_mechanisms=col_mechanisms)

if n_error_models_per_column > 0:
error_rate = error_rate / n_error_models_per_column
config_dictionary: dict[str | int, list[ErrorModel]] = {
column: [] for column in data.columns if col_num_models[column] > 0
column: [] for column in data_nw.columns if col_num_models[column] > 0
} # Filter out those columns with no valid error models

if error_rate * len(data) < 1: # This value is calculated and rounded to 0 in the sample function of the error mechanism subclasses "n_errors"
msg = f"With a per-model error rate of: {error_rate} and {len(data)} rows, 0 errors will be introduced."
n_rows = len(data_nw)
if error_rate * n_rows < 1: # This value is calculated and rounded to 0 in the sample function of the error mechanism subclasses "n_errors"
msg = f"With a per-model error rate of: {error_rate} and {n_rows} rows, 0 errors will be introduced."
warnings.warn(msg, stacklevel=2)

for column, error_model_list in config_dictionary.items():
Expand All @@ -264,6 +269,6 @@ def create_errors( # noqa: PLR0913
msg = f"n_error_models_per_column is: {n_error_models_per_column} and should be a positive integer"
raise ValueError(msg)

# Create Errors & Return
dirty_data, error_mask = mid_level.create_errors(data_copy, config)
return dirty_data, error_mask
# Create Errors & Return (mid_level handles native conversion)
dirty_data_native, error_mask_native = mid_level.create_errors(nw.to_native(data_copy), config)
return dirty_data_native, error_mask_native
Loading