Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Iterable, Self, Sequence, Type

import numpy as np
import pandas as pd
from optuna import Trial

from tuning.utils import Tunable, TunableParam
Expand All @@ -23,6 +24,14 @@ def __init__(self, logger: Logger = Logger.root, **kwargs):
# Keep tabs on the list of tunable parameters
self.tunable_hooks: list[Tunable] = []

# Abstract read-only property; subclasses are expected to override it.
@property
@abstractmethod
def data(self) -> pd.DataFrame:
"""
A pandas DataFrame which mediates the samples and their features for us.

:return: The DataFrame holding this manager's samples and features.
"""
...

def tune(self, trial: Trial):
for h in self.tunable_hooks:
h.tune(trial)
Expand Down Expand Up @@ -97,6 +106,45 @@ def split(
"""
...

@abstractmethod
def features(self) -> Iterable[str]:
# List all features available in the dataset, as an iterable of strings.
# Abstract: the body is a placeholder, so subclasses supply the actual listing.
...

@abstractmethod
def get_features(self, idx) -> Self:
"""
Explicitly query for some features within this DataManager.

Abstract: the body is a placeholder, so the lookup itself is left to subclasses.

:param idx: The feature(s) to get from this class.
:return: A subset of the DataManager's data with only the requested features.
This should *always* be an instance of the same class, so that calls can be chained.
"""
...

@abstractmethod
def set_features(self, idx, new_data) -> Self:
"""
Set the values of some feature(s), overwriting them if they already exist.

Abstract: the body is a placeholder, so the update itself is left to subclasses.

:param idx: The feature(s) to overwrite or set.
:param new_data: The data to use for those feature(s).
:return: An instance of the data manager with the new features.
"""
...

@abstractmethod
def drop_features(self, idx) -> Self:
"""
Drop some subset of features from the dataset.

Abstract: the body is a placeholder, so the removal itself is left to subclasses.

:param idx: The feature(s) to drop.
:return: A modified version of this instance.
"""
...

@abstractmethod
def n_features(self) -> int:
# Just returns the number of features in this dataset; required for certain checks.
# Abstract: the body is a placeholder, so subclasses supply the actual count.
...

@abstractmethod
def __len__(self):
# How this is done will depend on the backing data structure
Expand Down
172 changes: 67 additions & 105 deletions data/hooks/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from data import BaseDataManager
from data.hooks import registered_data_hook
from data.hooks.base import FittedDataHook
from data.mixins import MultiFeatureMixin


@registered_data_hook("one_hot_encode")
Expand Down Expand Up @@ -68,28 +67,22 @@ def from_config(cls, config: dict, logger: Logger = Logger.root) -> Self:
return cls(config, logger=logger)

def run(self, x: BaseDataManager, y: Optional[BaseDataManager] = None) -> BaseDataManager:
# If this is multi-feature dataset, sub-features can be selected
if isinstance(x, MultiFeatureMixin):
# Update the list of tracked features based on the dataset
self.update_tracked_features(x)

# Fit to and transform the training data first
x: BaseDataManager | MultiFeatureMixin
tmp_x = x.get_features(self.tracked_features)
# noinspection PyUnresolvedReferences
tmp_x = self.backing_encoder.fit_transform(tmp_x.as_array())
# Densify result if it is in a sparse format
if hasattr(tmp_x, "todense"):
tmp_x = tmp_x.todense()

# Generate the new feature names based on this transform, and delete the old ones!
new_features = self.backing_encoder.get_feature_names_out(self.tracked_features)
x_out = x.drop_features(self.tracked_features)
x_out = x_out.set_features(new_features, tmp_x)
# Otherwise, just fit and transform everything in bulk
# TODO: Implement a method of converting back to the original DataManager type
else:
x_out = self.backing_encoder.fit_transform(x.as_array())
# Update the list of tracked features based on the dataset
self.update_tracked_features(x)

# Fit to and transform the training data first
tmp_x = x.get_features(self.tracked_features)
# noinspection PyUnresolvedReferences
tmp_x = self.backing_encoder.fit_transform(tmp_x.as_array())
# Densify result if it is in a sparse format
if hasattr(tmp_x, "todense"):
tmp_x = tmp_x.todense()

# Generate the new feature names based on this transform, and delete the old ones!
new_features = self.backing_encoder.get_feature_names_out(self.tracked_features)
x_out = x.drop_features(self.tracked_features)
x_out = x_out.set_features(new_features, tmp_x)

return x_out

def run_fitted(self,
Expand All @@ -98,29 +91,20 @@ def run_fitted(self,
y_train: Optional[BaseDataManager] = None,
y_test: Optional[BaseDataManager] = None
) -> (BaseDataManager, BaseDataManager):
# If this is multi-feature dataset, sub-features can be selected
if isinstance(x_train, MultiFeatureMixin):
# Fit to and transform the training data first
x_train: BaseDataManager | MultiFeatureMixin
train_out = self.run(x_train, y_train)

# Generate the new feature names based on this transform, and delete the old ones!
new_features = self.backing_encoder.get_feature_names_out(self.tracked_features)

# Then ONLY transform the testing data
x_test: BaseDataManager | MultiFeatureMixin
tmp_test: BaseDataManager | MultiFeatureMixin = x_test.get_features(self.tracked_features)
tmp_test = self.backing_encoder.transform(tmp_test.as_array())
if hasattr(tmp_test, "todense"):
tmp_test = tmp_test.todense()
test_out = x_test.drop_features(self.tracked_features)
test_out = test_out.set_features(new_features, tmp_test)

# Otherwise, just fit and transform everything in bulk
# TODO: Implement a method of converting back to the original DataManager type
else:
train_out = self.backing_encoder.fit_transform(x_train.as_array())
test_out = self.backing_encoder.fit_transform(x_test.as_array())
# Fit to and transform the training data first
train_out = self.run(x_train, y_train)

# Generate the new feature names based on this transform, and delete the old ones!
new_features = self.backing_encoder.get_feature_names_out(self.tracked_features)

# Then ONLY transform the testing data
tmp_test = x_test.get_features(self.tracked_features)
tmp_test = self.backing_encoder.transform(tmp_test.as_array())
if hasattr(tmp_test, "todense"):
tmp_test = tmp_test.todense()
test_out = x_test.drop_features(self.tracked_features)
test_out = test_out.set_features(new_features, tmp_test)

return train_out, test_out

def update_tracked_features(self, x):
Expand Down Expand Up @@ -202,18 +186,14 @@ def from_config(cls, config: dict, logger: Logger = Logger.root) -> Self:

def run(self, x: BaseDataManager, y: Optional[BaseDataManager] = None) -> BaseDataManager:
# If this is a multi-feature dataset, select the relevant features
if isinstance(x, MultiFeatureMixin):
self.update_tracked_features(x)
tmp_x = x.get_features(self.tracked_features).as_array()
self.update_tracked_features(x)
tmp_x = x.get_features(self.tracked_features).as_array()

# Fit+transform (for training) the ordinal encoder
tmp_x = self.backing_encoder.fit_transform(tmp_x)
# Fit+transform (for training) the ordinal encoder
tmp_x = self.backing_encoder.fit_transform(tmp_x)

# Replace them with the new columns
x_out = x.set_features(self.tracked_features, tmp_x)
else:
# Not a multi-feature dataset, so encode the entire array
x_out = self.backing_encoder.fit_transform(x.as_array())
# Replace them with the new columns
x_out = x.set_features(self.tracked_features, tmp_x)

return x_out

Expand All @@ -223,19 +203,14 @@ def run_fitted(self,
y_train: Optional[BaseDataManager] = None,
y_test: Optional[BaseDataManager] = None
) -> (BaseDataManager, BaseDataManager):
if isinstance(x_train, MultiFeatureMixin):
# Fit+transform on the training data
train_out = self.run(x_train, y_train)

# Transform (only) the test data
tmp_test = x_test.get_features(self.tracked_features).as_array()
tmp_test = self.backing_encoder.transform(tmp_test)
test_out = x_test.drop_features(self.tracked_features)
test_out = test_out.set_features(self.tracked_features, tmp_test)
else:
# Encode the entire arrays when not using multi-feature
train_out = self.backing_encoder.fit_transform(x_train.as_array())
test_out = self.backing_encoder.transform(x_test.as_array())
# Fit+transform on the training data
train_out = self.run(x_train, y_train)

# Transform (only) the test data
tmp_test = x_test.get_features(self.tracked_features).as_array()
tmp_test = self.backing_encoder.transform(tmp_test)
test_out = x_test.drop_features(self.tracked_features)
test_out = test_out.set_features(self.tracked_features, tmp_test)

return train_out, test_out

Expand Down Expand Up @@ -366,27 +341,21 @@ def from_config(cls, config: dict, logger: Logger = Logger.root) -> Self:
return cls(config, logger=logger)

def run(self, x: BaseDataManager, y: Optional[BaseDataManager] = None) -> BaseDataManager:
# If this is multi-feature dataset, sub-features can be selected
if isinstance(x, MultiFeatureMixin):
# Fit to and transform the training data first
x: BaseDataManager | MultiFeatureMixin
# Setup
sub_x = x.get_features([self.feature])

# Setup
sub_x = x.get_features([self.feature])
# Fit this model to the provided feature subset
x_df = self.fit(sub_x)

# Fit this model to the provided feature subset
x_df = self.fit(sub_x)
# Use the (now fit) encoder to generate our encoded data
x_df = self.ohe_to_ladder(x_df)

# Use the (now fit) encoder to generate our encoded data
x_df = self.ohe_to_ladder(x_df)
# Update the dataset using these new feature names
x_out = x.drop_features([self.feature])
x_out = x_out.set_features(x_df.columns, x_df.to_numpy())

# Update the dataset using these new feature names
x_out = x.drop_features([self.feature])
x_out = x_out.set_features(x_df.columns, x_df.to_numpy())
x_out: MultiFeatureMixin | BaseDataManager

# Return the result
return x_out
# Return the result
return x_out

def fit(self, x):
"""
Expand Down Expand Up @@ -461,31 +430,24 @@ def run_fitted(self,
y_train: Optional[BaseDataManager] = None,
y_test: Optional[BaseDataManager] = None
) -> (BaseDataManager, BaseDataManager):
# If this is multi-feature dataset, sub-features can be selected
if isinstance(x_train, MultiFeatureMixin):
# Fit to and transform the training data first
x_train: BaseDataManager | MultiFeatureMixin
x_test: BaseDataManager | MultiFeatureMixin
train_out = self.run(x_train, y_train)
# Fit to and transform the training data first
train_out = self.run(x_train, y_train)

# Use the now-fit encoder to transform our testing input
sub_x_train: BaseDataManager | MultiFeatureMixin = x_test.get_features([self.feature])
sub_x_train = self.backing_ohe_encoder.transform(sub_x_train.as_array())
# Use the now-fit encoder to transform our testing input
sub_x_train = x_test.get_features([self.feature])
sub_x_train = self.backing_ohe_encoder.transform(sub_x_train.as_array())

ohe_feature_cols = self.backing_ohe_encoder.get_feature_names_out([self.feature])
ohe_feature_cols = self.backing_ohe_encoder.get_feature_names_out([self.feature])

# Convert it to a dataframe for ease of use
ohe_df = pd.DataFrame(sub_x_train, columns=ohe_feature_cols)
# Convert it to a dataframe for ease of use
ohe_df = pd.DataFrame(sub_x_train, columns=ohe_feature_cols)

x_df = self.ohe_to_ladder(ohe_df)
x_df = self.ohe_to_ladder(ohe_df)

# Update the testing dataset with these results
x_test = x_test.drop_features([self.feature])
test_out = x_test.set_features(x_df.columns, x_df.to_numpy())
# Update the testing dataset with these results
x_test = x_test.drop_features([self.feature])
test_out = x_test.set_features(x_df.columns, x_df.to_numpy())

# Ladder Encoding only makes sense in the context of multiple features; as such, any other type will not work!
else:
raise NotImplementedError("Ladder Encoding only makes sense in the context of a multi-feature dataset!")
return train_out, test_out


Expand Down
Loading
Loading