Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ jobs:
shell: bash -l {0} # Ensures the Conda environment is properly loaded
run: |
export PYTHONPATH=$PYTHONPATH:$(pwd)
pytest -v tests/tests_encoding.py
pytest
6 changes: 4 additions & 2 deletions data/hooks/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,8 +396,10 @@ def fit(self, x):
# Iterate through the columns in the OHE in our specified order to identify clusters of infrequent groups
col_group = []
for c in self.order:
# If this column doesn't exist in the OHE, skip it entirely
# If this column doesn't exist in the OHE, treat it as frequency 0 and add a homogenous column to the ohe_df
if c not in ohe_df.columns:
col_group.append(c)
ohe_df[c] = 0
continue

# Otherwise, append it to our current column group
Expand Down Expand Up @@ -475,7 +477,7 @@ def ohe_to_ladder(self, x_df: pd.DataFrame):
rung_val = rung_df.any(axis="columns")

# Generate a string representing the combination of columns which will be grouped
group_str = "|".join(g)
group_str = "|".join([str(x) for x in g])

# Generate the corresponding column's name, but only if we had a prior group (the first col will be dropped)
col_name = ""
Expand Down
19 changes: 19 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
This package uses PyTest to run its tests, but due to the way it's structured, a little bit of setup is required:

1. Ensure all required packages are installed (see `environment.yml` in the directory above this one)
2. Activate the environment you installed the packages into (if you installed them into an environment other than your "base" environment)
3. Add the path of Modular Optuna ML to the Python path, allowing PyTest to see its contents.

```bash
cd ..
export PYTHONPATH=$PYTHONPATH:"$PWD"
cd tests
```

4. Run the tests you want. For example, to run all tests:

```bash
pytest
```

If you are running PyTest through an IDE (such as PyCharm), you may also need to mark the `tests` directory as a testing directory to do in-place runs.
81 changes: 81 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import json
import tempfile
from pathlib import Path
from typing import Self

import numpy as np
import pandas as pd
import pytest

from config.data import DataConfig
from data.base import BaseDataManager


class DummyDataManager(BaseDataManager):
    """Minimal in-memory data manager intended for automated testing.

    It simply wraps a caller-supplied DataFrame. Building from a config and
    splitting are deliberately unsupported and raise ``NotImplementedError``.
    """

    def __init__(self, new_data: pd.DataFrame, **kwargs):
        """Wrap ``new_data``; any extra keyword arguments go to the superclass.

        :param new_data: The DataFrame this manager should expose via ``data``.
        :param kwargs: Forwarded unchanged to ``BaseDataManager.__init__``.
        """
        # Initiate superclass stuff
        super().__init__(**kwargs)

        # Set the DataFrame managed by this object to be the dataframe provided
        self._data = new_data

    @property
    def data(self) -> pd.DataFrame:
        """The DataFrame currently managed by this object."""
        return self._data

    def _replace_data(self, new_df: pd.DataFrame) -> None:
        """Swap the managed DataFrame for ``new_df`` (no copy is made)."""
        self._data = new_df

    def shallow_copy(self) -> Self:
        """Return a new manager wrapping the *same* DataFrame object.

        Only the DataFrame is carried over; keyword arguments given to the
        original constructor are not re-supplied.
        """
        return DummyDataManager(self._data)

    @classmethod
    def from_config(cls, config: dict) -> Self:
        """Unsupported for the dummy manager.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError(
            "This is a dummy data manager meant for automated testing; it cannot be built from a config file."
        )

    def pre_split(self, is_cross: bool, targets: Self | None = None) -> Self:
        """Unsupported for the dummy manager.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError(
            "This is a dummy data manager meant for automated testing; it should not be split."
        )

    def split(self, train_idx: np.ndarray, test_idx: np.ndarray, train_target: Self, test_target: Self,
              is_cross: bool = True) -> tuple[Self, Self]:
        """Unsupported for the dummy manager.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError(
            "This is a dummy data manager meant for automated testing; it should not be split."
        )

    def __len__(self) -> int:
        """Number of rows in the managed DataFrame."""
        return self._data.shape[0]


@pytest.fixture
def iris_data_config():
    """
    Fixture that yields a DataConfig instance for the Iris dataset.
    The tsv file is loaded as a TabularDataManager instance: data_config.data_manager
    Loaded data is stored as a pandas DataFrame: data_config.data_manager.data

    The JSON config is written to a temporary file which is removed again once the
    requesting test has finished, so repeated test runs do not leave files behind.
    """
    config_dict = {
        "label": "IrisTesting",
        "format": "tabular",
        "data_source": str(Path(__file__).parent.resolve() / 'testing_files' / 'iris_data' / 'iris_testing.tsv'),
        "separator": "\t",
        "index": "id",
    }

    # Create a temporary file for the config. delete=False keeps the file on disk after
    # the handle is closed so that it can be re-opened by path below.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json") as tmp_file:
        json.dump(config_dict, tmp_file)
        tmp_config_path = Path(tmp_file.name)  # Convert string path to Path object

    try:
        # Ensure the file exists before using it
        assert tmp_config_path.exists(), f"Temporary config file {tmp_config_path} was not created."

        # Yield (rather than return) so the file outlives the test, in case the config re-reads it
        yield DataConfig.from_json_file(tmp_config_path)
    finally:
        # Teardown: we created the file with delete=False, so we are responsible for removing it
        tmp_config_path.unlink(missing_ok=True)


Loading
Loading