diff --git a/.github/workflows/docbuild.yml b/.github/workflows/docbuild.yml index 679bd09a..392444d7 100644 --- a/.github/workflows/docbuild.yml +++ b/.github/workflows/docbuild.yml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: ["3.10", "3.11"] steps: - name: Checkout repo diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index aff5cdcb..3783f73e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: ["3.10", "3.11"] steps: - name: Checkout repo @@ -30,12 +30,11 @@ jobs: pydocstyle - name: Test run: | - cp $(python -c 'import site; print(site.getsitepackages()[0])')/afqinsight/_version.py afqinsight/_version.py - tox + cd && mkdir for_test && cd for_test && pytest --pyargs afqinsight --cov-report term-missing --cov=afqinsight - name: Coveralls run: | coveralls - if: matrix.python-version == 3.8 + if: matrix.python-version == 3.10 env: COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b17d1dbd..2e49df86 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/python/black - rev: 22.3.0 + rev: 24.2.0 hooks: - id: black - language_version: python3.8 + language_version: python diff --git a/afqinsight/__init__.py b/afqinsight/__init__.py index 2a2b1873..c0b40cf4 100755 --- a/afqinsight/__init__.py +++ b/afqinsight/__init__.py @@ -1,4 +1,5 @@ """AFQ-Insight is a Python library for statistical learning of tractometry data.""" + from . import datasets # noqa from . 
import utils # noqa from .cross_validate import * # noqa diff --git a/afqinsight/_serial_bagging.py b/afqinsight/_serial_bagging.py index fc46b166..fe06d6ee 100644 --- a/afqinsight/_serial_bagging.py +++ b/afqinsight/_serial_bagging.py @@ -9,6 +9,7 @@ parallelism when using a dask.distributed backend, I will gladly remove this private module. @richford """ + import itertools import numbers import numpy as np @@ -383,7 +384,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): # Validate max_features if isinstance(self.max_features, numbers.Integral): max_features = self.max_features - elif isinstance(self.max_features, np.float): + elif isinstance(self.max_features, float): max_features = self.max_features * self.n_features_in_ else: raise ValueError("max_features must be int or float") @@ -897,7 +898,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): # Validate max_features if isinstance(self.max_features, numbers.Integral): max_features = self.max_features - elif isinstance(self.max_features, np.float): # pragma: no cover + elif isinstance(self.max_features, float): # pragma: no cover max_features = self.max_features * self.n_features_in_ else: # pragma: no cover raise ValueError("max_features must be int or float") diff --git a/afqinsight/augmentation/__init__.py b/afqinsight/augmentation/__init__.py index b40f6984..69033fef 100644 --- a/afqinsight/augmentation/__init__.py +++ b/afqinsight/augmentation/__init__.py @@ -13,4 +13,5 @@ augmentation for time series classification with neural networks," PLOS ONE 16(7): e0254841. 
DOI: https://doi.org/10.1371/journal.pone.0254841 """ + from .augmentation import * # noqa: F401,F403 diff --git a/afqinsight/augmentation/augmentation.py b/afqinsight/augmentation/augmentation.py index 57131646..916cbb9e 100644 --- a/afqinsight/augmentation/augmentation.py +++ b/afqinsight/augmentation/augmentation.py @@ -13,6 +13,7 @@ augmentation for time series classification with neural networks," PLOS ONE 16(7): e0254841. DOI: https://doi.org/10.1371/journal.pone.0254841 """ + import numpy as np from tqdm import tqdm diff --git a/afqinsight/augmentation/dtw.py b/afqinsight/augmentation/dtw.py index 4007c904..4d99519a 100644 --- a/afqinsight/augmentation/dtw.py +++ b/afqinsight/augmentation/dtw.py @@ -11,6 +11,7 @@ augmentation for time series classification with neural networks," PLOS ONE 16(7): e0254841. DOI: https://doi.org/10.1371/journal.pone.0254841 """ + import numpy as np import sys diff --git a/afqinsight/cnn.py b/afqinsight/cnn.py deleted file mode 100644 index c32207af..00000000 --- a/afqinsight/cnn.py +++ /dev/null @@ -1,621 +0,0 @@ -"""Build, fit, and predict with 1-D convolutional neural networks.""" - -import functools -import numpy as np -import os.path as op -import tempfile - -from dipy.utils.optpkg import optional_package -from sklearn.impute import SimpleImputer -from sklearn.metrics import r2_score -from sklearn.model_selection import train_test_split -from sklearn.utils.validation import check_X_y, check_is_fitted - -keras_msg = ( - "To use afqinsight's convolutional neural nets for tractometry data, you will need " - "to have tensorflow and kerastuner installed. You can do this by installing " - "afqinsight with `pip install afqinsight[tf]`, or by separately installing these packages " - "with `pip install tensorflow keras-tuner`." 
-) - -kt, _, _ = optional_package("keras_tuner", keras_msg) -tf, has_tf, _ = optional_package("tensorflow", keras_msg) - -if has_tf: - from tensorflow.keras.models import Sequential - from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPool1D, Dropout - from tensorflow.keras.callbacks import ModelCheckpoint - - -def build_model(hp, conv_layers, input_shape): - """Build a keras model. - - Uses keras tuner to build model - can control # layers, # filters in each layer, kernel size, - regularization etc - - Parameters - ---------- - hp : tensorflow.keras.HyperParameters() - Hyperparameters class from which to sample hyperparameters - - conv_layers : int - number of layers (one layer is Conv and MaxPool) in the sequential model. - - input_shape : int - input shape of X so the model gets built continuously as you are adding layers - - Returns - ------- - model : tensorflow.keras.Model - compiled model that uses hyperparameters defined inline to hypertune the model - - """ - model = Sequential() - model.add( - Conv1D( - filters=hp.Int("init_conv_filters", min_value=32, max_value=512, step=32), - kernel_size=hp.Int("init_conv_kernel", min_value=1, max_value=4, step=1), - activation="relu", - input_shape=input_shape, - ) - ) - - for i in range(conv_layers - 1): - model.add( - Conv1D( - filters=hp.Int( - "conv_filters" + str(i), min_value=32, max_value=512, step=32 - ), - kernel_size=hp.Int( - "conv_kernel" + str(i), min_value=1, max_value=4, step=1 - ), - activation="relu", - ) - ) - - model.add(MaxPool1D(pool_size=2, padding="same")) - - model.add(Dropout(0.25)) - model.add(Flatten()) - - dense_filters_2 = hp.Int("dense_filters_2", min_value=32, max_value=512, step=32) - model.add(Dense(dense_filters_2, activation="relu")) - model.add(Dropout(0.25)) - model.add(Dense(64, activation="relu")) - model.add(Dense(1, activation="linear")) - - model.compile( - loss="mean_squared_error", optimizer="adam", metrics=["mean_squared_error"] - ) - - return model - - -class 
ModelBuilder: - """Build a complex model architecture with the specified number of layers. - - Parameters - ---------- - tuner_type : str or class. - Tuner to use. One of {"hyperband", "bayesian", "random"}. - - input_shape : tuple - Expected shape of the input data. - - layers : int - Number of layers in the model. - - max_epochs : int - Number of epochs to train the model. - - X_test : numpy.ndarray - Test data. - - y_test : numpy.ndarray - Test labels or test values. - - batch_size : int - Batch size to use when training. - - directory : str - Directory to save the model to. - - project_name : str, optional - A string, the name to use as prefix for files saved by the tuner object. Defaults to None - - tuner_kwargs : dict, optional - Keyword arguments to pass to the tuner class on initialization. - Defaults to tuner defaults. - """ - - def __init__( - self, - tuner_type, - input_shape, - layers, - max_epochs, - X_test, - y_test, - batch_size, - directory=None, - project_name=None, - **tuner_kwargs, - ): - - self.tuner_type = tuner_type - self.layers = layers - self.input_shape = input_shape - self.max_epochs = max_epochs - self.batch_size = batch_size - self.X_test = X_test - self.y_test = y_test - self.directory = directory - self.project_name = project_name - self.tuner_kwargs = tuner_kwargs - - def _get_tuner(self): - """Call build_model and instantiate a Keras Tuner for the returned model depending on user choice of tuner. 
- - Returns - ------- - tuner : kerastuner.tuners - BayesianOptimization, Hyperband, or RandomSearch tuner - - """ - # setting parameters beforehand - hypermodel = functools.partial( - build_model, conv_layers=self.layers, input_shape=self.input_shape - ) - if isinstance(self.tuner_type, str): - # instantiating tuner based on user's choice - if self.tuner_type == "hyperband": - tuner = kt.Hyperband( - hypermodel=hypermodel, - objective="mean_squared_error", - max_epochs=10, - overwrite=True, - project_name=self.project_name, - directory=self.directory, - **self.tuner_kwargs, - ) - - elif self.tuner_type == "bayesian": - tuner = kt.BayesianOptimization( - hypermodel=hypermodel, - objective="mean_squared_error", - max_trials=10, - overwrite=True, - project_name=self.project_name, - directory=self.directory, - **self.tuner_kwargs, - ) - - elif self.tuner_type == "random": - tuner = kt.RandomSearch( - hypermodel=hypermodel, - objective="mean_squared_error", - max_trials=10, - overwrite=True, - project_name=self.project_name, - directory=self.directory, - **self.tuner_kwargs, - ) - else: - raise ValueError( - f"tuner parameter expects 'hyperband', 'bayesian', or 'random', but you provided {self.tuner_type}" - ) - return tuner - # We do not cover the following line, because CNN also handles this - # error: - else: # pragma: no cover - raise TypeError( - f"`tuner` parameter should be a string, but you provided {self.tuner_type}" - ) - - def _get_best_weights(self, model, X, y): - """Fit a CNN and save the best weights. - - Use keras ModelCheckpoint to fit CNN and save the weights from the epoch - that produced the lowest validation loss to a temporary file. Uses - temporary file to load the best weights into the CNN model and returns - this best model. 
- - Parameters - ---------- - model : tensorflow.keras.Sequential() - Hyperparameters class from which to sample hyperparameters - - X : array-like of shape (n_samples, n_features) - The feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - model : tensorflow.keras.Model - fitted keras model with best weights loaded - - """ - weights_path = op.join(tempfile.mkdtemp(), "weights.hdf5") - # making model checkpoint to save best model (# epochs) to file - model_checkpoint_callback = ModelCheckpoint( - filepath=weights_path, - monitor="val_loss", - mode="auto", - save_best_only=True, - save_weights_only=True, - verbose=True, - ) - - # Fitting model using model checkpoint callback to find best model which is saved to 'weights' - model.fit( - X, - y, - epochs=self.max_epochs, - batch_size=self.batch_size, - callbacks=[model_checkpoint_callback], - validation_data=(self.X_test, self.y_test), - ) - # loading in weights - model.load_weights(weights_path) - - # return the model - return model - - def build_basic_model(self, X, y): - """Build a sequential model without hyperparameter tuning. - - Builds a static baseline sequential model with no hyperparameter tuning. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - model : tensorflow.keras.Model - compiled model using basic Weston Havens architecture - - """ - model = Sequential() - model.add(Dense(128, activation="relu", input_shape=X.shape[1:])) - model.add(Conv1D(24, kernel_size=2, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(32, kernel_size=2, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(64, kernel_size=3, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(128, kernel_size=4, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(256, kernel_size=4, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Dropout(0.25)) - model.add(Flatten()) - model.add(Dense(128, activation="relu")) - model.add(Dropout(0.25)) - model.add(Dense(64, activation="relu")) - model.add(Dense(1, activation="linear")) - - model.compile( - loss="mean_squared_error", optimizer="adam", metrics=["mean_squared_error"] - ) - - best_model = self._get_best_weights(model, X, y) - return best_model - - def build_tuned_model(self, X, y): - """Build a tuned model using Keras tuner. - - Initializes a Keras tuner on user's model, searches for best hyperparameters, and saves them. - Then builds "best" model using saved best hyperparameters found during the search and returns model - with best weights loaded from _get_best_weights. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - model : tensorflow.keras.Model - compiled model that uses hyperparameters defined inline to hypertune the model - - """ - # initialize tuner - tuner = self._get_tuner() - - # Find the optimal hyperparameters - tuner.search(X, y, epochs=50, validation_split=0.2) - - # Save the optimal hyperparameters - best_hps = tuner.get_best_hyperparameters(num_trials=1)[0] - - # make CNN model using best hyperparameters - model = tuner.hypermodel.build(best_hps) - - best_model = self._get_best_weights(model, X, y) - return best_model - - -class CNN: - """A Convolutional Neural Network model with a fit/predict interface. - - Parameters - ---------- - n_nodes : int - Number of nodes in each bundle profile. - - n_channels : int - Number of metrics in each bundle profile. - - max_epochs : int - Maximum number of epochs to train model. - - batch_size : int - Number of samples per batch. - - tuner_type : str - Type of hyperparameter tuner to use. One of 'hyperband', 'bayesian', or - 'random'. - - layers : int - Number of convolutional layers to use. - - test_size : float - Fraction of data to use as test set. - - impute_strategy : str, optional - Imputation strategy to use. One of 'mean', 'median', or 'knn'. - Default: "median". - - random_state : int or RandomState instance, optional - Default: None. - - directory : str, optional - Directory to save model and hyperparameters. Default: "." - - project_name : str, optional - A string, the name to use as prefix for files saved by the tuner - object. Defaults to None - - tuner_kwargs : dict, optional - Keyword arguments to pass to tuner. Default: tuner defaults. 
- """ - - def __init__( - self, - n_nodes, - n_channels, - max_epochs=50, - batch_size=32, - tuner_type=None, - layers=1, - test_size=0.2, - impute_strategy="median", - random_state=None, - directory=None, - project_name=None, - **tuner_kwargs, - ): - # checking n_nodes is passed as int - if not isinstance(n_nodes, int): - raise TypeError("Parameter n_nodes must be an integer.") - else: - self.n_nodes = n_nodes - - # checking n_channels is passed as int - if not isinstance(n_channels, int): - raise TypeError("Parameter n_channels must be an integer.") - else: - self.n_channels = n_channels - - # checking layers is passed as int - if not isinstance(layers, int): - raise TypeError("Parameter layers must be an integer.") - else: - self.layers = layers - - # checking max epochs is passed as int - if not isinstance(max_epochs, int): - raise TypeError("Parameter max_epochs must be an integer.") - else: - self.max_epochs = max_epochs - - if not isinstance(batch_size, int): - raise TypeError("Parameter batch_size must be an integer.") - else: - self.batch_size = batch_size - - # checking tiner is passed as str or None - if not isinstance(tuner_type, str) and tuner_type is not None: - raise TypeError("Parameter tuner must be str.") - else: - # tuner can be None (no tuning) BayesianOptimization, Hyperband, or RandomSearch - self.tuner_type = tuner_type - - # checking val split is passed as float - if not isinstance(test_size, float): - raise TypeError("Parameter test_size must be a float.") - else: - self.test_size = test_size - - # checking strategy is passed as str and has value of 'median', 'mean', or 'knn' - if not isinstance(impute_strategy, str): - raise TypeError("Parameter impute_strategy must be a string.") - elif impute_strategy not in ["median", "mean", "knn"]: - raise ValueError( - f"Parameter impute_strategy must be 'median', 'mean', or 'knn' but you provided {impute_strategy}" - ) - else: - self.impute_strategy = impute_strategy - - if random_state is not None: 
- if not (isinstance(random_state, int) or isinstance(np.random.RandomState)): - raise TypeError( - f"Parameter random_state must be an int or RandomState, but you provided {random_state}" - ) - self.random_state = random_state - - self.directory = directory - self.project_name = project_name - self.tuner_kwargs = tuner_kwargs - self.model_ = None - self.best_hps_ = None - - def _preprocess(self, X, y=None): - """Convert feature matrix for input into a CNN. - - Masks NAN values for X and y (if y is given), imputes X, and reshapes X - to be in proper form for CNN model. In more conventional machine - learning, X has shape (n_samples, n_features), where n_features is - n_nodes * n_bundles * n_metrics. However, in our CNN approach, we treat - each bundle/metric combination as a separate channel, analogous to RGB - channels in a 2D image. The remaining one dimension is the nodes - dimension. Thus the output has shape (n_samples, n_channels, n_nodes), - where n_channels = n_metrics * n_bundles. - - Parameters - ---------- - X : array-like of shape (n_samples, n_metrics * n_nodes) - Diffusion MRI tractometry features (columns) for each subject in the sample (rows). - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - X : array-like of shape (n_samples, n_channels, n_nodes) - The imputed and reshaped feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - """ - # n_nodes * n_channels must = X.shape[1] - if self.n_nodes * self.n_channels != X.shape[1]: - raise ValueError( - "The product n_nodes and n_channels is not the correct shape." 
- ) - - # We don't cover the following line, because this case is also handled - # in the fall to fit: - if len(X.shape) > 2: # pragma: no cover - raise ValueError("Expected X to be a 2D matrix.") - if y is not None: - nan_mask = np.logical_not(np.isnan(y)) - X = X[nan_mask, :] - y = y[nan_mask] - - imp = SimpleImputer(strategy=self.impute_strategy) - X = imp.fit_transform(X) - - if y is not None: - X, y = check_X_y(X, y) - - n_subjects = X.shape[0] - - X = np.swapaxes(X.reshape((n_subjects, self.n_channels, self.n_nodes)), 1, 2) - - if y is not None: - return X, y - else: - return X - - def fit(self, X, y): - """Fit the model. - - Preprocesses X and y, builds CNN model, tunes model hyperparameters and - fits the model to given X and y, using X_test and y_test to validate and - find best weights and hyperparameters. - - Parameters - ---------- - X : array-like of shape (n_samples, n_metrics * n_nodes) - Diffusion MRI tractometry features (columns) for each subject (rows). - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - self : CNN - updated CNN instantiation - - """ - X, y = self._preprocess(X, y) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=self.test_size, random_state=self.random_state - ) - # CNN gets n_nodes, n_channels, max_epochs, tuner=None, layers=None - # Model Builder takes tuner_type, input_shape, layers, max_epochs, **kwargs - builder = ModelBuilder( - self.tuner_type, - X_train.shape[1:], - self.layers, - self.max_epochs, - X_test, - y_test, - self.batch_size, - self.directory, - self.project_name, - **self.tuner_kwargs, - ) - if self.tuner_type is None: - self.model_ = builder.build_basic_model(X_train, y_train) - else: - self.model_ = builder.build_tuned_model(X_train, y_train) - - self.is_fitted_ = True - - return self - - def predict(self, X): - """Predict target values. - - Preprocesses X and returns predicted y values for X from fitted CNN model. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_metrics * n_nodes) - Tractometry features (columns) for each subject in the sample (rows). - - Returns - ------- - pred : array-like of shape (n_samples,) or (n_samples, n_targets) - predicted values - """ - X = self._preprocess(X) - check_is_fitted(self, "is_fitted_") - pred = self.model_.predict(X).squeeze() - return pred - - def score(self, y_test, y_hat): - """Score the performance of the model. - - Masks out NaN values from y_test and returns $R^2$ score for the CNN model comparing to y_hat - - Parameters - ---------- - y_test : array-like of shape (n_samples,) or (n_samples, n_targets) - Testing target values - - y_hat : array-like of shape (n_samples,) or (n_samples, n_targets) - Predicted target values - - Returns - ------- - r2_score : float - r-squared score for y_test and y_hat for CNN model - - """ - nan_mask = np.logical_not(np.isnan(y_test)) - y_test = y_test[nan_mask] - return r2_score(y_test, y_hat) diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index d75ba20b..0925ba8b 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -1,4 +1,5 @@ """Generate samples of synthetic data sets or extract AFQ data.""" + import hashlib import numpy as np import os @@ -22,7 +23,7 @@ "afqinsight[torch]`, or by separately installing these packages with " "`pip install torch`." ) -torch, HAS_TORCH, _ = optional_package("torch", torch_msg) +torch, HAS_TORCH, _ = optional_package("torch", trip_msg=torch_msg) tf_msg = ( "To use AFQ-Insight's tensorflow classes, you will need to have tensorflow " @@ -30,7 +31,7 @@ "afqinsight[tensorflow]`, or by separately installing these packages with " "`pip install tensorflow`." 
) -tf, _, _ = optional_package("tensorflow", tf_msg) +tf, _, _ = optional_package("tensorflow", trip_msg=tf_msg) __all__ = ["AFQDataset", "load_afq_data", "bundles2channels"] _DATA_DIR = op.join(op.expanduser("~"), ".cache", "afq-insight") @@ -310,7 +311,7 @@ def load_afq_data( else: classes = None - y = np.squeeze(y.to_numpy()) + y = np.squeeze(y.to_numpy().astype(float)) return AFQData( X=X, @@ -663,7 +664,7 @@ def from_study(study, verbose=None): "weston-havens": dict(dwi_metrics=["md", "fa"], target_cols=["Age"]), "hbn": dict( dwi_metrics=["dki_md", "dki_fa"], - target_cols=["age", "sex", "scan_site_id"], + target_cols=["age", "sex", "scan_site_id", "dl_qc_score"], label_encode_cols=["sex", "scan_site_id"], index_col="subject_id", ), @@ -705,9 +706,11 @@ def __getitem__(self, indices): target_cols=self.target_cols, group_names=self.group_names, subjects=np.array(self.subjects)[indices].tolist(), - sessions=np.array(self.sessions)[indices].tolist() - if self.sessions is not None - else None, + sessions=( + np.array(self.sessions)[indices].tolist() + if self.sessions is not None + else None + ), classes=self.classes, ) @@ -763,7 +766,7 @@ def drop_target_na(self): This method modifies the ``X``, ``y``, and ``subjects`` attributes in-place. """ if self.y is not None: - nan_mask = np.isnan(self.y) + nan_mask = np.isnan(self.y.astype(float)) if len(self.y.shape) > 1: nan_mask = nan_mask.astype(int).sum(axis=1).astype(bool) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index a01ae4d7..61637c0b 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -9,12 +9,12 @@ "tensorflow`." 
) -tf, has_tf, _ = optional_package("tensorflow", keras_msg) +tf, has_tf, _ = optional_package("tensorflow", trip_msg=keras_msg) if has_tf: from tensorflow.keras.models import Model - from tensorflow.keras.layers import Dense, Flatten, Dropout, Input - from tensorflow.keras.layers import MaxPooling1D, Conv1D + from tensorflow.keras.layers import Dense, Flatten, Dropout, Input, Reshape + from tensorflow.keras.layers import MaxPooling1D, Conv1D, Conv1DTranspose from tensorflow.keras.layers import LSTM, Bidirectional from tensorflow.keras.layers import ( BatchNormalization, @@ -23,7 +23,10 @@ concatenate, Activation, add, + Layer, ) + from tensorflow.keras.losses import binary_crossentropy + else: # Since all model building functions start with Input, we make Input the # tripwire instance for cases where tensorflow is not installed. @@ -307,3 +310,162 @@ def cnn_resnet(input_shape, n_classes, output_activation="softmax", verbose=Fals model.summary() return model + + +def fc_autoencoder(input_shape, encoding_dim=None, verbose=False): + """ + Fully connected autoencoder + """ + ip = Input(shape=input_shape) + if encoding_dim is None: + encoding_dim = (input_shape[0] * input_shape[1]) // 8 + + fc = Flatten()(ip) + fc = Dense(input_shape[0] * input_shape[1], activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + fc = Dense(encoding_dim, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + pre_out = Dense((input_shape[0] * input_shape[1]))(fc) + out = Reshape(input_shape)(pre_out) + + model = Model([ip], [out]) + if verbose: + model.summary() + return model + + +def cnn_autoencoder(input_shape, encoding_dim=8, verbose=False): + """ + Convolutional autoencoder + """ + ip = Input(shape=input_shape) + # Encoder + x = Conv1D(32, 3, 
activation="relu", padding="same")(ip) + x = MaxPooling1D(2, padding="same")(x) + x = Conv1D(16, 3, activation="relu", padding="same")(x) + x = MaxPooling1D(2, padding="same")(x) + shape = x.shape + # Latent + x = Flatten()(x) + x = Dense(encoding_dim, activation="relu")(x) + # Decoder + x = Reshape(shape[1:])(Dense(shape[1] * shape[2], activation="relu")(x)) + x = Conv1DTranspose(32, 3, strides=2, activation="relu", padding="same")(x) + x = Conv1DTranspose(16, 3, strides=2, activation="relu", padding="same")(x) + x = Conv1DTranspose(1, 3, activation="sigmoid", padding="same")(x) + + model = Model([ip], [x]) + if verbose: + model.summary() + + return model + + +class _Sampling(Layer): + """ + Sample the latent layer of a VAE + """ + + def call(self, inputs): + z_mean, z_log_var = inputs + batch = tf.shape(z_mean)[0] + dim = tf.shape(z_mean)[1] + epsilon = tf.random.normal(shape=(batch, dim)) + return z_mean + tf.exp(0.5 * z_log_var) * epsilon + + +def _fc_vae_encoder(input_shape, encoding_dim=None, verbose=False): + """ + Encoder section for a fully connected variational autoencoder + """ + ip = Input(shape=input_shape) + + if encoding_dim is None: + encoding_dim = (input_shape[0] * input_shape[1]) // 8 + + fc = Flatten()(ip) + fc = Dense(input_shape[0] * input_shape[1], activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + + z_mean = Dense(encoding_dim, name="z_mean")(fc) + z_log_var = Dense(encoding_dim, name="z_log_var")(fc) + z = _Sampling()([z_mean, z_log_var]) + return Model([ip], [z_mean, z_log_var, z], name="encoder") + + +def _fc_vae_decoder(input_shape, encoding_dim=None, verbose=False): + """ + Decoder section for a fully connected variational autoencoder + """ + ip = Input(shape=(encoding_dim,)) + fc = Flatten()(ip) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + pre_out =
Dense((input_shape[0] * input_shape[1]))(fc) + out = Reshape(input_shape)(pre_out) + return Model([ip], [out], name="decoder") + + +class _VAE(Model): + """ + A variational autoencoder class + """ + + def __init__(self, encoder, decoder, **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss") + self.reconstruction_loss_tracker = tf.keras.metrics.Mean( + name="reconstruction_loss" + ) + self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss") + + @property + def metrics(self): + return [ + self.total_loss_tracker, + self.reconstruction_loss_tracker, + self.kl_loss_tracker, + ] + + def call(self, inputs): + z_mean, z_log_var, z = self.encoder(inputs) + reconstructed = self.decoder(z) + return reconstructed + + def train_step(self, data): + with tf.GradientTape() as tape: + z_mean, z_log_var, z = self.encoder(data) + reconstruction = self.decoder(z) + reconstruction_loss = tf.reduce_mean( + tf.reduce_sum(binary_crossentropy(data, reconstruction), axis=1) + ) + kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)) + kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1)) + total_loss = reconstruction_loss + kl_loss + grads = tape.gradient(total_loss, self.trainable_weights) + self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) + self.total_loss_tracker.update_state(total_loss) + self.reconstruction_loss_tracker.update_state(reconstruction_loss) + self.kl_loss_tracker.update_state(kl_loss) + return { + "loss": self.total_loss_tracker.result(), + "reconstruction_loss": self.reconstruction_loss_tracker.result(), + "kl_loss": self.kl_loss_tracker.result(), + } + + +def fc_vae(input_shape, encoding_dim=None, verbose=False): + """ + Fully connected variational autoencoder. 
+ """ + if encoding_dim is None: + encoding_dim = (input_shape[0] * input_shape[1]) // 8 + + encoder = _fc_vae_encoder(input_shape, encoding_dim, verbose) + decoder = _fc_vae_decoder(input_shape, encoding_dim, verbose) + return _VAE(encoder, decoder) diff --git a/afqinsight/pipeline.py b/afqinsight/pipeline.py index c60179b6..ad5e8a6b 100755 --- a/afqinsight/pipeline.py +++ b/afqinsight/pipeline.py @@ -1,4 +1,5 @@ """sklearn-compatible pipelines for AFQ data.""" + import inspect import groupyr as gpr diff --git a/afqinsight/tests/test_bagging.py b/afqinsight/tests/test_bagging.py index a89fdd7a..2a12f40b 100644 --- a/afqinsight/tests/test_bagging.py +++ b/afqinsight/tests/test_bagging.py @@ -7,17 +7,16 @@ import numpy as np import joblib +import pytest from afqinsight._serial_bagging import SerialBaggingClassifier, SerialBaggingRegressor from sklearn.base import BaseEstimator -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raise_message +from numpy.testing import assert_array_equal +from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_raises +from numpy.testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.dummy import DummyClassifier, DummyRegressor @@ -213,7 +212,6 @@ def fit(self, X, y): X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in parameter_sets: - # Trained on sparse format sparse_classifier = SerialBaggingRegressor( base_estimator=CustomSVR(), random_state=1, **params @@ -242,6 +240,9 @@ def fit(self, X, y): self.training_size_ = X.shape[0] self.training_hash_ = joblib.hash(X) + def predict(self, X): + return np.zeros(X.shape[0]) + def test_bootstrap_samples(): # Test that 
bootstrapping samples generate non-perfect base estimators. @@ -505,15 +506,14 @@ def test_parallel_classification(): assert_array_almost_equal(decisions1, decisions2) X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1)))) - assert_raise_message( + with pytest.raises( ValueError, - "Number of features of the model " + match="Number of features of the model " "must match the input. Model n_features is {0} " "and input n_features is {1} " "".format(X_test.shape[1], X_err.shape[1]), - ensemble.decision_function, - X_err, - ) + ): + ensemble.decision_function(X_err) ensemble = SerialBaggingClassifier( SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0 @@ -690,14 +690,11 @@ def test_warm_start_equal_n_estimators(): y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1.0 + with pytest.warns( + UserWarning, match="Warm-start fitting without increasing n_estimators does not" + ): + clf.fit(X_train, y_train) - assert_warns_message( - UserWarning, - "Warm-start fitting without increasing n_estimators does not", - clf.fit, - X_train, - y_train, - ) assert_array_equal(y_pred, clf.predict(X_test)) diff --git a/afqinsight/tests/test_cnn.py b/afqinsight/tests/test_cnn.py deleted file mode 100644 index 10898ffd..00000000 --- a/afqinsight/tests/test_cnn.py +++ /dev/null @@ -1,212 +0,0 @@ -import afqinsight as afqi -import os.path as op -import pytest -import tempfile - -from afqinsight.cnn import CNN -from afqinsight.datasets import load_afq_data - -data_path = op.join(afqi.__path__[0], "data") -test_data_path = op.join(data_path, "test_data") - -X, y, groups, feature_names, group_names, subjects, _, _ = load_afq_data( - fn_nodes=op.join(test_data_path, "nodes.csv"), - fn_subjects=op.join(test_data_path, "subjects.csv"), - target_cols=["test_class"], - label_encode_cols=["test_class"], -) - - -def test_basic_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN(100, 6, 5, project_name="test-project", directory=tdir) - model.fit(X, y) - assert
model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - -def test_hyperband_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN( - 100, 6, 5, 64, "hyperband", project_name="test-project", directory=tdir - ) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - model2 = CNN( - 100, 6, 5, 64, "hyperband", 4, project_name="test-project", directory=tdir - ) - model2.fit(X, y) - assert model2.is_fitted_ is True - y_hat2 = model2.predict(X) - _ = model2.score(y, y_hat2) - - model3 = CNN( - 100, - 6, - 5, - 64, - "hyperband", - 4, - 0.3, - project_name="test-project", - directory=tdir, - ) - model3.fit(X, y) - assert model3.is_fitted_ is True - y_hat3 = model3.predict(X) - _ = model3.score(y, y_hat3) - - model4 = CNN( - 100, - 6, - 5, - 64, - "hyperband", - 4, - 0.3, - factor=2, - hyperband_iterations=2, - seed=2, - project_name="test-project", - directory=tdir, - ) - model4.fit(X, y) - assert model4.is_fitted_ is True - y_hat4 = model4.predict(X) - _ = model4.score(y, y_hat4) - - -def test_bayesian_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN(100, 6, 5, 64, "bayesian", directory=tdir) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - model2 = CNN(100, 6, 5, 64, "bayesian", 4, directory=tdir) - model2.fit(X, y) - assert model2.is_fitted_ is True - y_hat2 = model2.predict(X) - _ = model2.score(y, y_hat2) - - model3 = CNN(100, 6, 5, 64, "bayesian", 4, 0.3, directory=tdir) - model3.fit(X, y) - assert model3.is_fitted_ is True - y_hat3 = model3.predict(X) - _ = model3.score(y, y_hat3) - - model4 = CNN( - 100, - 6, - 5, - 64, - "bayesian", - 4, - 0.3, - num_initial_points=2, - alpha=0.02, - beta=0.5, - seed=5, - directory=tdir, - ) - model4.fit(X, y) - assert model4.is_fitted_ is True - y_hat4 = model4.predict(X) - _ = model4.score(y, y_hat4) - - -def test_random_cnn(): - with 
tempfile.TemporaryDirectory() as tdir: - model = CNN(100, 6, 5, 64, "random", directory=tdir) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - model2 = CNN(100, 6, 5, 64, "random", 4, directory=tdir) - model2.fit(X, y) - assert model2.is_fitted_ is True - y_hat2 = model2.predict(X) - _ = model2.score(y, y_hat2) - - model3 = CNN(100, 6, 5, 64, "random", 4, 0.3, directory=tdir) - model3.fit(X, y) - assert model3.is_fitted_ is True - y_hat3 = model3.predict(X) - _ = model3.score(y, y_hat3) - - model4 = CNN( - 100, 6, 5, 64, "random", 4, 0.3, impute_strategy="mean", directory=tdir - ) - model4.fit(X, y) - assert model4.is_fitted_ is True - y_hat4 = model4.predict(X) - _ = model4.score(y, y_hat4) - - -def test_fail_cnn(): - - with pytest.raises(ValueError): - # passing in wrong shape of X (not 2d): - model = CNN(100, 6, 5, 64) - model.fit(X.reshape((7, 100, -1)), y) - - with pytest.raises(ValueError): - # passing in wrong tuner value - model = CNN(100, 6, 5, 64, "wrong") - model.fit(X, y) - - with pytest.raises(TypeError): - # passing in int for tuner - model = CNN(100, 6, 5, 64, 0) - - with pytest.raises(ValueError): - # passing in n_nodes and n_channels that multiply to equal - # proper dimension for given x - model = CNN(78, 6, 5, 64, "random") - model.fit(X, y) - - with pytest.raises(TypeError): - # passing in float for tuner_type - model = CNN(100, 6, 5, 64, 0.0) - - with pytest.raises(TypeError): - # passing in float for n_nodes - model = CNN(1.1, 6, 5, 64, "random") - - with pytest.raises(TypeError): - # passing in float for n_channels - model = CNN(100, 6.0, 5, 64, "random") - - with pytest.raises(TypeError): - # passing in float for layers - model = CNN(100, 6, layers=5.0) - - with pytest.raises(TypeError): - # passing in float for batch size - model = CNN(100, 6, 5, 6.4, "random") - - with pytest.raises(TypeError): - # passing in string for batch size - model = CNN(100, 6, 5, "64", "random") - - 
with pytest.raises(TypeError): - # passing in an integer for test_size - model = CNN(100, 6, test_size=20) - - with pytest.raises(TypeError): - # passing in an integer for impute_strategy (this should be a string). - model = CNN(100, 6, impute_strategy=20) - - with pytest.raises(ValueError): - # passing in the wrong string for impute_strategy: - model = CNN(100, 6, impute_strategy="foo") - - with pytest.raises(TypeError): - # passing in a string for random_state (should be int or RandomState). - model = CNN(100, 6, random_state="foo") diff --git a/afqinsight/tests/test_datasets.py b/afqinsight/tests/test_datasets.py index 7a807a96..b6ac9e5f 100644 --- a/afqinsight/tests/test_datasets.py +++ b/afqinsight/tests/test_datasets.py @@ -536,9 +536,9 @@ def test_load_afq_data(dwi_metrics): ) means_ref = ( - nodes.groupby(["subjectID", "tractID"]) + nodes.drop(["nodeID", "sessionID"], axis="columns") + .groupby(["subjectID", "tractID"]) .agg("mean") - .drop("nodeID", axis="columns") .unstack("tractID") ) assert np.allclose(X, means_ref.to_numpy(), equal_nan=True) # nosec diff --git a/afqinsight/transform.py b/afqinsight/transform.py index d40075a6..689a61eb 100755 --- a/afqinsight/transform.py +++ b/afqinsight/transform.py @@ -1,4 +1,5 @@ """Transform AFQ data.""" + import numpy as np import pandas as pd from collections import OrderedDict diff --git a/pyproject.toml b/pyproject.toml index 24e23566..0e5fdce1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -target-version = ['py38'] +target-version = ['py310'] extend-exclude = ''' ( @@ -22,6 +22,7 @@ extend-exclude = ''' | \.venv | afqinsight.egg-info | doc + | examples | build | dist )/ diff --git a/setup.cfg b/setup.cfg index feca5487..74d704a1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,20 +30,21 @@ platforms = OS Independent [options] setup_requires = setuptools_scm -python_requires = >=3.7 +python_requires = >=3.10 install_requires 
= dipy>=1.0.0 - groupyr>=0.2.7 + groupyr>=0.3.3 matplotlib - numpy - pandas>=1.1.0 + numpy==1.23.5 + pandas==2.1.4 requests - seaborn - scikit-learn>=1.0.0 + seaborn==0.13.0 + scikit-learn==1.2.1 sklearn_pandas>=2.0.0 - tables>=3.0.0 + tables==3.9.1 tqdm - statsmodels + statsmodels==0.14.0 + copt==0.9.1 zip_safe = False include_package_data = True packages = find: @@ -66,7 +67,6 @@ dev = pytest-xdist[psutil] pytest s3fs - scipy<=1.7.3 sphinx sphinx-gallery sphinx-panels diff --git a/setup.py b/setup.py index 820a485f..4dc5bfa6 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """Statistical learning for tractometry data, especially within the AFQ software ecosystem.""" + from setuptools import setup import string import os.path as op diff --git a/tox.ini b/tox.ini index f3118213..5f85f388 100644 --- a/tox.ini +++ b/tox.ini @@ -7,18 +7,17 @@ isolated_build = True usedevelop = True deps = dipy>=1.0.0 - groupyr==0.2.7 + groupyr==0.3.3 h5py>=3.0.0 keras-tuner matplotlib - numpy + numpy<2 pandas>=1.1.0 pytest pytest-cov pytest-xdist[psutil] requests - scikit-learn>=1.0.0 - scipy<=1.7.3 + scikit-learn==1.2.1 seaborn setuptools_scm sklearn_pandas>=2.0.0