From 8589218a63c01d71c08129f354ac8f52bf19a5bc Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Tue, 21 Mar 2023 15:53:10 -0700 Subject: [PATCH 01/33] Update black formatting for a few files. --- .pre-commit-config.yaml | 2 +- afqinsight/cnn.py | 1 - afqinsight/tests/test_bagging.py | 1 - afqinsight/tests/test_cnn.py | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b17d1dbd..3e6fa4bc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 22.3.0 hooks: - id: black - language_version: python3.8 + language_version: python diff --git a/afqinsight/cnn.py b/afqinsight/cnn.py index c32207af..16f03fd3 100644 --- a/afqinsight/cnn.py +++ b/afqinsight/cnn.py @@ -141,7 +141,6 @@ def __init__( project_name=None, **tuner_kwargs, ): - self.tuner_type = tuner_type self.layers = layers self.input_shape = input_shape diff --git a/afqinsight/tests/test_bagging.py b/afqinsight/tests/test_bagging.py index a89fdd7a..4612e1db 100644 --- a/afqinsight/tests/test_bagging.py +++ b/afqinsight/tests/test_bagging.py @@ -213,7 +213,6 @@ def fit(self, X, y): X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in parameter_sets: - # Trained on sparse format sparse_classifier = SerialBaggingRegressor( base_estimator=CustomSVR(), random_state=1, **params diff --git a/afqinsight/tests/test_cnn.py b/afqinsight/tests/test_cnn.py index 10898ffd..8899d912 100644 --- a/afqinsight/tests/test_cnn.py +++ b/afqinsight/tests/test_cnn.py @@ -150,7 +150,6 @@ def test_random_cnn(): def test_fail_cnn(): - with pytest.raises(ValueError): # passing in wrong shape of X (not 2d): model = CNN(100, 6, 5, 64) From 03173f37c41898ca5274ce8487ea93f76ca76a68 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 12:26:00 -0800 Subject: [PATCH 02/33] Upgrade python version support. Based on https://scientific-python.org/specs/spec-0000/ --- .github/workflows/test.yml | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index aff5cdcb..42208a99 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: ["3.10", "3.11", "3.12"] steps: - name: Checkout repo diff --git a/setup.cfg b/setup.cfg index feca5487..1fb0e4e2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ platforms = OS Independent [options] setup_requires = setuptools_scm -python_requires = >=3.7 +python_requires = >=3.10 install_requires = dipy>=1.0.0 groupyr>=0.2.7 @@ -39,7 +39,7 @@ install_requires = pandas>=1.1.0 requests seaborn - scikit-learn>=1.0.0 + scikit-learn==1.2.1 sklearn_pandas>=2.0.0 tables>=3.0.0 tqdm From bee0776f9a5f7dd16645ff2f18fd29654ec2d58d Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 12:28:04 -0800 Subject: [PATCH 03/33] Upgrade docbuild Python version as well. --- .github/workflows/docbuild.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docbuild.yml b/.github/workflows/docbuild.yml index 679bd09a..392444d7 100644 --- a/.github/workflows/docbuild.yml +++ b/.github/workflows/docbuild.yml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: ["3.10", "3.11"] steps: - name: Checkout repo From 6082fea411071a7c8d41737f1c7113c39807f717 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 12:34:27 -0800 Subject: [PATCH 04/33] Support for 3.12 will have to wait for groupyr. Which in turn needs to wait for numba, which will happen soon. --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 42208a99..928f10cd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11"] steps: - name: Checkout repo From 3391b212af8556cea33f485020b7a9ae3cd14e7f Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 13:49:56 -0800 Subject: [PATCH 05/33] Pin a few of the dependencies to make sure things install. --- setup.cfg | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 1fb0e4e2..454cb8ff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,12 +38,13 @@ install_requires = numpy pandas>=1.1.0 requests - seaborn + seaborn==0.13.0 scikit-learn==1.2.1 sklearn_pandas>=2.0.0 - tables>=3.0.0 + tables==3.9.1 tqdm - statsmodels + statsmodels==0.14.0 + copt==0.9.1 zip_safe = False include_package_data = True packages = find: From cb1c7133b63b47a182b3c19d22c0e05af926cc89 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 13:58:18 -0800 Subject: [PATCH 06/33] Pin a more advanced pandas, don't pin scipy. --- setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 454cb8ff..040e7775 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,7 @@ install_requires = groupyr>=0.2.7 matplotlib numpy - pandas>=1.1.0 + pandas==2.1.4 requests seaborn==0.13.0 scikit-learn==1.2.1 @@ -67,7 +67,6 @@ dev = pytest-xdist[psutil] pytest s3fs - scipy<=1.7.3 sphinx sphinx-gallery sphinx-panels From 28c43686785d551549d4aa02990675a945b56698 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 18 Jan 2024 11:23:42 -0800 Subject: [PATCH 07/33] Be explicit about trip_msg input for optional_pkg. This is now a requirement of the API in dipy. --- afqinsight/cnn.py | 4 ++-- afqinsight/datasets.py | 4 ++-- afqinsight/nn/tf_models.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/afqinsight/cnn.py b/afqinsight/cnn.py index 16f03fd3..4027bc0f 100644 --- a/afqinsight/cnn.py +++ b/afqinsight/cnn.py @@ -18,8 +18,8 @@ "with `pip install tensorflow keras-tuner`." ) -kt, _, _ = optional_package("keras_tuner", keras_msg) -tf, has_tf, _ = optional_package("tensorflow", keras_msg) +kt, _, _ = optional_package("keras_tuner", trip_msg=keras_msg) +tf, has_tf, _ = optional_package("tensorflow", trip_msg=keras_msg) if has_tf: from tensorflow.keras.models import Sequential diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index d75ba20b..690b2f63 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -22,7 +22,7 @@ "afqinsight[torch]`, or by separately installing these packages with " "`pip install torch`." ) -torch, HAS_TORCH, _ = optional_package("torch", torch_msg) +torch, HAS_TORCH, _ = optional_package("torch", trip_msg=torch_msg) tf_msg = ( "To use AFQ-Insight's tensorflow classes, you will need to have tensorflow " @@ -30,7 +30,7 @@ "afqinsight[tensorflow]`, or by separately installing these packages with " "`pip install tensorflow`." ) -tf, _, _ = optional_package("tensorflow", tf_msg) +tf, _, _ = optional_package("tensorflow", trip_msg=tf_msg) __all__ = ["AFQDataset", "load_afq_data", "bundles2channels"] _DATA_DIR = op.join(op.expanduser("~"), ".cache", "afq-insight") diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index a01ae4d7..e685170c 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -9,7 +9,7 @@ "tensorflow`." ) -tf, has_tf, _ = optional_package("tensorflow", keras_msg) +tf, has_tf, _ = optional_package("tensorflow", trip_msg=keras_msg) if has_tf: from tensorflow.keras.models import Model From 6cb6148c73c694a8a3c499ed9bb8d97fe7f1d138 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 9 Feb 2024 08:09:25 -0800 Subject: [PATCH 08/33] Upgrade to newest groupyr. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 040e7775..8eddacfe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ setup_requires = python_requires = >=3.10 install_requires = dipy>=1.0.0 - groupyr>=0.2.7 + groupyr>=0.3.2 matplotlib numpy pandas==2.1.4 From ef6cd1f9cc9bc00558c335ed3158900d70f556ad Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 9 Feb 2024 08:13:48 -0800 Subject: [PATCH 09/33] Pin numpy version under 2.0 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 8eddacfe..7162ca07 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,7 +35,7 @@ install_requires = dipy>=1.0.0 groupyr>=0.3.2 matplotlib - numpy + numpy<2 pandas==2.1.4 requests seaborn==0.13.0 From 87f1e4e3a6234691375360b61ced1acbafab0feb Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 11 Feb 2024 22:21:30 -0800 Subject: [PATCH 10/33] Make sure this is the right type. --- afqinsight/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index 690b2f63..bbf82670 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -763,7 +763,7 @@ def drop_target_na(self): This method modifies the ``X``, ``y``, and ``subjects`` attributes in-place. """ if self.y is not None: - nan_mask = np.isnan(self.y) + nan_mask = np.isnan(self.y.astype(float)) if len(self.y.shape) > 1: nan_mask = nan_mask.astype(int).sum(axis=1).astype(bool) From 576cdc1b1c96c28cf9b673f880573e54a6889552 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 21:45:53 -0800 Subject: [PATCH 11/33] Upgrade to new groupyr release (0.3.3). --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 7162ca07..83cef3a4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ setup_requires = python_requires = >=3.10 install_requires = dipy>=1.0.0 - groupyr>=0.3.2 + groupyr>=0.3.3 matplotlib numpy<2 pandas==2.1.4 From 7e3996ce8d244a13c0d381421b3c2f1acf0eede2 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 21:56:09 -0800 Subject: [PATCH 12/33] Update black setup. --- .pre-commit-config.yaml | 2 +- pyproject.toml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3e6fa4bc..2e49df86 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/python/black - rev: 22.3.0 + rev: 24.2.0 hooks: - id: black language_version: python diff --git a/pyproject.toml b/pyproject.toml index 24e23566..0e5fdce1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -target-version = ['py38'] +target-version = ['py310'] extend-exclude = ''' ( @@ -22,6 +22,7 @@ extend-exclude = ''' | \.venv | afqinsight.egg-info | doc + | examples | build | dist )/ From f47c250c45c138feceedf16239e5a5ea4bcefc78 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 22:03:33 -0800 Subject: [PATCH 13/33] Linting. --- afqinsight/transform.py | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/afqinsight/transform.py b/afqinsight/transform.py index d40075a6..689a61eb 100755 --- a/afqinsight/transform.py +++ b/afqinsight/transform.py @@ -1,4 +1,5 @@ """Transform AFQ data.""" + import numpy as np import pandas as pd from collections import OrderedDict diff --git a/setup.py b/setup.py index 820a485f..4dc5bfa6 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """Statistical learning for tractometry data, especially within the AFQ software ecosystem.""" + from setuptools import setup import string import os.path as op From 02c236ddfe276f302ff2f6235052c5ccdf674839 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 22:08:06 -0800 Subject: [PATCH 14/33] More linting. --- afqinsight/__init__.py | 1 + afqinsight/_serial_bagging.py | 1 + afqinsight/augmentation/__init__.py | 1 + afqinsight/augmentation/augmentation.py | 1 + afqinsight/augmentation/dtw.py | 1 + afqinsight/datasets.py | 9 ++++++--- afqinsight/pipeline.py | 1 + 7 files changed, 12 insertions(+), 3 deletions(-) diff --git a/afqinsight/__init__.py b/afqinsight/__init__.py index 2a2b1873..c0b40cf4 100755 --- a/afqinsight/__init__.py +++ b/afqinsight/__init__.py @@ -1,4 +1,5 @@ """AFQ-Insight is a Python library for statistical learning of tractometry data.""" + from . import datasets # noqa from . import utils # noqa from .cross_validate import * # noqa diff --git a/afqinsight/_serial_bagging.py b/afqinsight/_serial_bagging.py index fc46b166..c706e6c4 100644 --- a/afqinsight/_serial_bagging.py +++ b/afqinsight/_serial_bagging.py @@ -9,6 +9,7 @@ parallelism when using a dask.distributed backend, I will gladly remove this private module. @richford """ + import itertools import numbers import numpy as np diff --git a/afqinsight/augmentation/__init__.py b/afqinsight/augmentation/__init__.py index b40f6984..69033fef 100644 --- a/afqinsight/augmentation/__init__.py +++ b/afqinsight/augmentation/__init__.py @@ -13,4 +13,5 @@ augmentation for time series classification with neural networks," PLOS ONE 16(7): e0254841. DOI: https://doi.org/10.1371/journal.pone.0254841 """ + from .augmentation import * # noqa: F401,F403 diff --git a/afqinsight/augmentation/augmentation.py b/afqinsight/augmentation/augmentation.py index 57131646..916cbb9e 100644 --- a/afqinsight/augmentation/augmentation.py +++ b/afqinsight/augmentation/augmentation.py @@ -13,6 +13,7 @@ augmentation for time series classification with neural networks," PLOS ONE 16(7): e0254841. DOI: https://doi.org/10.1371/journal.pone.0254841 """ + import numpy as np from tqdm import tqdm diff --git a/afqinsight/augmentation/dtw.py b/afqinsight/augmentation/dtw.py index 4007c904..4d99519a 100644 --- a/afqinsight/augmentation/dtw.py +++ b/afqinsight/augmentation/dtw.py @@ -11,6 +11,7 @@ augmentation for time series classification with neural networks," PLOS ONE 16(7): e0254841. DOI: https://doi.org/10.1371/journal.pone.0254841 """ + import numpy as np import sys diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index bbf82670..acc7ef33 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -1,4 +1,5 @@ """Generate samples of synthetic data sets or extract AFQ data.""" + import hashlib import numpy as np import os @@ -705,9 +706,11 @@ def __getitem__(self, indices): target_cols=self.target_cols, group_names=self.group_names, subjects=np.array(self.subjects)[indices].tolist(), - sessions=np.array(self.sessions)[indices].tolist() - if self.sessions is not None - else None, + sessions=( + np.array(self.sessions)[indices].tolist() + if self.sessions is not None + else None + ), classes=self.classes, ) diff --git a/afqinsight/pipeline.py b/afqinsight/pipeline.py index c60179b6..ad5e8a6b 100755 --- a/afqinsight/pipeline.py +++ b/afqinsight/pipeline.py @@ -1,4 +1,5 @@ """sklearn-compatible pipelines for AFQ data.""" + import inspect import groupyr as gpr From 17ce505a8918eb22529eb6899b3f60451ec6418c Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 22:16:39 -0800 Subject: [PATCH 15/33] Update testing tox.ini environment. --- tox.ini | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index f3118213..fec7d946 100644 --- a/tox.ini +++ b/tox.ini @@ -7,17 +7,17 @@ isolated_build = True usedevelop = True deps = dipy>=1.0.0 - groupyr==0.2.7 + groupyr==0.3.3 h5py>=3.0.0 keras-tuner matplotlib - numpy + numpy<2 pandas>=1.1.0 pytest pytest-cov pytest-xdist[psutil] requests - scikit-learn>=1.0.0 + scikit-learn>=1.2.1 scipy<=1.7.3 seaborn setuptools_scm From 1b88ac6d806d27b1233f1bfaf39afeaed64d52f3 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 22:27:37 -0800 Subject: [PATCH 16/33] Try to get a reasonable scipy by pinning sklearn. --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index fec7d946..5f85f388 100644 --- a/tox.ini +++ b/tox.ini @@ -17,8 +17,7 @@ deps = pytest-cov pytest-xdist[psutil] requests - scikit-learn>=1.2.1 - scipy<=1.7.3 + scikit-learn==1.2.1 seaborn setuptools_scm sklearn_pandas>=2.0.0 From 707fabdb3677d591e912ffe04a663c68d55dd505 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 07:11:22 -0800 Subject: [PATCH 17/33] Avoid tox. --- .github/workflows/test.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 928f10cd..4b5e2dfd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,12 +30,11 @@ jobs: pydocstyle - name: Test run: | - cp $(python -c 'import site; print(site.getsitepackages()[0])')/afqinsight/_version.py afqinsight/_version.py - tox + cd && mkdir for_test && cd for_test && pytest --pyargs AFQ --cov-report term-missing --cov=AFQ - name: Coveralls run: | coveralls - if: matrix.python-version == 3.8 + if: matrix.python-version == 3.10 env: COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b0fe8907c627d7751a928124c2e167984413835b Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 07:20:31 -0800 Subject: [PATCH 18/33] Fix copy-paste artifacts. --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4b5e2dfd..3783f73e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: pydocstyle - name: Test run: | - cd && mkdir for_test && cd for_test && pytest --pyargs AFQ --cov-report term-missing --cov=AFQ + cd && mkdir for_test && cd for_test && pytest --pyargs afqinsight --cov-report term-missing --cov=afqinsight - name: Coveralls run: | coveralls From 9d0772e4e01bca8c6405a8c4a3db0947b0f8d4dc Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 09:35:04 -0800 Subject: [PATCH 19/33] Replaces sklearn testing functions with pytest/numpy. --- afqinsight/tests/test_bagging.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/afqinsight/tests/test_bagging.py b/afqinsight/tests/test_bagging.py index 4612e1db..1bfe1d9e 100644 --- a/afqinsight/tests/test_bagging.py +++ b/afqinsight/tests/test_bagging.py @@ -7,17 +7,16 @@ import numpy as np import joblib +import pytest from afqinsight._serial_bagging import SerialBaggingClassifier, SerialBaggingRegressor from sklearn.base import BaseEstimator -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raise_message +from numpy.testing import assert_array_equal +from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_raises +from numpy.testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.dummy import DummyClassifier, DummyRegressor @@ -504,15 +503,14 @@ def test_parallel_classification(): assert_array_almost_equal(decisions1, decisions2) X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1)))) - assert_raise_message( + with pytest.raises( ValueError, "Number of features of the model " "must match the input. Model n_features is {0} " "and input n_features is {1} " "".format(X_test.shape[1], X_err.shape[1]), - ensemble.decision_function, - X_err, - ) + ): + ensemble.decision_function(X_err) ensemble = SerialBaggingClassifier( SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0 @@ -689,14 +687,11 @@ def test_warm_start_equal_n_estimators(): y_pred = clf.predict(X_test) # modify X to nonsense values, this should not change anything X_train += 1.0 + with pytest.warns( + UserWarning, match="Warm-start fitting without increasing n_estimators does not" + ): + clf.fit(X_train, y_train) - assert_warns_message( - UserWarning, - "Warm-start fitting without increasing n_estimators does not", - clf.fit, - X_train, - y_train, - ) assert_array_equal(y_pred, clf.predict(X_test)) From 0a7e9ea0abf2d744c6f36304b0ac8f9dbf4aec0e Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 09:57:21 -0800 Subject: [PATCH 20/33] Remove CNN module that is no longer used. We are using afqinsight/nn/* instead. --- afqinsight/cnn.py | 620 ----------------------------------- afqinsight/tests/test_cnn.py | 211 ------------ 2 files changed, 831 deletions(-) delete mode 100644 afqinsight/cnn.py delete mode 100644 afqinsight/tests/test_cnn.py diff --git a/afqinsight/cnn.py b/afqinsight/cnn.py deleted file mode 100644 index 4027bc0f..00000000 --- a/afqinsight/cnn.py +++ /dev/null @@ -1,620 +0,0 @@ -"""Build, fit, and predict with 1-D convolutional neural networks.""" - -import functools -import numpy as np -import os.path as op -import tempfile - -from dipy.utils.optpkg import optional_package -from sklearn.impute import SimpleImputer -from sklearn.metrics import r2_score -from sklearn.model_selection import train_test_split -from sklearn.utils.validation import check_X_y, check_is_fitted - -keras_msg = ( - "To use afqinsight's convolutional neural nets for tractometry data, you will need " - "to have tensorflow and kerastuner installed. You can do this by installing " - "afqinsight with `pip install afqinsight[tf]`, or by separately installing these packages " - "with `pip install tensorflow keras-tuner`." -) - -kt, _, _ = optional_package("keras_tuner", trip_msg=keras_msg) -tf, has_tf, _ = optional_package("tensorflow", trip_msg=keras_msg) - -if has_tf: - from tensorflow.keras.models import Sequential - from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPool1D, Dropout - from tensorflow.keras.callbacks import ModelCheckpoint - - -def build_model(hp, conv_layers, input_shape): - """Build a keras model. - - Uses keras tuner to build model - can control # layers, # filters in each layer, kernel size, - regularization etc - - Parameters - ---------- - hp : tensorflow.keras.HyperParameters() - Hyperparameters class from which to sample hyperparameters - - conv_layers : int - number of layers (one layer is Conv and MaxPool) in the sequential model. - - input_shape : int - input shape of X so the model gets built continuously as you are adding layers - - Returns - ------- - model : tensorflow.keras.Model - compiled model that uses hyperparameters defined inline to hypertune the model - - """ - model = Sequential() - model.add( - Conv1D( - filters=hp.Int("init_conv_filters", min_value=32, max_value=512, step=32), - kernel_size=hp.Int("init_conv_kernel", min_value=1, max_value=4, step=1), - activation="relu", - input_shape=input_shape, - ) - ) - - for i in range(conv_layers - 1): - model.add( - Conv1D( - filters=hp.Int( - "conv_filters" + str(i), min_value=32, max_value=512, step=32 - ), - kernel_size=hp.Int( - "conv_kernel" + str(i), min_value=1, max_value=4, step=1 - ), - activation="relu", - ) - ) - - model.add(MaxPool1D(pool_size=2, padding="same")) - - model.add(Dropout(0.25)) - model.add(Flatten()) - - dense_filters_2 = hp.Int("dense_filters_2", min_value=32, max_value=512, step=32) - model.add(Dense(dense_filters_2, activation="relu")) - model.add(Dropout(0.25)) - model.add(Dense(64, activation="relu")) - model.add(Dense(1, activation="linear")) - - model.compile( - loss="mean_squared_error", optimizer="adam", metrics=["mean_squared_error"] - ) - - return model - - -class ModelBuilder: - """Build a complex model architecture with the specified number of layers. - - Parameters - ---------- - tuner_type : str or class. - Tuner to use. One of {"hyperband", "bayesian", "random"}. - - input_shape : tuple - Expected shape of the input data. - - layers : int - Number of layers in the model. - - max_epochs : int - Number of epochs to train the model. - - X_test : numpy.ndarray - Test data. - - y_test : numpy.ndarray - Test labels or test values. - - batch_size : int - Batch size to use when training. - - directory : str - Directory to save the model to. - - project_name : str, optional - A string, the name to use as prefix for files saved by the tuner object. Defaults to None - - tuner_kwargs : dict, optional - Keyword arguments to pass to the tuner class on initialization. - Defaults to tuner defaults. - """ - - def __init__( - self, - tuner_type, - input_shape, - layers, - max_epochs, - X_test, - y_test, - batch_size, - directory=None, - project_name=None, - **tuner_kwargs, - ): - self.tuner_type = tuner_type - self.layers = layers - self.input_shape = input_shape - self.max_epochs = max_epochs - self.batch_size = batch_size - self.X_test = X_test - self.y_test = y_test - self.directory = directory - self.project_name = project_name - self.tuner_kwargs = tuner_kwargs - - def _get_tuner(self): - """Call build_model and instantiate a Keras Tuner for the returned model depending on user choice of tuner. - - Returns - ------- - tuner : kerastuner.tuners - BayesianOptimization, Hyperband, or RandomSearch tuner - - """ - # setting parameters beforehand - hypermodel = functools.partial( - build_model, conv_layers=self.layers, input_shape=self.input_shape - ) - if isinstance(self.tuner_type, str): - # instantiating tuner based on user's choice - if self.tuner_type == "hyperband": - tuner = kt.Hyperband( - hypermodel=hypermodel, - objective="mean_squared_error", - max_epochs=10, - overwrite=True, - project_name=self.project_name, - directory=self.directory, - **self.tuner_kwargs, - ) - - elif self.tuner_type == "bayesian": - tuner = kt.BayesianOptimization( - hypermodel=hypermodel, - objective="mean_squared_error", - max_trials=10, - overwrite=True, - project_name=self.project_name, - directory=self.directory, - **self.tuner_kwargs, - ) - - elif self.tuner_type == "random": - tuner = kt.RandomSearch( - hypermodel=hypermodel, - objective="mean_squared_error", - max_trials=10, - overwrite=True, - project_name=self.project_name, - directory=self.directory, - **self.tuner_kwargs, - ) - else: - raise ValueError( - f"tuner parameter expects 'hyperband', 'bayesian', or 'random', but you provided {self.tuner_type}" - ) - return tuner - # We do not cover the following line, because CNN also handles this - # error: - else: # pragma: no cover - raise TypeError( - f"`tuner` parameter should be a string, but you provided {self.tuner_type}" - ) - - def _get_best_weights(self, model, X, y): - """Fit a CNN and save the best weights. - - Use keras ModelCheckpoint to fit CNN and save the weights from the epoch - that produced the lowest validation loss to a temporary file. Uses - temporary file to load the best weights into the CNN model and returns - this best model. - - Parameters - ---------- - model : tensorflow.keras.Sequential() - Hyperparameters class from which to sample hyperparameters - - X : array-like of shape (n_samples, n_features) - The feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - model : tensorflow.keras.Model - fitted keras model with best weights loaded - - """ - weights_path = op.join(tempfile.mkdtemp(), "weights.hdf5") - # making model checkpoint to save best model (# epochs) to file - model_checkpoint_callback = ModelCheckpoint( - filepath=weights_path, - monitor="val_loss", - mode="auto", - save_best_only=True, - save_weights_only=True, - verbose=True, - ) - - # Fitting model using model checkpoint callback to find best model which is saved to 'weights' - model.fit( - X, - y, - epochs=self.max_epochs, - batch_size=self.batch_size, - callbacks=[model_checkpoint_callback], - validation_data=(self.X_test, self.y_test), - ) - # loading in weights - model.load_weights(weights_path) - - # return the model - return model - - def build_basic_model(self, X, y): - """Build a sequential model without hyperparameter tuning. - - Builds a static baseline sequential model with no hyperparameter tuning. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - model : tensorflow.keras.Model - compiled model using basic Weston Havens architecture - - """ - model = Sequential() - model.add(Dense(128, activation="relu", input_shape=X.shape[1:])) - model.add(Conv1D(24, kernel_size=2, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(32, kernel_size=2, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(64, kernel_size=3, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(128, kernel_size=4, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Conv1D(256, kernel_size=4, activation="relu")) - model.add(MaxPool1D(pool_size=2, padding="same")) - model.add(Dropout(0.25)) - model.add(Flatten()) - model.add(Dense(128, activation="relu")) - model.add(Dropout(0.25)) - model.add(Dense(64, activation="relu")) - model.add(Dense(1, activation="linear")) - - model.compile( - loss="mean_squared_error", optimizer="adam", metrics=["mean_squared_error"] - ) - - best_model = self._get_best_weights(model, X, y) - return best_model - - def build_tuned_model(self, X, y): - """Build a tuned model using Keras tuner. - - Initializes a Keras tuner on user's model, searches for best hyperparameters, and saves them. - Then builds "best" model using saved best hyperparameters found during the search and returns model - with best weights loaded from _get_best_weights. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - model : tensorflow.keras.Model - compiled model that uses hyperparameters defined inline to hypertune the model - - """ - # initialize tuner - tuner = self._get_tuner() - - # Find the optimal hyperparameters - tuner.search(X, y, epochs=50, validation_split=0.2) - - # Save the optimal hyperparameters - best_hps = tuner.get_best_hyperparameters(num_trials=1)[0] - - # make CNN model using best hyperparameters - model = tuner.hypermodel.build(best_hps) - - best_model = self._get_best_weights(model, X, y) - return best_model - - -class CNN: - """A Convolutional Neural Network model with a fit/predict interface. - - Parameters - ---------- - n_nodes : int - Number of nodes in each bundle profile. - - n_channels : int - Number of metrics in each bundle profile. - - max_epochs : int - Maximum number of epochs to train model. - - batch_size : int - Number of samples per batch. - - tuner_type : str - Type of hyperparameter tuner to use. One of 'hyperband', 'bayesian', or - 'random'. - - layers : int - Number of convolutional layers to use. - - test_size : float - Fraction of data to use as test set. - - impute_strategy : str, optional - Imputation strategy to use. One of 'mean', 'median', or 'knn'. - Default: "median". - - random_state : int or RandomState instance, optional - Default: None. - - directory : str, optional - Directory to save model and hyperparameters. Default: "." - - project_name : str, optional - A string, the name to use as prefix for files saved by the tuner - object. Defaults to None - - tuner_kwargs : dict, optional - Keyword arguments to pass to tuner. Default: tuner defaults. - """ - - def __init__( - self, - n_nodes, - n_channels, - max_epochs=50, - batch_size=32, - tuner_type=None, - layers=1, - test_size=0.2, - impute_strategy="median", - random_state=None, - directory=None, - project_name=None, - **tuner_kwargs, - ): - # checking n_nodes is passed as int - if not isinstance(n_nodes, int): - raise TypeError("Parameter n_nodes must be an integer.") - else: - self.n_nodes = n_nodes - - # checking n_channels is passed as int - if not isinstance(n_channels, int): - raise TypeError("Parameter n_channels must be an integer.") - else: - self.n_channels = n_channels - - # checking layers is passed as int - if not isinstance(layers, int): - raise TypeError("Parameter layers must be an integer.") - else: - self.layers = layers - - # checking max epochs is passed as int - if not isinstance(max_epochs, int): - raise TypeError("Parameter max_epochs must be an integer.") - else: - self.max_epochs = max_epochs - - if not isinstance(batch_size, int): - raise TypeError("Parameter batch_size must be an integer.") - else: - self.batch_size = batch_size - - # checking tiner is passed as str or None - if not isinstance(tuner_type, str) and tuner_type is not None: - raise TypeError("Parameter tuner must be str.") - else: - # tuner can be None (no tuning) BayesianOptimization, Hyperband, or RandomSearch - self.tuner_type = tuner_type - - # checking val split is passed as float - if not isinstance(test_size, float): - raise TypeError("Parameter test_size must be a float.") - else: - self.test_size = test_size - - # checking strategy is passed as str and has value of 'median', 'mean', or 'knn' - if not isinstance(impute_strategy, str): - raise TypeError("Parameter impute_strategy must be a string.") - elif impute_strategy not in ["median", "mean", "knn"]: - raise ValueError( - f"Parameter impute_strategy must be 'median', 'mean', or 'knn' but you provided {impute_strategy}" - ) - else: - self.impute_strategy = impute_strategy - - if random_state is not None: - if not (isinstance(random_state, int) or isinstance(np.random.RandomState)): - raise TypeError( - f"Parameter random_state must be an int or RandomState, but you provided {random_state}" - ) - self.random_state = random_state - - self.directory = directory - self.project_name = project_name - self.tuner_kwargs = tuner_kwargs - self.model_ = None - self.best_hps_ = None - - def _preprocess(self, X, y=None): - """Convert feature matrix for input into a CNN. - - Masks NAN values for X and y (if y is given), imputes X, and reshapes X - to be in proper form for CNN model. In more conventional machine - learning, X has shape (n_samples, n_features), where n_features is - n_nodes * n_bundles * n_metrics. However, in our CNN approach, we treat - each bundle/metric combination as a separate channel, analogous to RGB - channels in a 2D image. The remaining one dimension is the nodes - dimension. Thus the output has shape (n_samples, n_channels, n_nodes), - where n_channels = n_metrics * n_bundles. - - Parameters - ---------- - X : array-like of shape (n_samples, n_metrics * n_nodes) - Diffusion MRI tractometry features (columns) for each subject in the sample (rows). - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - X : array-like of shape (n_samples, n_channels, n_nodes) - The imputed and reshaped feature samples - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - """ - # n_nodes * n_channels must = X.shape[1] - if self.n_nodes * self.n_channels != X.shape[1]: - raise ValueError( - "The product n_nodes and n_channels is not the correct shape." - ) - - # We don't cover the following line, because this case is also handled - # in the fall to fit: - if len(X.shape) > 2: # pragma: no cover - raise ValueError("Expected X to be a 2D matrix.") - if y is not None: - nan_mask = np.logical_not(np.isnan(y)) - X = X[nan_mask, :] - y = y[nan_mask] - - imp = SimpleImputer(strategy=self.impute_strategy) - X = imp.fit_transform(X) - - if y is not None: - X, y = check_X_y(X, y) - - n_subjects = X.shape[0] - - X = np.swapaxes(X.reshape((n_subjects, self.n_channels, self.n_nodes)), 1, 2) - - if y is not None: - return X, y - else: - return X - - def fit(self, X, y): - """Fit the model. - - Preprocesses X and y, builds CNN model, tunes model hyperparameters and - fits the model to given X and y, using X_test and y_test to validate and - find best weights and hyperparameters. - - Parameters - ---------- - X : array-like of shape (n_samples, n_metrics * n_nodes) - Diffusion MRI tractometry features (columns) for each subject (rows). - - y : array-like of shape (n_samples,) or (n_samples, n_targets) - Target values - - Returns - ------- - self : CNN - updated CNN instantiation - - """ - X, y = self._preprocess(X, y) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=self.test_size, random_state=self.random_state - ) - # CNN gets n_nodes, n_channels, max_epochs, tuner=None, layers=None - # Model Builder takes tuner_type, input_shape, layers, max_epochs, **kwargs - builder = ModelBuilder( - self.tuner_type, - X_train.shape[1:], - self.layers, - self.max_epochs, - X_test, - y_test, - self.batch_size, - self.directory, - self.project_name, - **self.tuner_kwargs, - ) - if self.tuner_type is None: - self.model_ = builder.build_basic_model(X_train, y_train) - else: - self.model_ = builder.build_tuned_model(X_train, y_train) - - self.is_fitted_ = True - - return self - - def predict(self, X): - """Predict target values. - - Preprocesses X and returns predicted y values for X from fitted CNN model. - - Parameters - ---------- - X : array-like of shape (n_samples, n_metrics * n_nodes) - Tractometry features (columns) for each subject in the sample (rows). - - Returns - ------- - pred : array-like of shape (n_samples,) or (n_samples, n_targets) - predicted values - """ - X = self._preprocess(X) - check_is_fitted(self, "is_fitted_") - pred = self.model_.predict(X).squeeze() - return pred - - def score(self, y_test, y_hat): - """Score the performance of the model. - - Masks out NaN values from y_test and returns $R^2$ score for the CNN model comparing to y_hat - - Parameters - ---------- - y_test : array-like of shape (n_samples,) or (n_samples, n_targets) - Testing target values - - y_hat : array-like of shape (n_samples,) or (n_samples, n_targets) - Predicted target values - - Returns - ------- - r2_score : float - r-squared score for y_test and y_hat for CNN model - - """ - nan_mask = np.logical_not(np.isnan(y_test)) - y_test = y_test[nan_mask] - return r2_score(y_test, y_hat) diff --git a/afqinsight/tests/test_cnn.py b/afqinsight/tests/test_cnn.py deleted file mode 100644 index 8899d912..00000000 --- a/afqinsight/tests/test_cnn.py +++ /dev/null @@ -1,211 +0,0 @@ -import afqinsight as afqi -import os.path as op -import pytest -import tempfile - -from afqinsight.cnn import CNN -from afqinsight.datasets import load_afq_data - -data_path = op.join(afqi.__path__[0], "data") -test_data_path = op.join(data_path, "test_data") - -X, y, groups, feature_names, group_names, subjects, _, _ = load_afq_data( - fn_nodes=op.join(test_data_path, "nodes.csv"), - fn_subjects=op.join(test_data_path, "subjects.csv"), - target_cols=["test_class"], - label_encode_cols=["test_class"], -) - - -def test_basic_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN(100, 6, 5, project_name="test-project", directory=tdir) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - -def test_hyperband_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN( - 100, 6, 5, 64, "hyperband", project_name="test-project", directory=tdir - ) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - model2 = CNN( - 100, 6, 5, 64, "hyperband", 4, project_name="test-project", directory=tdir - ) - model2.fit(X, y) - assert model2.is_fitted_ is True - y_hat2 = model2.predict(X) - _ = model2.score(y, y_hat2) - - model3 = CNN( - 100, - 6, - 5, - 64, - "hyperband", - 4, - 0.3, - project_name="test-project", - directory=tdir, - ) - model3.fit(X, y) - assert model3.is_fitted_ is True - y_hat3 = model3.predict(X) - _ = model3.score(y, y_hat3) - - model4 = CNN( - 100, - 6, - 5, - 64, - "hyperband", - 4, - 0.3, - factor=2, - hyperband_iterations=2, - seed=2, - project_name="test-project", - directory=tdir, - ) - model4.fit(X, y) - assert model4.is_fitted_ is True - y_hat4 = model4.predict(X) - _ = model4.score(y, y_hat4) - - -def test_bayesian_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN(100, 6, 5, 64, "bayesian", directory=tdir) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - model2 = CNN(100, 6, 5, 64, "bayesian", 4, directory=tdir) - model2.fit(X, y) - assert model2.is_fitted_ is True - y_hat2 = model2.predict(X) - _ = model2.score(y, y_hat2) - - model3 = CNN(100, 6, 5, 64, "bayesian", 4, 0.3, directory=tdir) - model3.fit(X, y) - assert model3.is_fitted_ is True - y_hat3 = model3.predict(X) - _ = model3.score(y, y_hat3) - - model4 = CNN( - 100, - 6, - 5, - 64, - "bayesian", - 4, - 0.3, - num_initial_points=2, - alpha=0.02, - beta=0.5, - seed=5, - directory=tdir, - ) - model4.fit(X, y) - assert model4.is_fitted_ is True - y_hat4 = model4.predict(X) - _ = model4.score(y, y_hat4) - - -def test_random_cnn(): - with tempfile.TemporaryDirectory() as tdir: - model = CNN(100, 6, 5, 64, "random", directory=tdir) - model.fit(X, y) - assert model.is_fitted_ is True - y_hat = model.predict(X) - _ = model.score(y, y_hat) - - model2 = CNN(100, 6, 5, 64, "random", 4, directory=tdir) - model2.fit(X, y) - assert model2.is_fitted_ is True - y_hat2 = model2.predict(X) - _ = model2.score(y, y_hat2) - - model3 = CNN(100, 6, 5, 64, "random", 4, 0.3, directory=tdir) - model3.fit(X, y) - assert model3.is_fitted_ is True - y_hat3 = model3.predict(X) - _ = model3.score(y, y_hat3) - - model4 = CNN( - 100, 6, 5, 64, "random", 4, 0.3, impute_strategy="mean", directory=tdir - ) - model4.fit(X, y) - assert model4.is_fitted_ is True - y_hat4 = model4.predict(X) - _ = model4.score(y, y_hat4) - - -def test_fail_cnn(): - with pytest.raises(ValueError): - # passing in wrong shape of X (not 2d): - model = CNN(100, 6, 5, 64) - model.fit(X.reshape((7, 100, -1)), y) - - with pytest.raises(ValueError): - # passing in wrong tuner value - model = CNN(100, 6, 5, 64, "wrong") - model.fit(X, y) - - with pytest.raises(TypeError): - # passing in int for tuner - model = CNN(100, 6, 5, 64, 0) - - with pytest.raises(ValueError): - # passing in n_nodes and n_channels that multiply to equal - # proper dimension for given x - model = CNN(78, 6, 5, 64, "random") - model.fit(X, y) - - with pytest.raises(TypeError): - # passing in float for tuner_type - model = CNN(100, 6, 5, 64, 0.0) - - with pytest.raises(TypeError): - # passing in float for n_nodes - model = CNN(1.1, 6, 5, 64, "random") - - with pytest.raises(TypeError): - # passing in float for n_channels - model = CNN(100, 6.0, 5, 64, "random") - - with pytest.raises(TypeError): - # passing in float for layers - model = CNN(100, 6, layers=5.0) - - with pytest.raises(TypeError): - # passing in float for batch size - model = CNN(100, 6, 5, 6.4, "random") - - with pytest.raises(TypeError): - # passing in string for batch size - model = CNN(100, 6, 5, "64", "random") - - with pytest.raises(TypeError): - # passing in an integer for test_size - model = CNN(100, 6, test_size=20) - - with pytest.raises(TypeError): - # passing in an integer for impute_strategy (this should be a string). - model = CNN(100, 6, impute_strategy=20) - - with pytest.raises(ValueError): - # passing in the wrong string for impute_strategy: - model = CNN(100, 6, impute_strategy="foo") - - with pytest.raises(TypeError): - # passing in a string for random_state (should be int or RandomState). - model = CNN(100, 6, random_state="foo") From 6c99748b5cda9dae050f49405caeea626fefe963 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 15:50:26 -0800 Subject: [PATCH 21/33] Deprecated np.float not used. --- afqinsight/_serial_bagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/afqinsight/_serial_bagging.py b/afqinsight/_serial_bagging.py index c706e6c4..fe06d6ee 100644 --- a/afqinsight/_serial_bagging.py +++ b/afqinsight/_serial_bagging.py @@ -384,7 +384,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): # Validate max_features if isinstance(self.max_features, numbers.Integral): max_features = self.max_features - elif isinstance(self.max_features, np.float): + elif isinstance(self.max_features, float): max_features = self.max_features * self.n_features_in_ else: raise ValueError("max_features must be int or float") @@ -898,7 +898,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): # Validate max_features if isinstance(self.max_features, numbers.Integral): max_features = self.max_features - elif isinstance(self.max_features, np.float): # pragma: no cover + elif isinstance(self.max_features, float): # pragma: no cover max_features = self.max_features * self.n_features_in_ else: # pragma: no cover raise ValueError("max_features must be int or float") From 3d14cce7619dc278b8a249a92fe5d7828ba90abd Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 15:54:14 -0800 Subject: [PATCH 22/33] Pin numpy 1.23.5 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 83cef3a4..74d704a1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,7 +35,7 @@ install_requires = dipy>=1.0.0 groupyr>=0.3.3 matplotlib - numpy<2 + numpy==1.23.5 pandas==2.1.4 requests seaborn==0.13.0 From 2ee4e8e6958928ebff51a7440d48454bd44df6f5 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 16:08:08 -0800 Subject: [PATCH 23/33] Implement the other required attribute of this dummy class. --- afqinsight/tests/test_bagging.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/afqinsight/tests/test_bagging.py b/afqinsight/tests/test_bagging.py index 1bfe1d9e..2a12f40b 100644 --- a/afqinsight/tests/test_bagging.py +++ b/afqinsight/tests/test_bagging.py @@ -240,6 +240,9 @@ def fit(self, X, y): self.training_size_ = X.shape[0] self.training_hash_ = joblib.hash(X) + def predict(self, X): + return np.zeros(X.shape[0]) + def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. From fdcecda1a6675677263223cb49e99f27625dcb3c Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Fri, 23 Feb 2024 16:31:59 -0800 Subject: [PATCH 24/33] Coerce a float type for y. --- afqinsight/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index acc7ef33..09918e01 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -311,7 +311,7 @@ def load_afq_data( else: classes = None - y = np.squeeze(y.to_numpy()) + y = np.squeeze(y.to_numpy()).astype(float) return AFQData( X=X, From e18b6fcc3ee24a1613c947de3f828ed863ca14dc Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 12:27:07 -0800 Subject: [PATCH 25/33] Initial draft implementation of an autoencoder model. --- afqinsight/nn/tf_models.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index e685170c..df9916d0 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -307,3 +307,27 @@ def cnn_resnet(input_shape, n_classes, output_activation="softmax", verbose=Fals model.summary() return model + + +def autoencoder(input_shape, n_hidden=None, verbose=False): + """ + Fully connected autoencoder + """ + ip = Input(shape=input_shape) + if n_hidden is None: + n_hidden = input_shape[0] // 8 + + fc = Flatten()(ip) + fc = Dense(input_shape, activation="relu")(fc) + fc = Dense(input_shape // 2, activation="relu")(fc) + fc = Dense(input_shape // 4, activation="relu")(fc) + fc = Dense(n_hidden, activation="relu")(fc) + fc = Dense(input_shape // 4, activation="relu")(fc) + fc = Dense(input_shape // 2, activation="relu")(fc) + out = Dense(input_shape)(fc) + + model = Model([ip], [out]) + if verbose: + model.summary() + + return model From 7e924391b1e6c9558ab4f4f7c3cf7cab3ef498b8 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 14:56:25 -0800 Subject: [PATCH 26/33] Fixes this test, that was failing because of non-numeric values in ses ID. --- afqinsight/tests/test_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/afqinsight/tests/test_datasets.py b/afqinsight/tests/test_datasets.py index 7a807a96..b6ac9e5f 100644 --- a/afqinsight/tests/test_datasets.py +++ b/afqinsight/tests/test_datasets.py @@ -536,9 +536,9 @@ def test_load_afq_data(dwi_metrics): ) means_ref = ( - nodes.groupby(["subjectID", "tractID"]) + nodes.drop(["nodeID", "sessionID"], axis="columns") + .groupby(["subjectID", "tractID"]) .agg("mean") - .drop("nodeID", axis="columns") .unstack("tractID") ) assert np.allclose(X, means_ref.to_numpy(), equal_nan=True) # nosec From 573c29d1529de8ab5253fe646e774f0752854b36 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 15:47:26 -0800 Subject: [PATCH 27/33] Be a little more liberal with dtype here. This is to that we can deal with nans in y. --- afqinsight/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index 09918e01..08159046 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -311,7 +311,7 @@ def load_afq_data( else: classes = None - y = np.squeeze(y.to_numpy()).astype(float) + y = np.squeeze(y.to_numpy().astype(float)) return AFQData( X=X, From 8777f9d661599fbfc574fc27c76e4866f8db326a Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Sun, 14 Jan 2024 16:08:45 -0800 Subject: [PATCH 28/33] Include dl_qc_score when loading the hbn data. --- afqinsight/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/afqinsight/datasets.py b/afqinsight/datasets.py index 08159046..0925ba8b 100755 --- a/afqinsight/datasets.py +++ b/afqinsight/datasets.py @@ -664,7 +664,7 @@ def from_study(study, verbose=None): "weston-havens": dict(dwi_metrics=["md", "fa"], target_cols=["Age"]), "hbn": dict( dwi_metrics=["dki_md", "dki_fa"], - target_cols=["age", "sex", "scan_site_id"], + target_cols=["age", "sex", "scan_site_id", "dl_qc_score"], label_encode_cols=["sex", "scan_site_id"], index_col="subject_id", ), From 6d2da2647f2bde7e91b0fdbe98860e95a2343bf5 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 18 Jan 2024 11:18:32 -0800 Subject: [PATCH 29/33] A bit more progress on autoencoder. --- afqinsight/nn/tf_models.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index df9916d0..be74d8bb 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -13,7 +13,7 @@ if has_tf: from tensorflow.keras.models import Model - from tensorflow.keras.layers import Dense, Flatten, Dropout, Input + from tensorflow.keras.layers import Dense, Flatten, Dropout, Input, Reshape from tensorflow.keras.layers import MaxPooling1D, Conv1D from tensorflow.keras.layers import LSTM, Bidirectional from tensorflow.keras.layers import ( @@ -309,22 +309,23 @@ def cnn_resnet(input_shape, n_classes, output_activation="softmax", verbose=Fals return model -def autoencoder(input_shape, n_hidden=None, verbose=False): +def autoencoder(input_shape, encoding_dim=None, verbose=False): """ Fully connected autoencoder """ ip = Input(shape=input_shape) - if n_hidden is None: - n_hidden = input_shape[0] // 8 + if encoding_dim is None: + encoding_dim = (input_shape[0] * input_shape[1]) // 8 fc = Flatten()(ip) - fc = Dense(input_shape, activation="relu")(fc) - fc = Dense(input_shape // 2, activation="relu")(fc) - fc = Dense(input_shape // 4, activation="relu")(fc) - fc = Dense(n_hidden, activation="relu")(fc) - fc = Dense(input_shape // 4, activation="relu")(fc) - fc = Dense(input_shape // 2, activation="relu")(fc) - out = Dense(input_shape)(fc) + fc = Dense(input_shape[0] * input_shape[1], activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + fc = Dense(encoding_dim, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + pre_out = Dense((input_shape[0] * input_shape[1]))(fc) + out = Reshape(input_shape)(pre_out) model = Model([ip], [out]) if verbose: From dfb102ddf79f69fd98886a4cde8419d2c84cf74a Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 12 Feb 2024 13:48:15 -0800 Subject: [PATCH 30/33] Adds a conv autoencoder. --- afqinsight/nn/tf_models.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index be74d8bb..dc382a83 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -14,7 +14,7 @@ if has_tf: from tensorflow.keras.models import Model from tensorflow.keras.layers import Dense, Flatten, Dropout, Input, Reshape - from tensorflow.keras.layers import MaxPooling1D, Conv1D + from tensorflow.keras.layers import MaxPooling1D, Conv1D, Conv1DTranspose from tensorflow.keras.layers import LSTM, Bidirectional from tensorflow.keras.layers import ( BatchNormalization, @@ -23,6 +23,7 @@ concatenate, Activation, add, + Layer, ) else: # Since all model building functions start with Input, we make Input the @@ -309,7 +310,7 @@ def cnn_resnet(input_shape, n_classes, output_activation="softmax", verbose=Fals return model -def autoencoder(input_shape, encoding_dim=None, verbose=False): +def fc_autoencoder(input_shape, encoding_dim=None, verbose=False): """ Fully connected autoencoder """ @@ -328,6 +329,28 @@ def autoencoder(input_shape, encoding_dim=None, verbose=False): out = Reshape(input_shape)(pre_out) model = Model([ip], [out]) + if verbose: + model.summary() + return model + + +def cnn_autoencoder(input_shape, verbose=False): + """ + Convolutional autoencoder + """ + ip = Input(shape=input_shape) + # Encoder + x = Conv1D(32, (3), activation="relu", padding="same")(ip) + x = MaxPooling1D((2), padding="same")(x) + x = Conv1D(32, (3), activation="relu", padding="same")(x) + x = MaxPooling1D((2), padding="same")(x) + + # Decoder + x = Conv1DTranspose(32, (3), strides=2, activation="relu", padding="same")(x) + x = Conv1DTranspose(32, (3), strides=2, activation="relu", padding="same")(x) + x = Conv1D(1, (3), activation="sigmoid", padding="same")(x) + + model = Model([ip], [x]) if verbose: model.summary() From 951b3f666eb4fe78192c0138218404703a9e64ae Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 12 Feb 2024 13:51:54 -0800 Subject: [PATCH 31/33] Adds Initial implementation of a VAE. --- afqinsight/nn/tf_models.py | 99 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index dc382a83..c1da05e6 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -25,6 +25,8 @@ add, Layer, ) + from tensorflow.keras.losses import binary_crossentropy + else: # Since all model building functions start with Input, we make Input the # tripwire instance for cases where tensorflow is not installed. @@ -355,3 +357,100 @@ def cnn_autoencoder(input_shape, verbose=False): model.summary() return model + + +class _Sampling(Layer): + """ + Sample the latent layer of a VAE + """ + + def call(self, inputs): + z_mean, z_log_var = inputs + batch = tf.shape(z_mean)[0] + dim = tf.shape(z_mean)[1] + epsilon = tf.random.normal(shape=(batch, dim)) + return z_mean + tf.exp(0.5 * z_log_var) * epsilon + + +def _fc_vae_encoder(input_shape, encoding_dim=None, verbose=False): + """ + Encoder section for a fully connected variational autoencoder + """ + ip = Input(shape=input_shape) + if encoding_dim is None: + encoding_dim = (input_shape[0] * input_shape[1]) // 8 + + fc = Flatten()(ip) + fc = Dense(input_shape[0] * input_shape[1], activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + + z_mean = Dense(encoding_dim, activation="relu")(fc) + z_log_var = Dense(encoding_dim, name="z_mean")(fc) + z = _Sampling()([z_mean, z_log_var]) + return Model(ip, [z_mean, z_log_var, z], name="encoder") + + +def _fc_vae_decoder(input_shape, encoding_dim=None, verbose=False): + """ + Decoder section for a fully connected variational autoencoder + """ + + fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) + fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) + pre_out = Dense((input_shape[0] * input_shape[1]))(fc) + out = Reshape(input_shape)(pre_out) + + +def _VAE(Model): + """ + A variational autoencoder class + """ + + def __init__(self, encoder, decoder, **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss") + self.reconstruction_loss_tracker = tf.keras.metrics.Mean( + name="reconstruction_loss" + ) + self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss") + + @property + def metrics(self): + return [ + self.total_loss_tracker, + self.reconstruction_loss_tracker, + self.kl_loss_tracker, + ] + + def train_step(self, data): + with tf.GradientTape() as tape: + z_mean, z_log_var, z = self.encoder(data) + reconstruction = self.decoder(z) + reconstruction_loss = tf.reduce_mean( + tf.reduce_sum(binary_crossentropy(data, reconstruction), axis=1) + ) + kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)) + kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1)) + total_loss = reconstruction_loss + kl_loss + grads = tape.gradient(total_loss, self.trainable_weights) + self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) + self.total_loss_tracker.update_state(total_loss) + self.reconstruction_loss_tracker.update_state(reconstruction_loss) + self.kl_loss_tracker.update_state(kl_loss) + return { + "loss": self.total_loss_tracker.result(), + "reconstruction_loss": self.reconstruction_loss_tracker.result(), + "kl_loss": self.kl_loss_tracker.result(), + } + + +def fc_vae(input_shape, encoding_dim=None, verbose=False): + """ + Fully connected variational autoencoder. + """ + encoder = _fc_vae_encoder(input_shape, encoding_dim, verbose) + decoder = _fc_vae_decoder(input_shape, encoding_dim, verbose) + return _VAE(encoder, decoder) From e31f680cdea2e70b2708aad5238bf2a19e377508 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 12 Feb 2024 15:58:26 -0800 Subject: [PATCH 32/33] Make sure that encoding dimension takes a non-None value. And that it's consistent between the encoder and decoder. --- afqinsight/nn/tf_models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index c1da05e6..eba0e9b8 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -377,6 +377,7 @@ def _fc_vae_encoder(input_shape, encoding_dim=None, verbose=False): Encoder section for a fully connected variational autoencoder """ ip = Input(shape=input_shape) + if encoding_dim is None: encoding_dim = (input_shape[0] * input_shape[1]) // 8 @@ -388,21 +389,22 @@ def _fc_vae_encoder(input_shape, encoding_dim=None, verbose=False): z_mean = Dense(encoding_dim, activation="relu")(fc) z_log_var = Dense(encoding_dim, name="z_mean")(fc) z = _Sampling()([z_mean, z_log_var]) - return Model(ip, [z_mean, z_log_var, z], name="encoder") + return Model([ip], [z_mean, z_log_var, z], name="encoder") def _fc_vae_decoder(input_shape, encoding_dim=None, verbose=False): """ Decoder section for a fully connected variational autoencoder """ - + ip = Input(shape=(encoding_dim,)) + fc = Flatten()(ip) fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) pre_out = Dense((input_shape[0] * input_shape[1]))(fc) - out = Reshape(input_shape)(pre_out) + return Reshape(input_shape)(pre_out) -def _VAE(Model): +class _VAE(Model): """ A variational autoencoder class """ @@ -451,6 +453,9 @@ def fc_vae(input_shape, encoding_dim=None, verbose=False): """ Fully connected variational autoencoder. """ + if encoding_dim is None: + encoding_dim = (input_shape[0] * input_shape[1]) // 8 + encoder = _fc_vae_encoder(input_shape, encoding_dim, verbose) decoder = _fc_vae_decoder(input_shape, encoding_dim, verbose) return _VAE(encoder, decoder) From 68f03d51577c36738940768aa9c9d2d1876b28c4 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Thu, 22 Feb 2024 21:41:47 -0800 Subject: [PATCH 33/33] More work on autoencoder. Also, use groupyr 0.3.3 --- afqinsight/nn/tf_models.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/afqinsight/nn/tf_models.py b/afqinsight/nn/tf_models.py index eba0e9b8..61637c0b 100644 --- a/afqinsight/nn/tf_models.py +++ b/afqinsight/nn/tf_models.py @@ -336,21 +336,25 @@ def fc_autoencoder(input_shape, encoding_dim=None, verbose=False): return model -def cnn_autoencoder(input_shape, verbose=False): +def cnn_autoencoder(input_shape, encoding_dim=8, verbose=False): """ Convolutional autoencoder """ ip = Input(shape=input_shape) # Encoder - x = Conv1D(32, (3), activation="relu", padding="same")(ip) - x = MaxPooling1D((2), padding="same")(x) - x = Conv1D(32, (3), activation="relu", padding="same")(x) - x = MaxPooling1D((2), padding="same")(x) - + x = Conv1D(32, 3, activation="relu", padding="same")(ip) + x = MaxPooling1D(2, padding="same")(x) + x = Conv1D(16, 3, activation="relu", padding="same")(x) + x = MaxPooling1D(2, padding="same")(x) + shape = x.shape + # Latent + x = Flatten()(x) + x = Dense(encoding_dim, activation="relu")(x) # Decoder - x = Conv1DTranspose(32, (3), strides=2, activation="relu", padding="same")(x) - x = Conv1DTranspose(32, (3), strides=2, activation="relu", padding="same")(x) - x = Conv1D(1, (3), activation="sigmoid", padding="same")(x) + x = Reshape(shape)(x) + x = Conv1DTranspose(32, 3, strides=2, activation="relu", padding="same")(x) + x = Conv1DTranspose(16, 3, strides=2, activation="relu", padding="same")(x) + x = Conv1DTranspose(1, 3, activation="sigmoid", padding="same")(x) model = Model([ip], [x]) if verbose: @@ -401,7 +405,8 @@ def _fc_vae_decoder(input_shape, encoding_dim=None, verbose=False): fc = Dense((input_shape[0] * input_shape[1]) // 4, activation="relu")(fc) fc = Dense((input_shape[0] * input_shape[1]) // 2, activation="relu")(fc) pre_out = Dense((input_shape[0] * input_shape[1]))(fc) - return Reshape(input_shape)(pre_out) + out = Reshape(input_shape)(pre_out) + return Model([ip], [out], name="decoder") class _VAE(Model): @@ -427,6 +432,11 @@ def metrics(self): self.kl_loss_tracker, ] + def call(self, inputs): + z_mean, z_log_var, z = self.encoder(inputs) + reconstructed = self.decoder(z) + return reconstructed + def train_step(self, data): with tf.GradientTape() as tape: z_mean, z_log_var, z = self.encoder(data)