add test for new hyperparameters fitting

liam-o-marsh · liam-o-marsh · commit 7448d1369374 · 2026-01-24T13:05:55.000+01:00
diff --git a/qstack/qml/b2r2.py b/qstack/qml/b2r2.py
@@ -13,6 +13,10 @@
 
 defaults = SimpleNamespace(rcut=3.5, gridspace=0.03)
 
+class Reaction:
+    """A chemical reaction, stored as the ``reactants`` and ``products`` given to the constructor."""
+    def __init__(self, reactants, products):
+        self.reactants, self.products = reactants, products
 
 def get_bags(unique_ncharges):
     """Generate all unique element pair combinations including self-interactions.
diff --git a/qstack/regression/hyperparameters2.py b/qstack/regression/hyperparameters2.py
@@ -87,7 +87,7 @@ def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0
     # after this point, either we are exiting early or we have found the right bounds
     all_errs.sort()
     logger.debug('local minimum in bounds, proceeding with parabolic search (bounds at: %r)', all_errs)
-    logger.debug('chosen: %f\%f/%f', x_left, x_center, x_right)
+    logger.debug('chosen: %f\\%f/%f', x_left, x_center, x_right)
     while n_iter > 0:
         a,b,c = fit_quadratic(x_left, x_center, x_right, y_left, y_center, y_right)
         if a<=0:  # lol no local minimum
@@ -103,6 +103,7 @@ def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0
             ypred_new = -0.25*b**2/a + c
         y_new = get_err(x_new)
         n_iter -=1
+        logger.debug('from chosen points %f\\%f/%f', x_left, x_center, x_right)
         logger.debug('predicted local minimum at %f->%f, true error %f', x_new, ypred_new, y_new)
         all_errs.append((x_new, y_new)) ; all_errs.sort()
         logger.debug('current data: %r', all_errs)
@@ -116,20 +117,21 @@ def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0
             x_right, y_right = all_errs[new_index+1]
             x_center, y_center = all_errs[new_index]
 
+        elif max(y_right,y_left, y_new)-min(y_new, y_center) < y_thres:
+            break
         elif y_new > y_center:
             if x_new > x_center:
                 x_right, y_right = x_new, y_new
             else:
                 x_left, y_left = x_new, y_new
-        elif y_left < y_right:
-            if max(y_right,y_left, y_new)-min(y_new, y_center) < y_thres:
-                break
+        else:  # if y_new <= y_center
             if x_new > x_center:
                 x_left, y_left = x_center, y_center
                 x_center, y_center = x_new, y_new
             else:
                 x_right, y_right = x_center, y_center
                 x_center, y_center = x_new, y_new
+        
         if abs(x_right - x_left) < x_thres:
             break
 
@@ -198,7 +200,7 @@ def inner_loop(split_i, alpha_i, train_idx, val_idx, alpha):
         if not np.isfinite(maes[:, alpha_i]).any():
             pass
         else:
-            res = maes[alpha_i]
+            res = maes[:,alpha_i]
             res = res[np.isfinite(res)]
             concat_results[alpha_i,0] = res.mean()
             concat_results[alpha_i,1] = res.std()
@@ -257,6 +259,7 @@ def get_err(log_sigma):
         )
         err_dict[log_sigma] = (alpha,costs)
         cost_res = costs.mean() + stddev_portion*costs.std()
+        #print("now eval'ing σ=", sigma, '... α=', alpha, costs.shape, costs.mean(), costs.std())
         return cost_res
 
     log_sigma_selected, cost_selected = parabolic_search(
@@ -334,7 +337,7 @@ def hyperparameters(X, y,
         sparse_idx = np.arange(X_train.shape[0])
 
     errors = []
-    with Parallel(n_jobs=-1, return_as="generator_unordered") as parallel:
+    with Parallel(n_jobs=1, return_as="generator_unordered") as parallel:
         if optimise_sigma:
             err_append = lambda sigma,alpha,err,stderr: errors.append((err,stderr, alpha,sigma))
             _,_,_ = search_sigma(
@@ -357,7 +360,7 @@ def hyperparameters(X, y,
 
 
     errors = np.array(errors)
-    ind = np.argsort(errors[:,0]+stddev_portion*errors[:,-1])[::-1]
+    ind = np.argsort(errors[:,0]+stddev_portion*errors[:,1])[::-1]
     errors = errors[ind]
     return errors
 
diff --git a/qstack/regression/skl_objects.py b/qstack/regression/skl_objects.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing._data import KernelCenterer
+from sklearn.utils.validation import (
+    FLOAT_DTYPES,
+    _check_sample_weight,
+    check_is_fitted,
+    validate_data,
+)
+
+from qstack.qml import b2r2, slatm
+
+def _restore_from_pickle(objname: str, version: int, hypers: dict, params: dict|None):
+    """Unpickling hook: rebuild the ``objname`` object from ``hypers``, then restore fitted ``params``."""
+    obj = {"B2R2": B2R2Representation}[objname](**hypers)  # KeyError for an unregistered objname
+    for attr_name, attr_value in (params or {}).items():  # ``version`` is not interpreted yet
+        setattr(obj, attr_name, attr_value)
+    return obj
+
+
+class B2R2Representation(TransformerMixin, BaseEstimator):
+    """Transform molecules or reactions into their B2R2 representations.
+
+    Reference:
+        P. van Gerwen, A. Fabrizio, M. D. Wodrich, C. Corminboeuf,
+        "Physics-based representations for machine learning properties of chemical reactions",
+        Mach. Learn.: Sci. Technol. 3, 045005 (2022), doi:10.1088/2632-2153/ac8f1a
+
+    This representation can be computed for molecules or for reactions,
+    by giving to this transformer a list of one or a list of the other.
+    Note that no fitting is required, and this object is a simple shim best used
+    in pipeline objects.
+
+    Molecules are ASE molecule objects, or any object with `.numbers` and `.positions` (in Å) properties.
+    Reactions, however, are ``qstack.qml.b2r2.Reaction`` objects,
+    or tuples of two lists of molecules (reactants, products).
+
+    Parameters
+    ----------
+    variant: str, default "l"
+        B2R2 variant to compute. Options:
+            - 'l': Local variant with element-resolved skewed Gaussians (default).
+            - 'a': Agnostic variant with element-pair Gaussians.
+            - 'n': Nuclear variant with combined skewed Gaussians.
+    progress: bool, default False
+        If True, displays progress bar (currently only stored: this class shows no progress bar yet)
+    rcut: float, default 3.5
+        Cutoff radius for bond detection in Å
+    gridspace: float, default 0.03
+        Grid spacing for discretization in Å
+
+    Attributes
+    ----------
+    elements_: list
+        Sorted elements collected by ``fit`` from the ``.numbers`` of its input; empty before fitting.
+
+    Examples
+    --------
+    [ fixme ]
+    """
+
+    def __init__(
+        self,
+        variant='l',
+        progress=False,
+        rcut=b2r2.defaults.rcut,
+        gridspace=b2r2.defaults.gridspace,
+    ):
+        """Store the hyperparameters and start with an empty element list."""
+        self.variant = variant
+        self.progress = progress
+        self.rcut = rcut
+        self.gridspace = gridspace
+        self.elements_ = []
+
+    def __reduce__(self):
+        """Pickle as a ``_restore_from_pickle`` call, in the (callable, args tuple) form pickle requires."""
+        hypers = dict(
+            variant = self.variant,
+            progress = self.progress,
+            rcut = self.rcut,
+            gridspace = self.gridspace,
+        )
+        fitted = {'elements_': self.elements_} if self.elements_ else None
+        return (_restore_from_pickle, ("B2R2", 1, hypers, fitted))
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Determine the types of elements to consider, by feeding them from all objects to consider later.
+
+        Parameters
+        ----------
+        X : list of length n_samples, of molecules or list of reactions
+            The chemical objects whose elements (``.numbers``) are collected.
+        y: None
+            Ignored.
+        sample_weight: None
+            Ignored.
+
+        Returns
+        -------
+        self : object
+            Fitted transformer (``elements_`` holds the sorted elements found in ``X``).
+
+        Raises
+        ------
+        ValueError
+            If an object of ``X`` is neither a reaction nor a molecule.
+        """
+        elems = set()
+        for obj in X:
+            if isinstance(obj, tuple) and len(obj) == 2:
+                reac_mols = obj[0] + obj[1]
+            elif isinstance(obj, b2r2.Reaction):
+                reac_mols = obj.reactants + obj.products
+            elif hasattr(obj, "numbers") and hasattr(obj, "positions"):  # check obj itself, not X[0]
+                reac_mols = [obj]
+            else:
+                raise ValueError("unknown type of input")
+            for mol in reac_mols:
+                elems.update(mol.numbers)
+        self.elements_ = sorted(elems)
+
+        return self
+
+    def transform(self, X, y=None, copy=None):
+        """Compute the B2R2 representation of every molecule or reaction of ``X``.
+
+        Parameters
+        ----------
+        X : list of length n_samples, of molecules or list of reactions
+            The chemical objects to compute representations of.
+            Please note they should all be of the same type (reactions OR molecules)
+        y: None
+            Ignored.
+        copy : bool, default=None
+            Ignored
+
+        Returns
+        -------
+        X : {array-like} of shape (n_samples, n_features)
+            Transformed data.
+
+        Raises
+        ------
+        RuntimeError
+            If ``variant`` is not one of 'l', 'a', 'n'.
+        ValueError
+            If the first object of ``X`` is neither a reaction nor a molecule.
+        """
+        if self.variant=='l':
+            get_b2r2_molecular=b2r2.get_b2r2_l_molecular
+            combine = lambda r,p: p-r
+        elif self.variant=='a':
+            get_b2r2_molecular = b2r2.get_b2r2_a_molecular
+            combine = lambda r,p: p-r
+        elif self.variant=='n':
+            get_b2r2_molecular=b2r2.get_b2r2_n_molecular
+            combine = lambda r,p: np.hstack((r,p))
+        else:
+            raise RuntimeError(f'Unknown B2R2 variant={self.variant!r}')
+
+        # the type of the first object decides how every object of X is interpreted
+        if isinstance(X[0], tuple) and len(X[0]) == 2:
+            represent = lambda x: self._get_reac_array(x[0], x[1], get_b2r2_molecular, combine)
+        elif isinstance(X[0], b2r2.Reaction):
+            represent = lambda x: self._get_reac_array(x.reactants, x.products, get_b2r2_molecular, combine)
+        elif hasattr(X[0], "numbers") and hasattr(X[0], "positions"):
+            represent = lambda x: self._get_mol_array(x, get_b2r2_molecular)
+        else:
+            raise ValueError("unknown type of input")
+
+        first_array = represent(X[0])
+        assert first_array.ndim==1
+        # the first representation fixes the dtype and the number of features of the output
+        full_array = np.empty_like(first_array, shape=(len(X), *first_array.shape))
+        full_array[0] = first_array
+        for object_i, x in enumerate(X[1:], start=1):
+            full_array[object_i] = represent(x)
+        return full_array
+
+    def _get_reac_array(self, reactants, products, mol_rep_func, combine):
+        """Sum the molecular representations of each side, then merge both sides with ``combine``."""
+        reac_repr = self._get_mol_array(reactants[0], mol_rep_func)
+        for reac in reactants[1:]:
+            reac_repr += self._get_mol_array(reac, mol_rep_func)
+        prod_repr = self._get_mol_array(products[0], mol_rep_func)
+        for prod in products[1:]:
+            prod_repr += self._get_mol_array(prod, mol_rep_func)
+        return combine(reac_repr, prod_repr)
+
+    def _get_mol_array(self, mol, mol_rep_func):
+        """Call ``mol_rep_func`` on one molecule with this object's ``rcut``, ``gridspace`` and ``elements_``."""
+        return mol_rep_func(mol.numbers, mol.positions, self.rcut, self.gridspace, self.elements_)
diff --git a/tests/test_regression.py b/tests/test_regression.py
@@ -3,6 +3,7 @@
 import os
 import numpy as np
 import qstack.regression.hyperparameters as hyperparameters
+import qstack.regression.hyperparameters2 as hyperparameters2
 import qstack.regression.regression as regression
 import qstack.regression.final_error as final_error
 import qstack.regression.condition as condition
@@ -25,6 +26,25 @@ def test_hyperparameters():
 
     assert (np.allclose(hyper, true_hyper))
 
+def test_hyperparameters2():
+    """Compare the last four rows returned by hyperparameters2.hyperparameters with reference values."""
+    # Debugging aid: call logging.basicConfig() and
+    # logging.getLogger("qstack").setLevel('DEBUG') to get the qstack debug logs.
+    here = os.path.dirname(os.path.realpath(__file__))
+    X = np.load(os.path.join(here, 'data/mols/X_lb.npy'))
+    y = np.loadtxt(os.path.join(here, 'data/mols/dipole.dat'))
+
+    true_hyper = [
+        [5.15813198e-01, 2.37774396e-01, 3.16227766e-08, 3.64079252e+04],
+        [5.15719232e-01, 2.37430538e-01, 1.00000000e-10, 1.00000000e+06],
+        [5.15657638e-01, 2.37472003e-01, 1.00000000e-10, 3.64079252e+04],
+        [5.15699162e-01, 2.37420839e-01, 1.00000000e-10, 1.71990639e+05],
+    ]
+
+    all_errors = hyperparameters2.hyperparameters(X, y, random_state=42)
+    hyper = all_errors[-4:]
+
+    assert np.allclose(hyper, true_hyper)
 
 def test_regression():
     path = os.path.dirname(os.path.realpath(__file__))