Skip to content

Commit 7448d13

Browse files
committed
add test for new hyperparameters fitting
1 parent 033127d commit 7448d13

4 files changed

Lines changed: 229 additions & 7 deletions

File tree

qstack/qml/b2r2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313

1414
defaults = SimpleNamespace(rcut=3.5, gridspace=0.03)
1515

16+
class Reaction:
17+
def __init__(self, reactants, products):
18+
self.reactants = reactants
19+
self.products = products
1620

1721
def get_bags(unique_ncharges):
1822
"""Generate all unique element pair combinations including self-interactions.

qstack/regression/hyperparameters2.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0
8787
# after this point, either we are exiting early or we have found the right bounds
8888
all_errs.sort()
8989
logger.debug('local minimum in bounds, proceeding with parabolic search (bounds at: %r)', all_errs)
90-
logger.debug('chosen: %f\%f/%f', x_left, x_center, x_right)
90+
logger.debug('chosen: %f\\%f/%f', x_left, x_center, x_right)
9191
while n_iter > 0:
9292
a,b,c = fit_quadratic(x_left, x_center, x_right, y_left, y_center, y_right)
9393
if a<=0: # lol no local minimum
@@ -103,6 +103,7 @@ def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0
103103
ypred_new = -0.25*b**2/a + c
104104
y_new = get_err(x_new)
105105
n_iter -=1
106+
logger.debug('from chosen points %f\\%f/%f', x_left, x_center, x_right)
106107
logger.debug('predicted local minimum at %f->%f, true error %f', x_new, ypred_new, y_new)
107108
all_errs.append((x_new, y_new)) ; all_errs.sort()
108109
logger.debug('current data: %r', all_errs)
@@ -116,20 +117,21 @@ def parabolic_search(x_left, x_right, get_err, n_iter=10, x_thres=0.1, y_thres=0
116117
x_right, y_right = all_errs[new_index+1]
117118
x_center, y_center = all_errs[new_index]
118119

120+
elif max(y_right,y_left, y_new)-min(y_new, y_center) < y_thres:
121+
break
119122
elif y_new > y_center:
120123
if x_new > x_center:
121124
x_right, y_right = x_new, y_new
122125
else:
123126
x_left, y_left = x_new, y_new
124-
elif y_left < y_right:
125-
if max(y_right,y_left, y_new)-min(y_new, y_center) < y_thres:
126-
break
127+
else: # if y_new <= y_center
127128
if x_new > x_center:
128129
x_left, y_left = x_center, y_center
129130
x_center, y_center = x_new, y_new
130131
else:
131132
x_right, y_right = x_center, y_center
132133
x_center, y_center = x_new, y_new
134+
133135
if abs(x_right - x_left) < x_thres:
134136
break
135137

@@ -198,7 +200,7 @@ def inner_loop(split_i, alpha_i, train_idx, val_idx, alpha):
198200
if not np.isfinite(maes[:, alpha_i]).any():
199201
pass
200202
else:
201-
res = maes[alpha_i]
203+
res = maes[:,alpha_i]
202204
res = res[np.isfinite(res)]
203205
concat_results[alpha_i,0] = res.mean()
204206
concat_results[alpha_i,1] = res.std()
@@ -257,6 +259,7 @@ def get_err(log_sigma):
257259
)
258260
err_dict[log_sigma] = (alpha,costs)
259261
cost_res = costs.mean() + stddev_portion*costs.std()
262+
#print("now eval'ing σ=", sigma, '... α=', alpha, costs.shape, costs.mean(), costs.std())
260263
return cost_res
261264

262265
log_sigma_selected, cost_selected = parabolic_search(
@@ -334,7 +337,7 @@ def hyperparameters(X, y,
334337
sparse_idx = np.arange(X_train.shape[0])
335338

336339
errors = []
337-
with Parallel(n_jobs=-1, return_as="generator_unordered") as parallel:
340+
with Parallel(n_jobs=1, return_as="generator_unordered") as parallel:
338341
if optimise_sigma:
339342
err_append = lambda sigma,alpha,err,stderr: errors.append((err,stderr, alpha,sigma))
340343
_,_,_ = search_sigma(
@@ -357,7 +360,7 @@ def hyperparameters(X, y,
357360

358361

359362
errors = np.array(errors)
360-
ind = np.argsort(errors[:,0]+stddev_portion*errors[:,-1])[::-1]
363+
ind = np.argsort(errors[:,0]+stddev_portion*errors[:,1])[::-1]
361364
errors = errors[ind]
362365
return errors
363366

qstack/regression/skl_objects.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python3
2+
3+
import numpy as np
4+
from sklearn.base import BaseEstimator, TransformerMixin
5+
from sklearn.preprocessing._data import KernelCenterer
6+
from sklearn.utils.validation import (
7+
FLOAT_DTYPES,
8+
_check_sample_weight,
9+
check_is_fitted,
10+
validate_data,
11+
)
12+
13+
from qstack.qml import b2r2, slatm
14+
15+
def _restore_from_pickle(objname: str, version: int, hypers: dict, params: dict|None):
16+
pass
17+
18+
19+
20+
21+
22+
23+
class B2R2Representation(TransformerMixin, BaseEstimator):
24+
"""Transform reactions into their B2R2 representations
25+
26+
Reference:
27+
P. van Gerwen, A. Fabrizio, M. D. Wodrich, C. Corminboeuf,
28+
"Physics-based representations for machine learning properties of chemical reactions",
29+
Mach. Learn.: Sci. Technol. 3, 045005 (2022), doi:10.1088/2632-2153/ac8f1a
30+
31+
This representation can be computed for molecules or for reactions,
32+
by giving to this transformer a list of one or a list of the other.
33+
Note that no fitting is required, and this object is a simple shim best used
34+
in pipeline objects.
35+
36+
Molecules are ASE molecule objects, or any object with `.numbers` and `.positions` (in Å) properties.
37+
Reactions, however, are ``qstack.qml.b2r2.Reaction objects,
38+
or tuples of two lists of molecules (reactants, products).
39+
40+
Parameters
41+
----------
42+
variant: str, default "l"
43+
B2R2 variant to compute. Options:
44+
- 'l': Local variant with element-resolved skewed Gaussians (default).
45+
- 'a': Agnostic variant with element-pair Gaussians.
46+
- 'n': Nuclear variant with combined skewed Gaussians.
47+
48+
progress: bool, default False
49+
If True, displays progress bar
50+
51+
rcut: float, default 3.5
52+
Cutoff radius for bond detection in Å
53+
54+
gridspace: float, default 0.03
55+
Grid spacing for discretization in Å
56+
57+
58+
Attributes
59+
----------
60+
None
61+
62+
Examples
63+
--------
64+
[ fixme ]
65+
"""
66+
67+
def __init__(
68+
self,
69+
variant='l',
70+
progress=False,
71+
rcut=b2r2.defaults.rcut,
72+
gridspace=b2r2.defaults.gridspace,
73+
):
74+
"""Initialize StandardFlexibleScaler."""
75+
self.variant = variant
76+
self.progress = progress
77+
self.rcut = rcut
78+
self.gridspace = gridspace
79+
self.elements_ = []
80+
81+
def __reduce__(self):
82+
return (
83+
_restore_from_pickle,
84+
"B2R2", 1,
85+
dict(
86+
variant = self.variant,
87+
progress = self.progress,
88+
rcut = self.rcut,
89+
gridspace = self.gridspace,
90+
),
91+
{'elements_': self.elements} if self.elements else None,
92+
)
93+
94+
def fit(self, X, y=None, sample_weight=None):
95+
"""Determine the types of elements to consider, by feeding them from all objects to consider later.
96+
97+
Parameters
98+
----------
99+
X : numpy.ndarray of shape (n_samples, n_features)
100+
The data used to compute the mean and standard deviation
101+
used for later scaling along the features axis.
102+
y: None
103+
Ignored.
104+
sample_weight: numpy.ndarray of shape (n_samples,)
105+
Weights for each sample. Sample weighting can be used to center
106+
(and scale) data using a weighted mean. Weights are internally
107+
normalized before preprocessing.
108+
109+
Returns
110+
-------
111+
self : object
112+
Fitted scaler.
113+
"""
114+
115+
elems = set()
116+
for obj in X:
117+
if isinstance(obj, tuple) and len(obj) == 2:
118+
reac_mols = obj[0] + obj[1]
119+
elif isinstance(obj, b2r2.Reaction):
120+
reac_mols = obj.reactants + obj.products
121+
elif hasattr(X[0], "numbers") and hasattr(X[0], "positions"):
122+
reac_mols = [obj]
123+
for mol in reac_mols:
124+
elems.update(mol.numbers)
125+
self.elements_ = sorted(elems)
126+
127+
return self
128+
129+
def transform(self, X, y=None, copy=None):
130+
"""Normalize a vector based on previously computed mean and scaling.
131+
132+
Parameters
133+
----------
134+
X : list of length n_samples, of molecules or list of reactions
135+
The chemical objects to compute representations of.
136+
Please note they should all be of the same type (reactions OR molecules)
137+
y: None
138+
Ignored.
139+
copy : bool, default=None
140+
Ignored
141+
142+
Returns
143+
-------
144+
X : {array-like} of shape (n_samples, n_features)
145+
Transformed data.
146+
"""
147+
148+
if self.variant=='l':
149+
get_b2r2_molecular=b2r2.get_b2r2_l_molecular
150+
combine = lambda r,p: p-r
151+
elif self.variant=='a':
152+
get_b2r2_molecular = b2r2.get_b2r2_a_molecular
153+
combine = lambda r,p: p-r
154+
elif self.variant=='n':
155+
get_b2r2_molecular=b2r2.get_b2r2_n_molecular
156+
combine = lambda r,p: np.hstack((r,p))
157+
else:
158+
raise RuntimeError(f'Unknown B2R2 {variant=}')
159+
160+
if isinstance(X[0], tuple) and len(X[0]) == 2:
161+
mode = "reac-2"
162+
first_array = self._get_reac_array(X[0][0], X[0][1], get_b2r2_molecular, combine)
163+
elif isinstance(X[0], b2r2.Reaction):
164+
mode = "reac"
165+
first_array = self._get_reac_array(X[0].reactants, X[0].products, get_b2r2_molecular, combine)
166+
elif hasattr(X[0], "numbers") and hasattr(X[0], "positions"):
167+
mode = "mol"
168+
first_array = self._get_mol_array(X[0], get_b2r2_molecular)
169+
else:
170+
raise ValueError("unknown type of input")
171+
172+
assert first_array.ndim==1
173+
full_array = np.empty_like(first_array, shape=(len(X), *first_array.shape))
174+
full_array[0] = first_array
175+
176+
for object_i,x in enumerate(X[1:]):
177+
if mode == "reac-2":
178+
full_array[object_i+1] = self._get_reac_array(x[0], x[1], get_b2r2_molecular, combine)
179+
elif mode == "reac":
180+
full_array[object_i+1] = self._get_reac_array(x.reactants, x.products, get_b2r2_molecular, combine)
181+
elif mode == "mol":
182+
full_array[object_i+1] = self._get_mol_array(x, get_b2r2_molecular)
183+
return full_array
184+
185+
def _get_reac_array(self, reactants, products, mol_rep_func, combine):
186+
reac_repr = self._get_mol_array(reactants[0], mol_rep_func)
187+
for reac in reactants[1:]:
188+
reac_repr += self._get_mol_array(reac, mol_rep_func)
189+
prod_repr = self._get_mol_array(products[0], mol_rep_func)
190+
for prod in products[1:]:
191+
prod_repr += self._get_mol_array(prod, mol_rep_func)
192+
return combine(reac_repr, prod_repr)
193+
194+
def _get_mol_array(self, mol, mol_rep_func):
195+
return mol_rep_func(mol.numbers, mol.positions, self.rcut, self.gridspace, self.elements_)

tests/test_regression.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import numpy as np
55
import qstack.regression.hyperparameters as hyperparameters
6+
import qstack.regression.hyperparameters2 as hyperparameters2
67
import qstack.regression.regression as regression
78
import qstack.regression.final_error as final_error
89
import qstack.regression.condition as condition
@@ -25,6 +26,25 @@ def test_hyperparameters():
2526

2627
assert (np.allclose(hyper, true_hyper))
2728

29+
def test_hyperparameters2():
30+
#import logging
31+
#logging.basicConfig()
32+
#logging.getLogger("qstack").setLevel('DEBUG')
33+
path = os.path.dirname(os.path.realpath(__file__))
34+
xfile = os.path.join(path, 'data/mols/X_lb.npy')
35+
X = np.load(xfile)
36+
yfile = os.path.join(path, 'data/mols/dipole.dat')
37+
y = np.loadtxt(yfile)
38+
39+
hyper = hyperparameters2.hyperparameters(X, y, random_state=42)[-4:]
40+
true_hyper = [
41+
[5.15813198e-01, 2.37774396e-01, 3.16227766e-08, 3.64079252e+04],
42+
[5.15719232e-01, 2.37430538e-01, 1.00000000e-10, 1.00000000e+06],
43+
[5.15657638e-01, 2.37472003e-01, 1.00000000e-10, 3.64079252e+04],
44+
[5.15699162e-01, 2.37420839e-01, 1.00000000e-10, 1.71990639e+05],
45+
]
46+
47+
assert (np.allclose(hyper, true_hyper))
2848

2949
def test_regression():
3050
path = os.path.dirname(os.path.realpath(__file__))

0 commit comments

Comments
 (0)