Skip to content

Commit 3564384

Browse files
committed
casm-learn set sample weights using SVD; addresses #15 "Matrix is not positive definite" error when using 0-value weights
1 parent ec7bb50 commit 3564384

2 files changed

Lines changed: 66 additions & 17 deletions

File tree

python/casm/casm/learn/fit.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -474,12 +474,30 @@ def print_input_help():
474474
# The "problem_specs"/"weight" options specify the method to use for weighting
475475
# training data.
476476
#
477-
# If weights are included, then the linear model is changed from
478-
# X*b = y -> L*X*b = L*y,
477+
# Ordinary least squares minimizes
478+
# (y-X*b).transpose() * (y-X*b)
479+
#
480+
# where 'X' is the correlation matrix of shape (Nvalue, Nbfunc), and 'y'
481+
# is a vector of Nvalue calculated properties, and 'b' are the fitting
482+
# coefficients (ECI).
479483
#
480-
# where 'X' is the correlation matrix of shape (Nvalue, Nbfunc),
481-
# and 'property' is a vector of Nvalue calculated properties, and
482-
# W = L*L.transpose() is the weight matrix.
484+
# Weighted least squares minimizes
485+
# (y-X*b).transpose() * W * (y-X*b)
486+
#
487+
# Using the SVD, and given that W is Hermitian:
488+
# U * S * U.transpose() == W
489+
#
490+
# Define L such that:
491+
# L.transpose() = U * sqrt(S)
492+
#
493+
# Then we can write the weighted least squares problem using:
494+
# (y-X*b).transpose() * L.transpose() * L * (y-X*b)
495+
#
496+
# Or:
497+
# (L*y-L*X*b).transpose() * (L*y-L*X*b)
498+
#
499+
# So, if weights are included, then the linear model is changed from
500+
# X*b = y -> L*X*b = L*y
483501
#
484502
# By default, W = np.matlib.eye(Nvalue) (unweighted).
485503
#
@@ -797,19 +815,27 @@ def print_input_help():
797815
# with.
798816
#
799817
# "pop_begin_filename": string, optional, default="population_begin.pkl"
800-
# Filename where the initial population is read from, if it exists.
801-
#
818+
# Filename suffix where the initial population is read from, if it
819+
# exists. For example, if "filename_prefix" is "Ef_kfold10" and
820+
# "pop_begin_filename" is "population_begin.pkl", then the initial
821+
# population is read from the file "Ef_kfold10_population_begin.pkl".
822+
#
802823
# "pop_end_filename": string, optional, default="population_end.pkl"
803-
# Filename where the final population is saved.
824+
# Filename where the final population is saved. For example, if
825+
# "filename_prefix" is "Ef_kfold10" and "pop_end_filename" is
826+
# "population_end.pkl", then the final population is saved to the
827+
# file "Ef_kfold10_population_end.pkl".
804828
#
805829
# "halloffame_filename": string, optional, default="evolve_halloffame.pkl"
806830
# Filename where a hall of fame is saved holding the best individuals
807-
# encountered in any generation.
831+
# encountered in any generation. For example, if "filename_prefix" is
832+
# "Ef_kfold10" and "halloffame_filename" is "evolve_halloffame.pkl",
833+
# then it is saved to the file "Ef_kfold10_evolve_halloffame.pkl".
808834
#
809835
# "filename_prefix": string, optional
810836
# Prefix for filenames, default uses input file filename excluding
811837
# extension. For example, if input file is named "Ef_kfold10.json", then
812-
# "Ef_kfold10_population_begin.pkl", "specs2_population_end.pkl", and
838+
# "Ef_kfold10_population_begin.pkl", "Ef_kfold10_population_end.pkl", and
813839
# "Ef_kfold10_evolve_halloffame.pkl" are used.
814840
815841
"feature_selection" : {
@@ -1414,11 +1440,11 @@ def read_sample_weight(input, tdata, verbose=True):
14141440
# use method to get weights
14151441
if specs["weight"]["method"] == "wCustom":
14161442
if verbose:
1417-
print "Reading custom weights"
1443+
print "# Reading custom weights"
14181444
sample_weight = tdata.data["weight"].values
14191445
elif specs["weight"]["method"] == "wCustom2d":
14201446
if verbose:
1421-
print "Reading custom2d weights"
1447+
print "# Reading custom2d weights"
14221448
cols = ["weight(" + str(i) + ")" for i in xrange(tdata.n_samples)]
14231449
sample_weight = tdata.data.loc[:,cols].values
14241450
elif specs["weight"]["method"] == "wHullDist":
@@ -1543,7 +1569,9 @@ def check_input(name):
15431569
pickle.dump(fdata, open(fit_data_filename, 'wb'))
15441570

15451571
if verbose:
1546-
print "# Writing problem specs to:", fit_data_filename, "\n"
1572+
print "# Writing problem specs to:", fit_data_filename
1573+
print "# To inspect or customize the problem specs further, use the '--checkspecs' method\n"
1574+
15471575

15481576
# during runtime only, if LinearRegression and LeaveOneOut, update fdata.cv and fdata.scoring
15491577
# to use optimized LOOCV score method

python/casm/casm/learn/tools.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,32 @@ def set_sample_weight(sample_weight, y=None, X=None):
7676
"""
7777
Calculate weighted data and weighted target values.
7878
79-
Uses sample weights to calculate
79+
Ordinary least squares minimizes
80+
(y-X*b).transpose() * (y-X*b)
8081
81-
L*X * b = L*y
82+
where 'X' is the correlation matrix of shape (Nvalue, Nbfunc), and 'y'
83+
is a vector of Nvalue calculated properties, and 'b' are the fitting
84+
coefficients (ECI).
85+
86+
Weighted least squares minimizes
87+
(y-X*b).transpose() * W * (y-X*b)
88+
89+
Using the SVD, and given that W is Hermitian:
90+
U * S * U.transpose() == W
91+
92+
Define L such that:
93+
L.transpose() = U * sqrt(S)
8294
83-
a weighted linear model where the weights are given by W = L * L.transpose().
95+
Then we can write the weighted least squares problem using:
96+
(y-X*b).transpose() * L.transpose() * L * (y-X*b)
8497
98+
Or:
99+
(L*y-L*X*b).transpose() * (L*y-L*X*b)
100+
101+
So, if weights are included, then the linear model is changed from
102+
X*b = y -> L*X*b = L*y
103+
104+
85105
Arguments
86106
---------
87107
@@ -142,7 +162,8 @@ def set_sample_weight(sample_weight, y=None, X=None):
142162
raise Exception("Error in set_sample_weight: sample_weight dimension > 2")
143163

144164
# weighted data
145-
L = np.linalg.cholesky(W)
165+
U, S, V = np.linalg.svd(W)
166+
L = U.dot(np.diag(np.sqrt(S))).transpose()
146167

147168
if X is not None:
148169
weighted_X = np.dot(L, X)

0 commit comments

Comments
 (0)