Merge remote-tracking branch 'public/0.2.X' into 0.2.X_merge_public

bpuchala · bpuchala · commit 04ad28d56bc9 · 2017-04-07T23:53:26.000-04:00
Should check python/vasp/vasp/io/incar.py merge

Conflicts:
	python/vasp/vasp/io/incar.py
	src/casm/app/enum.cc
	src/casm/app/format.cc
diff --git a/INSTALL.md b/INSTALL.md
@@ -5,7 +5,7 @@
 
 **C++11**
 
-CASM must be compiled with a compiler that supports the C++11 standard.
+CASM must be compiled with a compiler that supports the C++11 standard. Testing is done with gcc-4.8.5 and clang-800.0.42.1.
 
 **If using Mac OS X - Xcode command-line tools**
 
diff --git a/python/casm/casm/learn/evolve.py b/python/casm/casm/learn/evolve.py
@@ -8,6 +8,7 @@
 import deap.tools
 import deap.algorithms
 from operator import attrgetter
+from deap.tools import HallOfFame
 
 
 def initNRandomOn(container, n_features, n_features_init):
@@ -204,6 +205,7 @@ def initialize_population(n_population, toolbox, filename=None, verbose=True):
   """
   if filename is not None and os.path.exists(filename):
     load initial population
+    # may be List[Individual] or HallOfFame, which is converted to List[Individual]
   else:
     create random initial population of size n_population via toolbox.population
   """
@@ -214,6 +216,9 @@ def initialize_population(n_population, toolbox, filename=None, verbose=True):
       print "Loading initial population:", filename
     with open(filename, 'rb') as f:
       pop = pickle.load(f)
+    if isinstance(pop, HallOfFame):
+      # convert to List
+      pop = [indiv for indiv in pop]
   else:
     if verbose:
       print "Constructing initial population"
diff --git a/python/casm/casm/learn/fit.py b/python/casm/casm/learn/fit.py
@@ -475,12 +475,30 @@ def print_input_help():
   # The "problem_specs"/"weight" options specify the method to use for weighting 
   # training data. 
   #
-  #   If weights are included, then the linear model is changed from
-  #     X*b = y  ->  L*X*b = L*y, 
+  #   Ordinary least squares minimizes
+  #     (y-X*b).transpose() * (y-X*b)
+  #  
+  #   where 'X' is the correlation matrix of shape (Nvalue, Nbfunc), and 'y' 
+  #   is a vector of Nvalue calculated properties, and 'b' are the fitting 
+  #   coefficients (ECI).
   #
-  #   where 'X' is the correlation matrix of shape (Nvalue, Nbfunc),
-  #   and 'property' is a vector of Nvalue calculated properties, and 
-  #   W = L*L.transpose() is the weight matrix.
+  #   Weighted least squares minimizes
+  #     (y-X*b).transpose() * W * (y-X*b)
+  #  
+  #   Using the SVD, and given that W is symmetric positive semi-definite:
+  #     U * S * U.transpose() == W
+  #  
+  #   Define L such that:
+  #     L.transpose() = U * sqrt(S)
+  #  
+  #   Then we can write the weighted least squares problem using:
+  #     (y-X*b).transpose() * L.transpose() * L * (y-X*b)
+  #  
+  #   Or:
+  #     (L*y-L*X*b).transpose() * (L*y-L*X*b)
+  #    
+  #   So, if weights are included, then the linear model is changed from
+  #     X*b = y  ->  L*X*b = L*y
   #
   #   By default, W = np.matlib.eye(Nvalue) (unweighted).
   #
@@ -776,8 +794,8 @@ def print_input_help():
   #   Options for "evolve_params_kwargs":
   #
   #     "n_population": int, optional, default=100
-  #        Population size. This many random initial starting individuals are 
-  #        created.
+  #        Initial population size. This many random starting individuals 
+  #        are created if no "pop_begin_filename" file exists.
   #     
   #     "n_halloffame": int, optional, default=25
   #        Maxsize of the hall of fame which holds the best individuals 
@@ -798,19 +816,32 @@ def print_input_help():
   #        with.
   #
   #     "pop_begin_filename": string, optional, default="population_begin.pkl"
-  #        Filename where the initial population is read from, if it exists.
-  #
+  #        Filename suffix where the initial population is read from, if it 
+  #        exists. For example, if "filename_prefix" is "Ef_kfold10" and 
+  #        "pop_begin_filename" is "population_begin.pkl", then the initial
+  #        population is read from the file "Ef_kfold10_population_begin.pkl".
+  #
+  #        The population file may contain either a list of individuals,
+  #        as written to the "population_end.pkl" file, or a HallOfFame
+  #        instance, as written to either an "evolve_halloffame.pkl" file or
+  #        overall casm-learn "halloffame.pkl" file.
+  #        
   #     "pop_end_filename": string, optional, default="population_end.pkl"
-  #        Filename where the final population is saved.
+  #        Filename where the final population is saved. For example, if 
+  #        "filename_prefix" is "Ef_kfold10" and "pop_end_filename" is 
+  #        "population_end.pkl", then the final population is saved to the 
+  #        file "Ef_kfold10_population_end.pkl".
   #      
   #     "halloffame_filename": string, optional, default="evolve_halloffame.pkl"
   #        Filename where a hall of fame is saved holding the best individuals 
-  #        encountered in any generation.
+  #        encountered in any generation. For example, if "filename_prefix" is 
+  #        "Ef_kfold10" and "halloffame_filename" is "evolve_halloffame.pkl", 
+  #        then it is saved to the file "Ef_kfold10_evolve_halloffame.pkl".
   #      
   #     "filename_prefix": string, optional
   #        Prefix for filenames, default uses input file filename excluding 
   #        extension. For example, if input file is named "Ef_kfold10.json", then
-  #        "Ef_kfold10_population_begin.pkl", "specs2_population_end.pkl", and 
+  #        "Ef_kfold10_population_begin.pkl", "Ef_kfold10_population_end.pkl", and 
   #        "Ef_kfold10_evolve_halloffame.pkl" are used.
   
     "feature_selection" : {
@@ -919,6 +950,10 @@ def print_input_help():
   #     "uncalculated" : Predicted ground states and near ground states that have not been calculated
   #     "below_hull" : All configurations predicted below the prediction of the DFT hull
   #
+  # primitive_only: bool, optional, default=True
+  #   If True, only use primitive configurations to construct the convex hull,
+  #   else use all selected configurations. 
+  #
   # uncalculated_range: number, optional, default=0.0
   #   Include all configurations with clex_hull_dist less than this value (+hull_tol)
   #   in the "uncalculated" configurations results. Default only includes predicted
@@ -945,6 +980,7 @@ def print_input_help():
     "checkhull" : {
       "selection": "ALL",
       "write_results": true,
+      "primitive_only": true,
       "uncalculated_range": 1e-8,
       "ranged_rms": [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
       "composition": "atom_frac",
@@ -1410,11 +1446,11 @@ def read_sample_weight(input, tdata, verbose=True):
   # use method to get weights
   if specs["weight"]["method"] == "wCustom":
     if verbose:
-      print "Reading custom weights"
+      print "# Reading custom weights"
     sample_weight = tdata.data["weight"].values
   elif specs["weight"]["method"] == "wCustom2d":
     if verbose:
-      print "Reading custom2d weights"
+      print "# Reading custom2d weights"
     cols = ["weight(" + str(i) + ")" for i in xrange(tdata.n_samples)]
     sample_weight = tdata.data.loc[:,cols].values
   elif specs["weight"]["method"] == "wHullDist":
@@ -1539,7 +1575,9 @@ def check_input(name):
       pickle.dump(fdata, open(fit_data_filename, 'wb'))
     
     if verbose:
-      print "# Writing problem specs to:", fit_data_filename, "\n"
+      print "# Writing problem specs to:", fit_data_filename
+      print "# To inspect or customize the problem specs further, use the '--checkspecs' method\n"
+    
   
   # during runtime only, if LinearRegression and LeaveOneOut, update fdata.cv and fdata.scoring
   # to use optimized LOOCV score method
@@ -1813,12 +1851,16 @@ def checkhull(input, hall, indices=None, verbose=True):
             "gs_spurious" : Predicted ground states that are not DFT ground states
             "uncalculated" : Predicted ground states and near ground states that have not been calculated
             "below_hull" : All configurations predicted below the prediction of the DFT hull
-        
+
+        primitive_only: bool, optional, default=True
+          If True, only use primitive configurations to construct the convex hull,
+          else use all selected configurations. 
+
         uncalculated_range: number, optional, default=0.0
           Include all configurations with clex_hull_dist less than this value (+hull_tol)
           in the "uncalculated" configurations. Default only includes predicted
           ground states.
-        
+ 
         ranged_rms: List[number], optional, default=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
           Calculates the root-mean-square error for DFT calculated configurations
           within a particular range (in eV/unitcell) of the DFT hull. The list
@@ -1843,6 +1885,7 @@ def checkhull(input, hall, indices=None, verbose=True):
           "checkhull" : {
             "selection": "ALL",
             "write_results": true,
+            "primitive_only": true,
             "uncalculated_range": 0.0,
             "ranged_rms": [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
             "composition": "atom_frac",
@@ -1878,6 +1921,7 @@ def checkhull(input, hall, indices=None, verbose=True):
   opt = {
     "selection":"ALL", 
     "write_results":False,
+    "primitive_only":True,
     "uncalculated_range":0.0,
     "hull_tol":1e-8,
     "composition":"atom_frac",
@@ -1905,6 +1949,7 @@ def checkhull(input, hall, indices=None, verbose=True):
   uncalculated_range = d["uncalculated_range"]
   dim_tol = d["dim_tol"]
   bottom_tol = d["bottom_tol"]
+  primitive_only = d["primitive_only"]
   
   # save current default clex
   orig_clex = proj.settings.default_clex
@@ -1946,8 +1991,11 @@ def checkhull(input, hall, indices=None, verbose=True):
   comp = "comp"
   dft_Eform = "formation_energy"
   clex_Eform = "clex(formation_energy)"
-  
-  sel.query([comp, is_primitive, is_calculated, configname, dft_hull_dist_long, dft_Eform])
+
+  query_cols = [comp, is_calculated, configname, dft_hull_dist_long, dft_Eform]
+  if primitive_only:
+    query_cols.append(is_primitive)
+  sel.query(query_cols)
   
   
   compcol = []
@@ -1979,7 +2027,10 @@ def checkhull(input, hall, indices=None, verbose=True):
     indiv = hall[indiv_i]
     write_eci(proj, indiv.eci, fit_details=casm.learn.to_json(indiv_i, indiv), clex=clex, verbose=verbose)
     
-    df = sel.data[sel.data.loc[:,is_primitive] == 1].sort_values(compcol)
+    if primitive_only:
+      df = sel.data[sel.data.loc[:,is_primitive] == 1].sort_values(compcol)
+    else:
+      df = sel.data.sort_values(compcol)
     df_calc = df[df.loc[:,is_calculated] == 1].apply(pandas.to_numeric, errors='ignore')
     dft_gs = df_calc[df_calc.loc[:,dft_hull_dist_long] < hull_tol]
 
@@ -1990,7 +2041,10 @@ def checkhull(input, hall, indices=None, verbose=True):
     # query:
     sel.query([clex_hull_dist_long, clex_Eform, clex_dft_hull_dist_long], force=True)
     
-    df = sel.data[sel.data.loc[:,is_primitive] == 1].sort_values(compcol)
+    if primitive_only:
+      df = sel.data[sel.data.loc[:,is_primitive] == 1].sort_values(compcol)
+    else:
+      df = sel.data.sort_values(compcol)
     df.rename(
       inplace=True, 
       columns={
@@ -2029,7 +2083,7 @@ def printer(attr, title):
       if df.shape[0]:
         if verbose:
           print title + ":"
-          print df.drop(to_drop, axis=1).to_string(**kwargs)
+          print df.drop(to_drop, axis=1, errors='ignore').to_string(**kwargs)
           if write_results:
             print "write:", output_name, "\n"
           else:
diff --git a/python/casm/casm/learn/tools.py b/python/casm/casm/learn/tools.py
@@ -76,12 +76,32 @@ def set_sample_weight(sample_weight, y=None, X=None):
   """ 
   Calculate weighted data and weighted target values.
   
-  Uses sample weights to calculate
+  Ordinary least squares minimizes
+    (y-X*b).transpose() * (y-X*b)
   
-    L*X * b = L*y 
+  where 'X' is the correlation matrix of shape (Nvalue, Nbfunc), and 'y' 
+  is a vector of Nvalue calculated properties, and 'b' are the fitting 
+  coefficients (ECI).
+
+  Weighted least squares minimizes
+    (y-X*b).transpose() * W * (y-X*b)
+  
+  Using the SVD, and given that W is symmetric positive semi-definite:
+    U * S * U.transpose() == W
+  
+  Define L such that:
+    L.transpose() = U * sqrt(S)
   
-  a weighted linear model where the weights are given by W = L * L.transpose().
+  Then we can write the weighted least squares problem using:
+    (y-X*b).transpose() * L.transpose() * L * (y-X*b)
   
+  Or:
+    (L*y-L*X*b).transpose() * (L*y-L*X*b)
+    
+  So, if weights are included, then the linear model is changed from
+    X*b = y  ->  L*X*b = L*y
+    
+    
   Arguments
   ---------
     
@@ -142,7 +162,8 @@ def set_sample_weight(sample_weight, y=None, X=None):
     raise Exception("Error in set_sample_weight: sample_weight dimension > 2")
   
   # weighted data
-  L = np.linalg.cholesky(W)
+  U, S, V = np.linalg.svd(W)
+  L = U.dot(np.diag(np.sqrt(S))).transpose()
   
   if X is not None:
     weighted_X = np.dot(L, X)
diff --git a/src/casm/app/format.cc b/src/casm/app/format.cc
@@ -734,27 +734,27 @@ LCHARG = .FALSE.\n";
       args.log << "This JSON file contains specifications for generating the cluster\n" <<
                "basis functions.                                                    \n\n";
 
-      args.log << "'site_basis_functions' may specify a string, which can be either 'occupation' or \n"
-               << "'chebychev'. Otherwise, specifies a JSON object containing a composition vector or\n"
-               << "a JSON array containing multiple composition vectors. A single composition vector\n"
-               << "is formatted as, e.g.\n"
-               << "   \"composition\" : [\"Au\" : 0.25, \"Cu\" : 0.75] \n"
-               << "The site basis functions will then be constructed as to be optimized for that composition.\n\n"
+      std::cout << "'site_basis_functions' may specify a string, which can be either 'occupation' or \n"
+                << "'chebychev'. Otherwise, specifies a JSON object containing a composition vector or\n"
+                << "a JSON array containing multiple composition vectors. A single composition vector\n"
+                << "is formatted as, e.g.\n"
+                << "   \"composition\" : {\"Au\" : 0.25, \"Cu\" : 0.75} \n"
+                << "The site basis functions will then be constructed as to be optimized for that composition.\n\n"
 
                << "To specify different compositions on multiple sublattices, an array can be used. \n"
                << "As an example, the following specifies a different composition on sublattice 0 than\n"
                << "on sublattices 1 and 3: \n\n"
 
-               << "   \"site_basis_functions\" : [\n"
-               << "                                {\n"
-               << "                                  \"composition\" : [\"Ga\" : 0.3, \"In\" : 0.7],\n"
-               << "                                  \"sublat_indices\" : [0]\n"
-               << "                                },\n"
-               << "                                {\n"
-               << "                                  \"composition\" : [\"Ga\" : 1.0, \"In\" : 0.0],\n"
-               << "                                  \"sublat_indices\" : [1,2]\n"
-               << "                                }\n"
-               << "                             ]\n\n"
+                << "   \"site_basis_functions\" : [\n"
+                << "                                {\n"
+                << "                                  \"composition\" : {\"Ga\" : 0.3, \"In\" : 0.7},\n"
+                << "                                  \"sublat_indices\" : [0]\n"
+                << "                                },\n"
+                << "                                {\n"
+                << "                                  \"composition\" : {\"Ga\" : 1.0, \"In\" : 0.0},\n"
+                << "                                  \"sublat_indices\" : [1,2]\n"
+                << "                                }\n"
+                << "                             ]\n\n"
 
                << "Sublattices are specified in the same order as in prim.json. Sublattice compositions\n"
                << "are not allowed to break the symmetry of the crystal. If equivalent sublattices are\n"
diff --git a/src/casm/monte_carlo/grand_canonical/GrandCanonical.cc b/src/casm/monte_carlo/grand_canonical/GrandCanonical.cc
@@ -423,8 +423,7 @@ namespace CASM {
     auto corr = correlations(config, _clexulator());
     double formation_energy = _eci() * corr.data();
     auto comp_x = primclex().composition_axes().param_composition(CASM::comp_n(config));
-    double Ep = formation_energy - comp_x.dot(m_condition.param_chem_pot());
-    return Ep / supercell().volume();
+    return formation_energy - comp_x.dot(m_condition.param_chem_pot());
   }
 
   /// \brief Calculate delta correlations for an event
diff --git a/src/casm/monte_carlo/grand_canonical/GrandCanonicalIO.cc b/src/casm/monte_carlo/grand_canonical/GrandCanonicalIO.cc
@@ -286,7 +286,8 @@ namespace CASM {
 
     DataFormatter<ConstMonteCarloPtr> formatter;
 
-    formatter.push_back(ConstantValueFormatter<std::string, ConstMonteCarloPtr>("configname", configname));
+    bool print_json = true;
+    formatter.push_back(ConstantValueFormatter<std::string, ConstMonteCarloPtr>("configname", configname, print_json));
     formatter.push_back(MonteCarloTFormatter<GrandCanonical>());
     formatter.push_back(GrandCanonicalLTEFormatter(phi_LTE1));
     std::set<std::string> exclude;

Original file line number	Diff line number	Diff line change
`@@ -423,8 +423,7 @@ namespace CASM {`
`423`	`423`	`auto corr = correlations(config, _clexulator());`
`424`	`424`	`double formation_energy = _eci() * corr.data();`
`425`	`425`	`auto comp_x = primclex().composition_axes().param_composition(CASM::comp_n(config));`
`426`		`- double Ep = formation_energy - comp_x.dot(m_condition.param_chem_pot());`
`427`		`- return Ep / supercell().volume();`
	`426`	`+ return formation_energy - comp_x.dot(m_condition.param_chem_pot());`
`428`	`427`	`}`
`429`	`428`
`430`	`429`	`/// \brief Calculate delta correlations for an event`