Merge branch 'devel' of https://github.com/Quantum-Dynamics-Hub/libra-code into devel

alexvakimov · alexvakimov · commit 4fb52eccc19c · 2024-08-06T01:46:20.000-04:00
diff --git a/src/libra_py/packages/cp2k/methods.py b/src/libra_py/packages/cp2k/methods.py
@@ -23,6 +23,7 @@
 import re
 import numpy as np
 import scipy.sparse as sp
+import scipy.linalg
 import time
 import glob 
 from libra_py.workflows.nbra import step2_many_body
@@ -2126,6 +2127,36 @@ def compute_energies_coeffs(ks_mat, overlap):
     eigenvectors = eigenvectors[:,sorted_indices].T
     
     
+    return eigenvalues[sorted_indices], eigenvectors
+
+
+def compute_energies_coeffs_scipy(ks_mat, overlap):
+    """
+    This function solves the general eigenvalue problem K C = S C E with scipy.linalg.eigh,
+    which handles the overlap matrix itself, so no explicit matrix inverse is formed.
+    The eigenvalues are real and sorted in ascending order.
+    More information: https://doi.org/10.1016/j.cpc.2004.12.014
+    Args:
+        ks_mat (numpy array): The Kohn-Sham matrix. It is treated as symmetric (Hermitian):
+                              eigh reads only one of its triangles.
+        overlap (numpy array): The atomic orbital overlap matrix (must be positive definite)
+    Returns:
+        eigenvalues (numpy array): The energies (eigenvalues)
+        eigenvectors (numpy array): The MO coefficients, one orbital per row, normalized
+                                    such that C S C^T is the identity matrix
+    Raises:
+        scipy.linalg.LinAlgError: If the overlap is not positive definite or the solver fails
+    """
+    # scipy.linalg.cholesky returns the upper triangular factor by default, while
+    # np.linalg.cholesky returns the lower one, so a NumPy-style recipe built on the
+    # transposed SciPy factor transforms the KS matrix with the wrong matrix. The generalized
+    # symmetric solver avoids this and, unlike scipy.linalg.eig, returns real eigenvalues.
+    eigenvalues, eigenvectors = scipy.linalg.eigh( ks_mat, overlap )
+    # eigh already returns ascending eigenvalues; the indices are needed by the return below
+    sorted_indices = np.argsort(eigenvalues) 
+    eigenvectors = eigenvectors[:,sorted_indices].T
+    
+    
     return eigenvalues[sorted_indices], eigenvectors
 
 def compute_density_matrix(eigenvectors, homo_index):
diff --git a/src/libra_py/packages/dftbplus/methods.py b/src/libra_py/packages/dftbplus/methods.py
@@ -685,7 +685,7 @@ def dftb_distribute( istep, fstep, nsteps_this_job, trajectory_xyz_file, dftb_in
     for step in range( nsteps_this_job ):
 
         # extract the curr_step xyz coordianates from the trajectory file and write it to another xyz file
-        CP2K_methods.read_trajectory_xyz_file( trajectory_xyz_file, curr_step )
+        _, _ = CP2K_methods.read_trajectory_xyz_file( trajectory_xyz_file, curr_step )
         curr_step += 1
 
     # Go back to the main directory
diff --git a/src/libra_py/packages/gaussian/methods.py b/src/libra_py/packages/gaussian/methods.py
@@ -289,7 +289,7 @@ def gaussian_distribute( istep, fstep, nsteps_this_job, trajectory_xyz_file, gau
     for step in range( nsteps_this_job ):
 
         # Extract the coordinates and write them to a xyz file
-        CP2K_methods.read_trajectory_xyz_file( trajectory_xyz_file, curr_step )
+        _, _ = CP2K_methods.read_trajectory_xyz_file( trajectory_xyz_file, curr_step )
 
         # Now, we need to edit the gaussian_input file by adding the 
         # coordinates to the input file
diff --git a/src/libra_py/workflows/nbra/generate_data.py b/src/libra_py/workflows/nbra/generate_data.py
@@ -46,7 +46,7 @@ def make_input(prefix, input_template, software, trajectory_xyz_file, step):
     lines_input = f.readlines()
     f.close()
 
-    CP2K_methods.read_trajectory_xyz_file(trajectory_xyz_file, step)
+    _, _ = CP2K_methods.read_trajectory_xyz_file(trajectory_xyz_file, step)
 
     if software.lower()=="cp2k":
         f = open(F"input_{prefix}_{step}.inp", "w")
diff --git a/src/libra_py/workflows/nbra/ml_map.py b/src/libra_py/workflows/nbra/ml_map.py
@@ -34,7 +34,8 @@
 import matplotlib.pyplot as plt
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.kernel_ridge import KernelRidge
-from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score
+from sklearn.cluster import KMeans
+from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score, pairwise_distances
 from liblibra_core import *
 import libra_py.packages.cp2k.methods as CP2K_methods
 import libra_py.packages.dftbplus.methods as DFTB_methods
@@ -517,6 +518,76 @@ def find_indices_inputs(params):
     return list(indices)
 
 
+def read_trajectory_xyz_file(file_name: str, istep: int, fstep: int):
+    """
+    Read the coordinates of the frames istep..fstep (inclusive) from an xyz trajectory file.
+
+    Every frame is assumed to have as many atoms as the first one and to consist of
+    two header lines followed by one `label x y z` line per atom.
+    Args:
+        file_name (string): The path to the xyz trajectory file
+        istep (integer): The index of the first frame to read (0-based)
+        fstep (integer): The index of the last frame to read (included)
+    Returns:
+        labels (list): The atomic labels taken from the first frame of the file
+        coords (numpy array): The coordinates with shape (fstep-istep+1, natoms, 3)
+    Raises:
+        IndexError: If a requested frame is not completely present in the file
+    """
+    # The context manager closes the file even when reading fails
+    with open(file_name,'r') as f:
+        lines = f.readlines()
+    # The number of atoms for each time step in the .xyz file of the trajectory.
+    number_of_atoms = int(lines[0].split()[0])
+    # Each frame occupies the atom lines plus its two header lines.
+    n = number_of_atoms+2
+    if istep <= fstep and n * ( fstep + 1 ) > len(lines):
+        raise IndexError(f"steps {istep}-{fstep} are out of range for {file_name}")
+    coords = []
+    for step in range(istep, fstep+1):
+        atoms = [lines[ i ].split() for i in range( n * step + 2, n * ( step + 1 ) )]
+        coords.append([[float( t[1] ), float( t[2] ), float( t[3] )] for t in atoms])
+    coords = np.array(coords)
+    labels = [lines[i].split()[0] for i in range(2, n)]
+    return labels, coords
+
+def rmsd(p1, p2):
+    """
+    Root of the mean, taken over all array elements, of the squared difference p1 - p2
+    """
+    return np.sqrt(np.square(p1 - p2).mean())
+
+
+def find_kmeans_indices(trajectory_file, istep, fstep, ncluster=10, random_state=0):
+    """
+    Pick one representative geometry per K-means cluster of a trajectory segment.
+    The frames istep..fstep are read with read_trajectory_xyz_file, the matrix of
+    pairwise rmsd values is built, and its rows are used as the K-means features.
+    Args:
+        trajectory_file (string): The path to the xyz trajectory file
+        istep (integer): The first frame to read
+        fstep (integer): The last frame to read
+        ncluster (integer): The number of clusters
+        random_state (integer): The seed passed to KMeans
+    Returns:
+        indices (list): Sorted smallest member of each cluster, as positions in the frames read
+    """
+    start = time.time()
+    labels, coords = read_trajectory_xyz_file(trajectory_file, istep, fstep)
+    print('Finished reading trajectory file: ', time.time()-start)
+    # One row per frame with all of its Cartesian components flattened into a vector
+    frame_vectors = coords.reshape(coords.shape[0], -1)
+    start = time.time()
+    rmsd_matrix = pairwise_distances(frame_vectors, metric=rmsd)
+    print('Finished computing the distance matrix with RMSD metric: ', time.time()-start)
+    start = time.time()
+    clusters = KMeans(n_clusters=ncluster, random_state=random_state).fit(rmsd_matrix).labels_
+    print(f'Finished clustering for ncluster={ncluster}: ', time.time()-start)
+    indices = list(np.sort([np.sort(np.flatnonzero(clusters == k))[0] for k in range(ncluster)]))
+    print("Selected geometries indices are:", indices)
+    return indices
+
+
 def rebuild_matrix_from_partitions(params, partitions, output_shape):
     """
     This function is one of the most important here. It will
@@ -598,24 +669,43 @@ def compute_properties(params, models, input_scalers, output_scalers):
     for i, step in enumerate(indices):
         print("======================== \n Performing calculations for step ", step)
         print("*** Generating guess Hamiltonian for step ", step)
+        tt = time.time()
         generate_data.gen_data(data_gen_params, step) 
+        print('data generation time:', time.time()-tt, ' seconds')
         input_mat = np.load(f'{params["path_to_input_mats"]}/{params["prefix"]}_{params["input_property"]}_{step}.npy')
         if i==0: 
             ref_mat_files = glob.glob(f'{params["path_to_output_mats"]}/{params["prefix"]}_ref_{params["output_property"]}_*.npy')
             #output_mat = np.load(f'{params["path_to_output_mats"]}/{params["prefix"]}_ref_{params["output_property"]}_{step}.npy')
             output_mat = np.load(ref_mat_files[0])
         params["input_partition"] = True
+        tt = time.time()
         partitioned_input = partition_matrix(params, input_mat)
+        print('input partitioning time:', time.time()-tt, ' seconds')
         # Now apply the models to each partition
+        tt = time.time()
         outputs = []
         for j in range(len(input_scalers)):
             input_scaled = input_scalers[j].transform(np.array(partitioned_input[j]).reshape(1,-1))
             output_scaled = models[j].predict(input_scaled)#.reshape(1,-1))
             output = output_scalers[j].inverse_transform(output_scaled)
             outputs.append(output.reshape(output.shape[1]))
+        print('scaling data time:', time.time()-tt, ' seconds')
+        tt = time.time()
         ks_ham_mat = rebuild_matrix_from_partitions(params, outputs, output_mat.shape)
+        print('rebuilding matrix from partitions time:', time.time()-tt, ' seconds')
+        tt = time.time()
         atomic_overlap = compute_atomic_orbital_overlap_matrix(params, step)
+        print('atomic orbital overlap calculation time:', time.time()-tt, ' seconds')
+        tt = time.time()
+        #os.environ['OMP_NUM_THREADS'] = '%d'%params['nprocs']
+        #print(type(ks_ham_mat))
+        #print(type(atomic_overlap))
+        #np.save('k.npy', ks_ham_mat)
+        #np.save('s.npy', atomic_overlap)
         eigenvalues, eigenvectors = CP2K_methods.compute_energies_coeffs(ks_ham_mat, atomic_overlap)
+        #eigenvalues, eigenvectors = CP2K_methods.compute_energies_coeffs_scipy(ks_ham_mat, atomic_overlap)
+        #os.environ['OMP_NUM_THREADS'] = '1'
+        print('diagonalizing the KS Hamiltonian matrix time:', time.time()-tt, ' seconds')
         if params["do_error_analysis"]:
             if not os.path.exists("../error_data"):
                 os.system(f"mkdir ../error_data")
@@ -625,6 +715,7 @@ def compute_properties(params, models, input_scalers, output_scalers):
             try:
                 ks_ham_mat_ref = np.load(f'{params["path_to_output_mats"]}/{params["prefix"]}_ref_{params["output_property"]}_{step}.npy')
                 eigenvalues_ref, eigenvectors_ref = CP2K_methods.compute_energies_coeffs(ks_ham_mat_ref, atomic_overlap)
+                #eigenvalues_ref, eigenvectors_ref = CP2K_methods.compute_energies_coeffs_scipy(ks_ham_mat_ref, atomic_overlap)
                 # We only save the eigenvalues but not the eigenvectors of the reference calculations
                 # The first reason is because we want to plot them and then we'll do the error analysis of all
                 # molecular orbitals. The second reason is that we compute the \epsilon_i=<\psi_{i_{ref}}|\psi_{i_{ml}}> for
@@ -635,6 +726,7 @@ def compute_properties(params, models, input_scalers, output_scalers):
                         os.system(f"mkdir {params['path_to_save_ref_mos']}")
                 if params["save_ref_eigenvalues"]:
                     np.save(f"{params['path_to_save_ref_mos']}/E_ref_{step}.npy", eigenvalues_ref) # [lowest_orbital-1:highest_orbital])
+                    np.save(f"{params['path_to_save_ref_mos']}/E_ml_{step}.npy", eigenvalues) # [lowest_orbital-1:highest_orbital]) 
                 if params["save_ref_eigenvectors"]:
                     np.save(f"{params['path_to_save_ref_mos']}/mos_ref_{step}.npy", eigenvectors_ref) #[lowest_orbital-1:highest_orbital,:][:,lowest_orbital-1:highest_orbital])
                 ml_ref_overlap = compute_mo_overlaps(params, eigenvectors_ref, eigenvectors, step, step) #[lowest_orbital-1:highest_orbital,:][:,lowest_orbital-1:highest_orbital]
@@ -713,10 +805,10 @@ def compute_properties(params, models, input_scalers, output_scalers):
             #if params["compute_total_energy"]:
             # we have to make a cp2k input file based on the reference input
             # and then run it. Then since the files are large we need to remove them
-    os.system("rm *.log *.npy *.wfn* *.inp *.xyz")
+    #os.system("rm *.log *.npy *.wfn* *.inp *.xyz")
     os.system("mkdir ../ml_total_energy")
     os.system("mv output*.out ../ml_total_energy/.")
-    os.system("rm *.out")
+    #os.system("rm *.out")
     #os.chdir("../")
     #os.system(f"rm -rf tmp_guess_ham_{params['job']}")
 
diff --git a/src/libra_py/workflows/nbra/step2.py b/src/libra_py/workflows/nbra/step2.py
@@ -395,7 +395,7 @@ def run_cp2k_libint_step2(params):
 
     # Now try to get parameters from the input
     critical_params = [ "cp2k_ot_input_template", "cp2k_diag_input_template", "trajectory_xyz_filename" ]
-    default_params = {"res_dir": os.getcwd() + "/res", "all_logfiles": os.getcwd() + "/all_logfiles", "all_pdosfiles": os.getcwd() + "/all_pdosfiles", "all_images": os.getcwd() + "/all_images", "image_format": 'bmp', "istep": 0, "fstep": 2, "lowest_orbital": 1, "highest_orbital": 2, "is_spherical": True, "isXTB": False, "isUKS": False, "remove_molden": True, "nprocs": 2, "cp2k_exe": "cp2k.psmp", "mpi_executable": "mpirun", "cube_visualization": False, "vmd_input_template": "vmd.tcl", "states_to_plot": [1], "plot_phase_corrected": True, "vmd_exe": "vmd", "tachyon_exe": "tachyon_LINIXAMD64", "x_pixels": 1024, "y_pixels": 1024, "remove_cube": True, 'together_mode': False}
+    default_params = {"res_dir": os.getcwd() + "/res", "all_logfiles": os.getcwd() + "/all_logfiles", "all_pdosfiles": os.getcwd() + "/all_pdosfiles", "all_images": os.getcwd() + "/all_images", "image_format": 'bmp', "istep": 0, "fstep": 2, "lowest_orbital": 1, "highest_orbital": 2, "is_spherical": True, "isXTB": False, "isUKS": False, "remove_molden": True, "nprocs": 2, "cp2k_exe": "cp2k.psmp", "mpi_executable": "mpirun", "cube_visualization": False, "vmd_input_template": "vmd.tcl", "states_to_plot": [1], "plot_phase_corrected": True, "vmd_exe": "vmd", "tachyon_exe": "tachyon_LINIXAMD64", "x_pixels": 1024, "y_pixels": 1024, "remove_cube": True, 'together_mode': False, 'restart_file': True}
     comn.check_input(params, default_params, critical_params)
 
 
diff --git a/src/libra_py/workflows/nbra/step3.py b/src/libra_py/workflows/nbra/step3.py
@@ -2545,11 +2545,12 @@ def run_step3_sd_nacs_libint(params):
             # Since we have performed state-reordering we need to 
             # convert to scipy npz format now
             t2 = time.time()
-            for i in range(len(St_sds_cmatrix)):
+            for i in range(len(St_sds_cmatrix)-1):
                 St_sds[i] = data_conv.MATRIX2scipynpz( St_sds_cmatrix[i].real() )
-                sd2ci = SD2CI[i]
+                sd2ci_prev = SD2CI[i]
+                sd2ci_curr = SD2CI[i+1]
                 # Compute the St_ci
-                St_ci = np.linalg.multi_dot([sd2ci.T, St_sds[i].todense().real, sd2ci])
+                St_ci = np.linalg.multi_dot([sd2ci_prev.T, St_sds[i].todense().real, sd2ci_curr])
                 St_cis.append(sp.csc_matrix(St_ci))
 
             # Now we need to apply state-reordering to St_cis
@@ -2580,10 +2581,11 @@ def run_step3_sd_nacs_libint(params):
                 St_cis[i] = data_conv.MATRIX2scipynpz( St_cis_cmatrix[i].real() )
     
         else:
-            for i in range(len(St_sds)):
-                sd2ci = SD2CI[i]
+            for i in range(len(St_sds)-1):
+                sd2ci_prev = SD2CI[i]
+                sd2ci_curr = SD2CI[i+1]
                 # Compute the St_ci
-                St_ci = np.linalg.multi_dot([sd2ci.T, St_sds[i].todense().real, sd2ci])
+                St_ci = np.linalg.multi_dot([sd2ci_prev.T, St_sds[i].todense().real, sd2ci_curr])
                 St_cis.append(sp.csc_matrix(St_ci))
                 sp.save_npz(F'{params["path_to_save_sd_Hvibs"]}/St_ci_{step+start_time}_re.npz', sp.csc_matrix(St_ci))