3434import matplotlib .pyplot as plt
3535from sklearn .preprocessing import StandardScaler , MinMaxScaler
3636from sklearn .kernel_ridge import KernelRidge
37- from sklearn .metrics import mean_squared_error , accuracy_score , mean_absolute_error , r2_score
37+ from sklearn .cluster import KMeans
38+ from sklearn .metrics import mean_squared_error , accuracy_score , mean_absolute_error , r2_score , pairwise_distances
3839from liblibra_core import *
3940import libra_py .packages .cp2k .methods as CP2K_methods
4041import libra_py .packages .dftbplus .methods as DFTB_methods
@@ -517,6 +518,76 @@ def find_indices_inputs(params):
517518 return list (indices )
518519
519520
def read_trajectory_xyz_file(file_name: str, istep: int, fstep: int):
    """
    Read atom labels and Cartesian coordinates from a multi-frame .xyz trajectory file.

    Every frame is assumed to have the same layout as the first one: a line whose
    first token is the number of atoms, one more header line, and then one line
    per atom of the form ``label x y z``.

    Args:
        file_name (str): Path to the .xyz trajectory file.
        istep (int): Index of the first frame to read (frame 0 is the first frame in the file).
        fstep (int): Index of the last frame to read (inclusive).

    Returns:
        tuple: ``(labels, coords)`` where

            * labels (list of str): The first token of each atom line of the FIRST
              frame in the file (frame 0, regardless of ``istep``).
            * coords (numpy.ndarray): Coordinates of the requested frames. For a
              non-empty range the shape is ``(fstep - istep + 1, number_of_atoms, 3)``;
              if ``istep > fstep`` the array is empty.

    Raises:
        IndexError: If the file is empty, a requested frame lies beyond the end of
            the file, or an atom line has fewer than four tokens.
        ValueError: If the atom count or a coordinate cannot be parsed as a number.
    """
    # The context manager guarantees the handle is released even if reading fails.
    with open(file_name, 'r') as f:
        lines = f.readlines()

    # The number of atoms for each time step in the .xyz file of the trajectory.
    number_of_atoms = int(lines[0].split()[0])

    # Each frame occupies the atom lines plus its two header lines.
    n = number_of_atoms + 2

    # Collect the coordinates of every requested time step.
    coords = []
    for step in range(istep, fstep + 1):
        coord = []
        # Skip the two header lines of this frame and read only its atom lines.
        for i in range(n * step + 2, n * (step + 1)):
            tmp = lines[i].split()
            coord.append([float(tmp[1]), float(tmp[2]), float(tmp[3])])
        coords.append(coord)
    coords = np.array(coords)

    # Atom labels are taken from the first frame of the file.
    labels = [lines[i].split()[0] for i in range(2, number_of_atoms + 2)]

    return labels, coords
553+
def rmsd(p1, p2):
    """
    Return the root-mean-square of the element-wise differences between two geometries.

    The value is the square root of the mean of the squared differences taken
    over all elements. No alignment (translation/rotation) of the geometries
    is performed here.

    Args:
        p1: First geometry (array-like supporting ``p1 - p2``).
        p2: Second geometry, compatible in shape with ``p1``.

    Returns:
        The root-mean-square deviation as a NumPy floating-point scalar.
    """
    delta = p1 - p2
    mean_square = np.mean(np.square(delta))
    return np.sqrt(mean_square)
559+
560+
def find_kmeans_indices(trajectory_file, istep, fstep, ncluster=10, random_state=0):
    """
    Select representative geometries from a trajectory using K-means clustering.

    The frames ``istep`` .. ``fstep`` (inclusive) are read from the .xyz trajectory,
    each frame is flattened into a single vector, and the pairwise distance matrix
    between frames is computed with the ``rmsd`` metric. K-means is then fitted on
    the rows of that distance matrix, and the lowest-index member of every
    cluster is taken as the representative of that cluster.

    Args:
        trajectory_file (str): Path to the .xyz trajectory file.
        istep (int): Index of the first frame to read.
        fstep (int): Index of the last frame to read (inclusive).
        ncluster (int): Number of clusters requested from K-means. Default: 10.
        random_state (int): Seed passed to ``KMeans``. Default: 0.

    Returns:
        list: Sorted indices of the selected geometries. The indices are positions
        within the frames that were read, i.e. index 0 corresponds to frame ``istep``.
        The list may be shorter than ``ncluster`` if K-means leaves a cluster empty.

    NOTE(review): callers that need absolute trajectory steps presumably have to
    add ``istep`` to the returned values - confirm against the call sites.
    """
    # Read the XYZ trajectory file (the atom labels are not needed here)
    t1 = time.time()
    _, coords = read_trajectory_xyz_file(trajectory_file, istep, fstep)
    print('Finished reading trajectory file: ', time.time()-t1)

    # Vectorize the coordinates: one row per frame
    flattened_coords = coords.reshape(coords.shape[0], -1)

    # A Python callable is used as the metric, so it is evaluated once per pair of frames.
    t1 = time.time()
    rmsd_matrix = pairwise_distances(flattened_coords, metric=rmsd)
    print('Finished computing the distance matrix with RMSD metric: ', time.time()-t1)

    # Do the K-means clustering
    t1 = time.time()
    kmeans = KMeans(n_clusters=ncluster, random_state=random_state).fit(rmsd_matrix)
    print(f'Finished clustering for ncluster={ncluster} : ', time.time()-t1)
    clusters = kmeans.labels_

    indices = []
    for cluster_id in range(ncluster):
        cluster_members = np.flatnonzero(clusters == cluster_id)
        # K-means may leave a cluster without members (e.g. fewer distinct frames
        # than clusters); skip it instead of failing on an empty array.
        if cluster_members.size == 0:
            continue
        # Select the first (lowest-index) member of the cluster as representative
        indices.append(cluster_members.min())

    # Sort the indices
    indices = sorted(indices)
    # Print the geometries indices
    print("Selected geometries indices are:", indices)

    return indices
589+
590+
520591def rebuild_matrix_from_partitions (params , partitions , output_shape ):
521592 """
522593 This function is one of the most important here. It will
@@ -598,24 +669,43 @@ def compute_properties(params, models, input_scalers, output_scalers):
598669 for i , step in enumerate (indices ):
599670 print ("======================== \n Performing calculations for step " , step )
600671 print ("*** Generating guess Hamiltonian for step " , step )
672+ tt = time .time ()
601673 generate_data .gen_data (data_gen_params , step )
674+ print ('data generation time:' , time .time ()- tt , ' seconds' )
602675 input_mat = np .load (f'{ params ["path_to_input_mats" ]} /{ params ["prefix" ]} _{ params ["input_property" ]} _{ step } .npy' )
603676 if i == 0 :
604677 ref_mat_files = glob .glob (f'{ params ["path_to_output_mats" ]} /{ params ["prefix" ]} _ref_{ params ["output_property" ]} _*.npy' )
605678 #output_mat = np.load(f'{params["path_to_output_mats"]}/{params["prefix"]}_ref_{params["output_property"]}_{step}.npy')
606679 output_mat = np .load (ref_mat_files [0 ])
607680 params ["input_partition" ] = True
681+ tt = time .time ()
608682 partitioned_input = partition_matrix (params , input_mat )
683+ print ('input partitioning time:' , time .time ()- tt , ' seconds' )
609684 # Now apply the models to each partition
685+ tt = time .time ()
610686 outputs = []
611687 for j in range (len (input_scalers )):
612688 input_scaled = input_scalers [j ].transform (np .array (partitioned_input [j ]).reshape (1 ,- 1 ))
613689 output_scaled = models [j ].predict (input_scaled )#.reshape(1,-1))
614690 output = output_scalers [j ].inverse_transform (output_scaled )
615691 outputs .append (output .reshape (output .shape [1 ]))
692+ print ('scaling data time:' , time .time ()- tt , ' seconds' )
693+ tt = time .time ()
616694 ks_ham_mat = rebuild_matrix_from_partitions (params , outputs , output_mat .shape )
695+ print ('rebuilding matrix from partitions time:' , time .time ()- tt , ' seconds' )
696+ tt = time .time ()
617697 atomic_overlap = compute_atomic_orbital_overlap_matrix (params , step )
698+ print ('atomic orbital overlap calculation time:' , time .time ()- tt , ' seconds' )
699+ tt = time .time ()
700+ #os.environ['OMP_NUM_THREADS'] = '%d'%params['nprocs']
701+ #print(type(ks_ham_mat))
702+ #print(type(atomic_overlap))
703+ #np.save('k.npy', ks_ham_mat)
704+ #np.save('s.npy', atomic_overlap)
618705 eigenvalues , eigenvectors = CP2K_methods .compute_energies_coeffs (ks_ham_mat , atomic_overlap )
706+ #eigenvalues, eigenvectors = CP2K_methods.compute_energies_coeffs_scipy(ks_ham_mat, atomic_overlap)
707+ #os.environ['OMP_NUM_THREADS'] = '1'
708+ print ('diagonalizing the KS Hamiltonian matrix time:' , time .time ()- tt , ' seconds' )
619709 if params ["do_error_analysis" ]:
620710 if not os .path .exists ("../error_data" ):
621711 os .system (f"mkdir ../error_data" )
@@ -625,6 +715,7 @@ def compute_properties(params, models, input_scalers, output_scalers):
625715 try :
626716 ks_ham_mat_ref = np .load (f'{ params ["path_to_output_mats" ]} /{ params ["prefix" ]} _ref_{ params ["output_property" ]} _{ step } .npy' )
627717 eigenvalues_ref , eigenvectors_ref = CP2K_methods .compute_energies_coeffs (ks_ham_mat_ref , atomic_overlap )
718+ #eigenvalues_ref, eigenvectors_ref = CP2K_methods.compute_energies_coeffs_scipy(ks_ham_mat_ref, atomic_overlap)
628719 # We only save the eigenvalues but not the eigenvectors of the reference calculations
629720 # The first reason is because we want to plot them and then we'll do the error analysis of all
630721 # molecular orbitals. The second reason is that we compute the \epsilon_i=<\psi_{i_{ref}}|\psi_{i_{ml}}> for
@@ -635,6 +726,7 @@ def compute_properties(params, models, input_scalers, output_scalers):
635726 os .system (f"mkdir { params ['path_to_save_ref_mos' ]} " )
636727 if params ["save_ref_eigenvalues" ]:
637728 np .save (f"{ params ['path_to_save_ref_mos' ]} /E_ref_{ step } .npy" , eigenvalues_ref ) # [lowest_orbital-1:highest_orbital])
729+ np .save (f"{ params ['path_to_save_ref_mos' ]} /E_ml_{ step } .npy" , eigenvalues ) # [lowest_orbital-1:highest_orbital])
638730 if params ["save_ref_eigenvectors" ]:
639731 np .save (f"{ params ['path_to_save_ref_mos' ]} /mos_ref_{ step } .npy" , eigenvectors_ref ) #[lowest_orbital-1:highest_orbital,:][:,lowest_orbital-1:highest_orbital])
640732 ml_ref_overlap = compute_mo_overlaps (params , eigenvectors_ref , eigenvectors , step , step ) #[lowest_orbital-1:highest_orbital,:][:,lowest_orbital-1:highest_orbital]
@@ -713,10 +805,10 @@ def compute_properties(params, models, input_scalers, output_scalers):
713805 #if params["compute_total_energy"]:
714806 # we have to make a cp2k input file based on the reference input
715807 # and then run it. Then since the files are large we need to remove them
716- os .system ("rm *.log *.npy *.wfn* *.inp *.xyz" )
808+ # os.system("rm *.log *.npy *.wfn* *.inp *.xyz")
717809 os .system ("mkdir ../ml_total_energy" )
718810 os .system ("mv output*.out ../ml_total_energy/." )
719- os .system ("rm *.out" )
811+ # os.system("rm *.out")
720812 #os.chdir("../")
721813 #os.system(f"rm -rf tmp_guess_ham_{params['job']}")
722814
0 commit comments