add averaged conditioning

geraseva · geraseva · commit dd5aefb008e2 · 2025-10-07T22:37:17.000+03:00
diff --git a/rfdiffusion/inference/model_runners.py b/rfdiffusion/inference/model_runners.py
@@ -428,12 +428,14 @@ def sample_init(self, return_forward_trajectory=False):
                 het_names = np.array([i['name'].strip() for i in self.target_feats['info_het']])
                 xyz_het = self.target_feats['xyz_het'][het_names == self._conf.potentials.substrate]
                 xyz_het = torch.from_numpy(xyz_het)
+                info_het={x: self.target_feats['info_het'][x][het_names == self._conf.potentials.substrate] for x in self.target_feats['info_het']}
                 assert xyz_het.shape[0] > 0, f'expected >0 heteroatoms from ligand with name {self._conf.potentials.substrate}'
                 xyz_motif_prealign = xyz_motif_prealign[0,0][self.diffusion_mask.squeeze()]
                 motif_prealign_com = xyz_motif_prealign[:,1].mean(dim=0)
                 xyz_het_com = xyz_het.mean(dim=0)
                 for pot in self.potential_manager.potentials_to_apply:
                     pot.motif_substrate_atoms = xyz_het
+                    pot.substrate_info = info_het
                     pot.diffusion_mask = self.diffusion_mask.squeeze()
                     pot.xyz_motif = xyz_motif_prealign
                     pot.diffuser = self.diffuser
@@ -444,9 +446,9 @@ def sample_init(self, return_forward_trajectory=False):
 
         if self.potential_conf.guiding_potentials is not None:
             if any(list(filter(lambda x: "na_" in x, self.potential_conf.guiding_potentials))):
-                assert len(self.target_feats['xyz_na']) > 0, "If you're using the NA Contact potential, \
+                assert len(self.target_feats['na_xyz']) > 0, "If you're using the NA Contact potential, \
                         you need to make sure there's a NA in the input_pdb file!"
-                info_na = self.target_feats["na_info"],
+                info_na = self.target_feats["na_info"]
                 xyz_het = self.target_feats['na_xyz']
                 xyz_het = torch.from_numpy(xyz_het)
                 xyz_motif_prealign = xyz_motif_prealign[0,0][self.diffusion_mask.squeeze()]
diff --git a/rfdiffusion/inference/utils.py b/rfdiffusion/inference/utils.py
@@ -122,6 +122,28 @@ def get_mu_xt_x0(xt, px0, t, beta_schedule, alphabar_schedule, eps=1e-6):
     return mu, sigma
 
 
+def rigid_rotation_from_grads(Cas, Ca_grads, eps=1e-8):  # fit a rigid-body (translation + rotation) field to per-point grads
+    center=Cas.mean(dim=0) # (3,) centroid of the input points
+    r=Cas-center # (L,3) positions relative to the centroid
+    # translational component = mean gradient; keepdim keeps it 2-D so it broadcasts against (L,3)
+    trans=Ca_grads.mean(dim=0, keepdim=True) # (1,3)
+    d=Ca_grads-trans # (L,3) residual gradients once the mean is removed
+    eye=torch.eye(3, device=Cas.device, dtype=Cas.dtype)
+    r2=(r**2).sum(dim=1) # (L,) squared distance of each point from the centroid
+    rrT=r[:,:,None]*r[:,None,:] # (L,3,3) outer product r r^T per point
+    I=(r2[:, None, None] * eye[None, :, :] - rrT).sum(dim=0) # (3,3) sum of (|r|^2 * Id - r r^T): unit-mass inertia tensor
+    tau=torch.cross(r,d, dim=1).sum(dim=0) # (3,) summed moment r x d of the residual gradients
+    try:
+        omega = torch.linalg.solve(I + eps*eye, tau) # (3,) solves (I + eps*Id) omega = tau; eps*Id regularises the diagonal
+    except RuntimeError:
+        omega = torch.linalg.lstsq(I + eps*eye, tau.unsqueeze(-1)).solution.squeeze(-1) # least-squares fallback if solve fails
+    # rotational component: omega x r evaluated at every point
+    rot = torch.cross(omega.unsqueeze(0).expand_as(r), r, dim=1) #(L,3)
+    # returns: trans (1,3), omega (3,), center (3,), rot (L,3); trans + rot is the rigid-body field
+    return trans, omega, center, rot
+
+
+
 def get_next_ca(
     xt,
     px0,
@@ -392,7 +414,11 @@ def get_potential_gradients(self, xyz, diffusion_mask, t, predicted=False):
 
         # Since we are not moving frames, Cb grads are same as Ca grads
         # Need access to calculated Cb coordinates to be able to get Cb grads though
-        Ca_grads = xyz.grad[:, 1, :]
+        if  xyz.grad is None:
+            print("WARNING: NaN in potential gradients, replacing with zero grad.")
+            Ca_grads=torch.zeros_like(xyz[:, 1, :])
+        else:
+            Ca_grads = xyz.grad[:, 1, :]
 
         if not diffusion_mask == None:
             Ca_grads[diffusion_mask, :] = 0
@@ -406,6 +432,20 @@ def get_potential_gradients(self, xyz, diffusion_mask, t, predicted=False):
             print("WARNING: NaN in potential gradients, replacing with zero grad.")
             Ca_grads[:] = 0
 
+        # smooth potential effects within protein subunits
+        smooth_scale=max([potential.smooth for potential in self.potential_manager.potentials_to_apply])
+        if smooth_scale>0:
+            Cas=xyz[:, 1, :]
+            binderlen=self.potential_manager.binderlen
+            if binderlen<0:
+                borders=[(0,Ca_grads.shape[0])]
+            else:
+                borders=[(0,binderlen),(binderlen,Ca_grads.shape[0])]
+            for a, b in borders:
+                with torch.no_grad():
+                    trans, omega, center, rot = rigid_rotation_from_grads(Cas[a:b],Ca_grads[a:b])             
+                    Ca_grads[a:b]=Ca_grads[a:b]*(1-smooth_scale)+(trans+rot)*smooth_scale
+
         return Ca_grads
 
     def get_next_pose(
@@ -620,38 +660,40 @@ def parse_pdb_lines(lines, parse_hetatom=False, parse_na=False, ignore_het_h=Tru
         res, pdb_idx = [],[]
         for l in lines:
             if l[:4] == "ATOM" and l[12:16].strip() == "C1'":
-                res.append((l[22:26], l[17:20]))
+                res.append((l[22:26], l[17:20].strip()))
                 # chain letter, res num
                 pdb_idx.append((l[21:22].strip(), int(l[22:26].strip())))
-        seq = [util.na2num[r[1]] if r[1] in util.na2num.keys() else 20 for r in res]
+        seq = [util.na2num[r[1]] if r[1] in util.na2num else 20 for r in res]
         pdb_idx = [
             (l[21:22].strip(), int(l[22:26].strip()))
             for l in lines
             if l[:4] == "ATOM" and l[12:16].strip() == "C1'"
         ]  # chain letter, res num
 
-        # 4 BB + up to 10 SC atoms
+        # 3 BB + up to 20 SC atoms
         xyz = np.full((len(res), 23, 3), np.nan, dtype=np.float32)
-        xyz_names = np.full((len(res), 23, 3), np.nan)
+        atom_id = np.full((len(res), 23), np.nan, dtype=np.object)
+        atom_type = np.full((len(res), 23), np.nan, dtype=np.object)
         for l in lines:
             if l[:4] != "ATOM":
                 continue
             chain, resNo, atom, aa = (
                 l[21:22],
                 int(l[22:26]),
                 " " + l[12:16].strip().ljust(3),
-                l[17:20],
+                l[17:20].strip(),
             )
             if (chain,resNo) in pdb_idx:
                 idx = pdb_idx.index((chain, resNo))
                 for i_atm, tgtatm in enumerate(
-                    util.na2long[util.na2num[aa]][:14]
+                    util.na2long[util.na2num[aa]][:23]
                     ):
                     if (
                         tgtatm is not None and tgtatm.strip() == atom.strip()
                         ):  # ignore whitespace
                         xyz[idx, i_atm, :] = [float(l[30:38]), float(l[38:46]), float(l[46:54])]
-                        xyz_names[idx, i_atm, :] = l[16:20]
+                        atom_id[idx, i_atm] = atom
+                        atom_type[idx, i_atm] = l[77]
                         break
 
         # save atom mask
@@ -674,7 +716,8 @@ def parse_pdb_lines(lines, parse_hetatom=False, parse_na=False, ignore_het_h=Tru
 
         out["na_xyz"]= xyz  # cartesian coordinates, [Lx23]
         out["na_mask"]= mask  # mask showing which atoms are present in the PDB file, [Lx23]
-        out['na_atom_names']= xyz_names
+        out['na_atom_id']= atom_id
+        out['na_atom_type']= atom_type
         out["na_seq"]= np.array(seq)  # amino acid sequence, [L]
         out["na_pdb_idx"]= pdb_idx # list of (chain letter, residue number) in the pdb file, [L]
 
@@ -713,7 +756,8 @@ def process_target(pdb_path, parse_hetatom=False, parse_na=False, center=True):
     
     if parse_na:
         out['na_info']={'mask':target_struct["na_mask"],
-                        'atom_names':target_struct['na_atom_names'],
+                        'atom_id':target_struct['na_atom_id'],
+                        'atom_type':target_struct['na_atom_type'],
                         'seq':target_struct["na_seq"],
                         'pdb_idx':target_struct["na_pdb_idx"]}
         out["na_xyz"]= target_struct["na_xyz"]
diff --git a/rfdiffusion/potentials/manager.py b/rfdiffusion/potentials/manager.py
@@ -93,6 +93,7 @@ def __init__(self,
         self.potentials_config = potentials_config
         self.ppi_config        = ppi_config
         self.inference_config  = inference_config
+        self.binderlen=binderlen
 
         self.guide_scale = potentials_config.guide_scale
         self.guide_decay = potentials_config.guide_decay
diff --git a/rfdiffusion/potentials/potentials.py b/rfdiffusion/potentials/potentials.py
@@ -14,8 +14,9 @@ def __init__(self):
         self.sidechain=False
         self.current_substrate_atoms=None
         self.current_na_atoms=None
+        self.smooth=0
 
-    def compute(self, xyz):
+    def compute(self, xyz,**kwargs):
         '''
             Given the current structure of the model prediction, return the current
             potential as a PyTorch tensor with a single entry
@@ -434,7 +435,7 @@ def compute(self, xyz):
             assert abs(first_distance - second_distance) < 0.01, "Alignment seems to be bad" 
         
         if self.sidechain:
-            d=self.get_sidechains(xyz, self.seq, self.mask_seq)
+            d=self.get_sidechains(xyz, self.seq, self.mask_seq, substrate_atoms, self.substrate_info['atom_type'])
             dgram = torch.cdist(d['atom_xyz_p1'][None,...].contiguous(), 
                             substrate_atoms.float()[None].to(d['atom_xyz_p1'].device), p=2)[0] # [Lb,Lb]
         else:
@@ -506,14 +507,17 @@ def _grab_motif_residues(self, xyz) -> None:
 
 class na_contacts(substrate_contacts):
 
-    def __init__(self, weight=1, r_0=8, d_0=2, s=1, eps=1e-6, rep_r_0=5, rep_s=2, rep_r_min=1, sidechain=False):
+    def __init__(self, weight=1, r_0=8, d_0=2, s=1, eps=1e-6, rep_r_0=5, rep_s=2, rep_r_min=1, 
+                 sidechain=False, smooth=0, predicted=False):
 
         super().__init__()
         self.r_0       = r_0
         self.weight    = weight
         self.d_0       = d_0
         self.eps       = eps
         self.sidechain=sidechain
+        self.predicted=predicted
+        self.smooth=smooth
         
         self.motif_frame = None # [4,3] xyz coordinates from 4 atoms of input motif
         self.motif_mapping = None # list of tuples giving positions of above atoms in design [(resi, atom_idx)]
@@ -540,7 +544,7 @@ def __init__(self, weight=1, r_0=8, d_0=2, s=1, eps=1e-6, rep_r_0=5, rep_s=2, re
     def compute(self, xyz):
         
         if self.xyz_motif==None or self.xyz_motif.shape[0]<3:
-            substrate_atoms=(self.na_atoms-self.na_atoms[:11].view(-1,3).mean(dim=0)).detach()
+            substrate_atoms=(self.na_atoms-self.na_atoms[:,:11,:].mean(dim=(0,1))[None,None,:]).detach()
             
         else:
             self._grab_motif_residues(self.xyz_motif)
@@ -557,11 +561,12 @@ def compute(self, xyz):
 
         self.current_na_atoms = substrate_atoms.clone().detach()
         substrate_atoms=substrate_atoms.view(-1,3)
-        mask=self.na_info['mask'].view(-1,3)
+        mask=torch.from_numpy(self.na_info['mask']).view(-1)
         substrate_atoms=substrate_atoms[mask,:]
         
         if self.sidechain:
-            d=self.get_sidechains(xyz, self.seq, self.mask_seq)
+            aatypes=self.na_info['atom_type'].reshape(-1,3)[mask,:]
+            d=self.get_sidechains(xyz, self.seq, self.mask_seq, substrate_atoms, aatypes)
             dgram = torch.cdist(d['atom_xyz_p1'][None,...].contiguous(), 
                             substrate_atoms.float()[None].to(d['atom_xyz_p1'].device), p=2)[0] # [Lb,Lb]
         else:
@@ -635,8 +640,9 @@ def compute(self, xyz):
                            'interface_ncontacts':  interface_ncontacts,
                            'monomer_contacts':     monomer_contacts,
                            'olig_contacts':        olig_contacts,
-                           'substrate_contacts':    substrate_contacts,
-                           'dmasif_interactions':   dmasif_interactions}
+                           'substrate_contacts':   substrate_contacts,
+                           'na_contacts':          na_contacts,
+                           'dmasif_interactions':  dmasif_interactions}
 
 require_binderlen      = { 'binder_ROG',
                            'binder_distance_ReLU',
diff --git a/rfdiffusion/recover_sidechains.py b/rfdiffusion/recover_sidechains.py
@@ -218,12 +218,12 @@ def __init__(self, binderlen=-1, seq_model_type='protein_mpnn'):
 
         import LigandMPNN
         from LigandMPNN.model_utils import ProteinMPNN
-        from LigandMPNN.data_utils import restype_str_to_int, restype_1to3, alphabet
-
+        from LigandMPNN.data_utils import restype_str_to_int, restype_1to3, alphabet, featurize, element_list
         
         restype_3to1={restype_1to3[x]: x for x in restype_1to3.keys()}
         self.renumber_aa_mpnn2rf=torch.tensor([restype_str_to_int[restype_3to1.get(x,'X')] for x in num2aa], dtype=int)
         self.renumber_aa_rf2mpnn=torch.tensor([aa2num[aa_123.get(x,'UNK')] for x in alphabet], dtype=int)
+        self.element_dict = dict(zip(element_list, range(1, len(element_list))))
 
         path_to_LigandMPNN=LigandMPNN.__path__._path[0]
 
@@ -256,6 +256,7 @@ def __init__(self, binderlen=-1, seq_model_type='protein_mpnn'):
         self.seq_model.load_state_dict(seq_checkpoint["model_state_dict"])
         self.seq_model.to(self.device)
         self.seq_model.eval()
+        self.featurize=featurize
         print('Load LigandMPNN model')
 
         self.recover_sc=None
@@ -268,50 +269,64 @@ def init_recover_sc(self):
                                          device=self.device)
         self.recover_sc.eval()
 
-
-    def run_LigandMPNN(self, xyz, seq, seq_mask):
+    def run_LigandMPNN(self, xyz, seq, seq_mask, ligand_xyz=None, ligand_aatypes=None):
 
         L=xyz.shape[0]
 
         xyz=get_O_from_3_points(xyz)
 
-        feature_dict = {}
-        feature_dict["batch_size"]=1
+        input_dict = {}
+        
+        input_dict["X"] = xyz[:,:4,:] # L*4*3 (bb atoms)  ? normalize
+        input_dict["mask"] = torch.ones([ L]).to(self.device)
+
         if seq==None:
-            feature_dict["S"] = torch.full((1, L),20,dtype=int).to(self.device) # encoded sequence
-            feature_dict["chain_mask"] = torch.full((1, L),True,dtype=bool).to(self.device)
+            input_dict["S"] = torch.full(( L),20,dtype=int).to(self.device) # encoded sequence
+            input_dict["chain_mask"] = torch.full(( L),True,dtype=bool).to(self.device)
             raise AttributeError
         else:
-            feature_dict["S"]=seq[None,:,self.renumber_aa_rf2mpnn].argmax(-1).detach()
-            feature_dict["chain_mask"] = ~seq_mask
+            input_dict["S"]=seq[:,self.renumber_aa_rf2mpnn].argmax(-1).squeeze().detach()
+            input_dict["chain_mask"] = ~seq_mask.squeeze().detach()
+
+        if ligand_xyz==None:
+            input_dict["Y"] = torch.zeros([1, 3]).to(self.device)
+            input_dict["Y_t"] = torch.zeros([1]).to(self.device)
+            input_dict["Y_m"] = torch.zeros([1]).to(self.device)
+        else:
+            input_dict["Y"] = ligand_xyz.to(self.device).detach()
+            input_dict["Y_t"] = torch.tensor([self.element_dict.get(x,0) for x in ligand_aatypes], 
+                                             dtype=torch.int32, device=self.device)
+            input_dict["Y_m"] = torch.ones_like(input_dict["Y_t"])
+
+        input_dict["R_idx"] = torch.arange(L).to(self.device) # L resnums
+
+        if self.binderlen>0:
+            input_dict["chain_labels"] = torch.cat((torch.zeros((self.binderlen)),
+                                              torch.ones((L-self.binderlen))),0).to(self.device)  # L Chain indices
+        else:
+            input_dict["chain_labels"]=torch.zeros((L)).to(self.device)
 
-        feature_dict["X"] = xyz[None,:,:4,:] # B*L*4*3 (bb atoms)  ? normalize
+        feature_dict = self.featurize(input_dict,
+                                      number_of_ligand_atoms=(self.seq_model.features.atom_context_num 
+                                                              if self.seq_model.model_type=='ligand_mpnn' 
+                                                              else 1) ,
+                                      model_type=self.seq_model.model_type)
 
-        feature_dict["mask"] = torch.ones([1, L]).to(self.device)
+        feature_dict["batch_size"]=1
         feature_dict["temperature"] = 0.1
         feature_dict["bias"] = torch.zeros((1,L,21)).to(self.device)
         feature_dict["randn"]=torch.randn((1,L)).to(self.device)
         feature_dict["symmetry_residues"] = [[]]
         feature_dict["symmetry_weights"]=[[]]
-        feature_dict["Y"] = torch.zeros([1, L, 16, 3]).to(self.device)
-        feature_dict["Y_t"] = torch.zeros([1, L, 16]).to(self.device)
-        feature_dict["Y_m"] = torch.zeros([1, L, 16]).to(self.device)
 
-        feature_dict["R_idx"] = torch.arange(L)[None,:].to(self.device) # B*L resnums
-
-        if self.binderlen>0:
-            feature_dict["chain_labels"] = torch.cat((torch.zeros((1,self.binderlen)),
-                                              torch.ones((1,L-self.binderlen))),1).to(self.device)  # B*L Chain indices
-        else:
-            feature_dict["chain_labels"]=torch.zeros((1,L)).to(self.device)
-    
-        output_dict = self.seq_model.score(feature_dict, use_sequence=False)
+        output_dict = self.seq_model.score(feature_dict, use_sequence=True)
 
         return output_dict
 
-    def get_aa_probs(self, xyz, seq, seq_mask):
 
-        output_dict=self.run_LigandMPNN(xyz, seq, seq_mask)
+    def get_aa_probs(self, xyz, seq, seq_mask, ligand_xyz=None, ligand_aatypes=None):
+
+        output_dict=self.run_LigandMPNN(xyz, seq, seq_mask, ligand_xyz, ligand_aatypes)
         probs=torch.nn.functional.softmax(output_dict['logits'], dim=-1)
         probs=probs[0,:,self.renumber_aa_mpnn2rf]
         probs[seq_mask.squeeze()]=seq[seq_mask.squeeze()]
@@ -334,7 +349,7 @@ def bb2martini(self, xyz, seq):
         return self.recover_sc(feature_dict)
 
     
-    def __call__(self, xyz, seq=None, seq_mask=None):
+    def __call__(self, xyz, seq=None, seq_mask=None, ligand_xyz=None, ligand_aatypes=None):
 
         xyz=xyz.clone().to(self.device)
         
@@ -348,7 +363,7 @@ def __call__(self, xyz, seq=None, seq_mask=None):
         if self.recover_sc==None:
             self.init_recover_sc()
 
-        seq=self.get_aa_probs(xyz, seq, seq_mask)
+        seq=self.get_aa_probs(xyz, seq, seq_mask, ligand_xyz, ligand_aatypes)
 
         d=self.bb2martini(xyz, seq)
 
diff --git a/rfdiffusion/util.py b/rfdiffusion/util.py
@@ -422,17 +422,16 @@ def writena(
     atomscpu = atoms.cpu().squeeze()
     if bfacts is None:
         bfacts = torch.zeros(atomscpu.shape[0])
-    if idx_pdb is None:
-        idx_pdb = 1 + torch.arange(atomscpu.shape[0])
 
     Bfacts = torch.clamp(bfacts.cpu(), 0, 1)
     for i, s in enumerate(info['pdb_idx']):
         chain=s[0]
         idx_pdb=s[1]
         s=info['seq'][i]
-        for j, atm_j in enumerate(info['atom_names'][i]):
+        atms = na2long[s][:23]
+        for j, atm_j in enumerate(atms):
             if (
-                j < sum(info['mask'][i]) and atm_j is not None
+                info['mask'][i][j]>0
             ):  # and not torch.isnan(atomscpu[i,j,:]).any()):
                 f.write(
                     "%-6s%5s %4s %3s %s%4d    %8.3f%8.3f%8.3f%6.2f%6.2f\n"
diff --git a/scripts/run_inference.py b/scripts/run_inference.py