-import numpy as np
+import warnings
 from tqdm import tqdm
 from typing import Optional, Union, Dict, Any, Tuple, List, Literal
-
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import torch
+import numpy as np
 
 from ...base import BaseMolecularEncoder
+
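+# Repository IDs this encoder has been tested with (see the class docstring for details);
+# other repo_ids are still attempted, but __post_init__ emits a warning for them.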
+known_repos = [
+    "entropy/gpt2_zinc_87m",
+    "entropy/roberta_zinc_480m",
+    "ncfrey/ChemGPT-1.2B",
+    "ncfrey/ChemGPT-19M",
+    "ncfrey/ChemGPT-4.7M",
+    "DeepChem/ChemBERTa-77M-MTR",
+    "DeepChem/ChemBERTa-77M-MLM",
+    "DeepChem/ChemBERTa-10M-MTR",
+    "DeepChem/ChemBERTa-10M-MLM",
+    "DeepChem/ChemBERTa-5M-MLM",
+    "DeepChem/ChemBERTa-5M-MTR",
+    "unikei/bert-base-smiles",
+    "seyonec/ChemBERTa-zinc-base-v1",
+]
+
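+# Models whose tokenizers do not add BOS/EOS tokens on their own (decoder-style
+# tokenizers such as GPT-2); for these, encode() wraps each input string with
+# bos_token/eos_token manually (see fit() and encode() below).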
+known_add_bos_eos_list = ["entropy/gpt2_zinc_87m"]
+
+@dataclass(init=False)
+class HFPretrainedMolecularEncoder(BaseMolecularEncoder):
31+ """Implements Hugging Face pretrained transformer models as molecular encoders.
32+
33+ This class provides an interface to use pretrained transformer models from Hugging Face
34+ as molecular encoders. It handles tokenization, encoding, and pooling of molecular representations.
35+
36+ Tested models include:
37+
38+ - ChemGPT series (1.2B/19M/4.7M): GPT-Neo based models pretrained on PubChem10M dataset with SELFIES strings.
39+ Output dimension: 2048.
40+ repo_id: "ncfrey/ChemGPT-1.2B" (https://huggingface.co/ncfrey/ChemGPT-1.2B)
41+ repo_id: "ncfrey/ChemGPT-19M" (https://huggingface.co/ncfrey/ChemGPT-19M)
42+ repo_id: "ncfrey/ChemGPT-4.7M" (https://huggingface.co/ncfrey/ChemGPT-4.7M)
43+
44+ - GPT2-ZINC-87M: GPT-2 based model (87M parameters) pretrained on ZINC dataset with ~480M SMILES strings.
45+ Output dimension: 768.
46+ repo_id: "entropy/gpt2_zinc_87m" (https://huggingface.co/entropy/gpt2_zinc_87m)
47+
48+ - RoBERTa-ZINC-480M: RoBERTa based model (102M parameters) pretrained on ZINC dataset with ~480M SMILES strings.
49+ Output dimension: 768.
50+ repo_id: "entropy/roberta_zinc_480m" (https://huggingface.co/entropy/roberta_zinc_480m)
51+
52+ - ChemBERTa series: Available in multiple sizes (77M/10M/5M) and training objectives (MTR/MLM).
53+ Output dimension: 384.
54+ repo_id: "DeepChem/ChemBERTa-77M-MTR" (https://huggingface.co/DeepChem/ChemBERTa-77M-MTR)
55+ repo_id: "DeepChem/ChemBERTa-77M-MLM" (https://huggingface.co/DeepChem/ChemBERTa-77M-MLM)
56+ repo_id: "DeepChem/ChemBERTa-10M-MTR" (https://huggingface.co/DeepChem/ChemBERTa-10M-MTR)
57+ repo_id: "DeepChem/ChemBERTa-10M-MLM" (https://huggingface.co/DeepChem/ChemBERTa-10M-MLM)
58+ repo_id: "DeepChem/ChemBERTa-5M-MLM" (https://huggingface.co/DeepChem/ChemBERTa-5M-MLM)
59+ repo_id: "DeepChem/ChemBERTa-5M-MTR" (https://huggingface.co/DeepChem/ChemBERTa-5M-MTR)
60+
61+ - UniKi/bert-base-smiles: UniKi's BERT model pretrained on SMILES strings.
62+ Output dimension: 768.
63+ repo_id: "unikei/bert-base-smiles" (https://huggingface.co/unikei/bert-base-smiles)
64+
65+ - ChemBERTa-zinc-base-v1: RoBERTa model pretrained on ZINC dataset with ~100k SMILES strings.
66+ Output dimension: 384.
67+ repo_id: "seyonec/ChemBERTa-zinc-base-v1" (https://huggingface.co/seyonec/ChemBERTa-zinc-base-v1)
68+
69+ Parameters
70+ ----------
71+ repo_id : str
72+ The Hugging Face repository ID of the pretrained model.
73+ max_length : int, default=128
74+ Maximum sequence length for tokenization. Longer sequences will be truncated.
75+ batch_size : int, default=128
76+ Batch size used when encoding multiple molecules.
77+ add_bos_eos : Optional[bool], default=None
78+ Whether to add beginning/end of sequence tokens. If None, determined automatically based on model type.
79+ model_name : str, default="PretrainedMolecularEncoder"
80+ Name identifier for the model instance.
81+ verbose : bool, default=False
82+ Whether to display progress information during encoding.
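+
+    Examples
+    --------
+    A minimal usage sketch (illustrative; any of the tested repo_ids listed above can
+    be substituted):
+
+    >>> encoder = HFPretrainedMolecularEncoder(repo_id="DeepChem/ChemBERTa-77M-MTR").fit()
+    >>> embeddings = encoder.encode(["CCO", "c1ccccc1"], return_type="np")
+
+    ``embeddings`` holds one pooled vector per input molecule.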
83+ """
 
+    repo_id: str
 
-@dataclass
-class PretrainedMolecularEncoder(BaseMolecularEncoder):
-    """This encoder uses a pretrained transformer model from HuggingFace."""
-    # Task-related parameters
-    # repo_id: str = "huggingface/PretrainedMolecularEncoder"
+    # Default arguments
+    max_length: int = 128
+    batch_size: int = 128
+    add_bos_eos: Optional[bool] = None
     model_name: str = "PretrainedMolecularEncoder"
+    verbose: bool = False
+
+    def __init__(self, repo_id: str, max_length: int = 128, batch_size: int = 128, add_bos_eos: Optional[bool] = None,
+                 model_name: str = "PretrainedMolecularEncoder", verbose: bool = False, **kwargs):
+        self.repo_id = repo_id
+        self.max_length = max_length
+        self.batch_size = batch_size
+        self.add_bos_eos = add_bos_eos
+        self.model_name = model_name
+        self.verbose = verbose
+        super().__init__(**kwargs)
 
     def __post_init__(self):
-        """Initialize the model after dataclass initialization."""
         super().__post_init__()
         self._require_transformers()
-        self.is_fitted_ = True
         self.fitting_epoch = -1
         self.fitting_loss = -1
 
+        if self.repo_id not in known_repos:
+            warnings.warn(f"Unknown repo_id: {self.repo_id}. The class will try to load the model from HuggingFace, but it might fail.")
+
     @staticmethod
     def _get_param_names() -> List[str]:
         """Get parameter names for the estimator.
@@ -33,40 +119,69 @@ def _get_param_names() -> List[str]:
         List[str]
             List of parameter names that can be used for model configuration.
         """
-        return []
+        return ["repo_id", "max_length", "model_name", "add_bos_eos"]
 
-    def _get_model_params(self, checkpoint: Optional[Dict] = None) -> Dict[str, Any]:
-        params = ["model_name"]
-        if checkpoint is not None:
-            if "hyperparameters" not in checkpoint:
-                raise ValueError("Checkpoint missing 'hyperparameters' key")
-            return {k: checkpoint["hyperparameters"][k] for k in params}
-        return {k: getattr(self, k) for k in params}
+    def _get_model_params(self) -> Dict[str, Any]:
+        raise NotImplementedError("PretrainedMolecularEncoder does not support model parameters.")
+
+    def _setup_optimizers(self) -> None:
+        raise NotImplementedError("PretrainedMolecularEncoder does not support training.")
 
-    def _setup_optimizers(self) -> Tuple[torch.optim.Optimizer, Optional[Any]]:
+    def _train_epoch(self) -> None:
         raise NotImplementedError("PretrainedMolecularEncoder does not support training.")
 
-    def save_to_local(self, path: str) -> None:
+    def save_to_local(self) -> None:
         raise NotImplementedError("PretrainedMolecularEncoder does not support saving to local.")
 
-    def load_from_local(self, path: str) -> None:
+    def load_from_local(self) -> None:
         raise NotImplementedError("PretrainedMolecularEncoder does not support loading from local.")
 
     def save_to_hf(self) -> None:
         raise NotImplementedError("PretrainedMolecularEncoder does not support saving to huggingface.")
 
-    def load_from_hf(self, repo_id: str) -> None:
-        # TODO: Implement this
-        raise NotImplementedError("Implements this.")
+    def load_from_hf(self) -> None:
+        self.fit()
 
-    def load(self, repo_id: str) -> None:
-        self.load_from_hf(repo_id)
+    def load(self) -> None:
+        self.fit()
+
+    def fit(self) -> "HFPretrainedMolecularEncoder":
+        """Load the pretrained model and tokenizer from Hugging Face."""
+        assert self.repo_id is not None, "repo_id is not set"
+        self._require_transformers()
+        import transformers
+
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.repo_id, max_length=self.max_length)
+        self.model = transformers.AutoModel.from_pretrained(self.repo_id)
+        self.model.to(self.device)
+
+        model_config = self.model.config
+        model_type = model_config.model_type
 
-    def fit(self, repo_id: str) -> "PretrainedMolecularEncoder":
-        self.load_from_hf(repo_id)
+        if self.add_bos_eos is None:
+            self.add_bos_eos = self.repo_id in known_add_bos_eos_list
+
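+        # Some tokenizers (e.g. GPT-2 style) ship without a pad token; reuse the EOS
+        # token or register a new [PAD] token so that batched padding works.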
+        if self.tokenizer.pad_token is None:
+            if self.tokenizer.eos_token is not None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+                self.model.resize_token_embeddings(len(self.tokenizer))
+
+        if self.add_bos_eos:
+            if self.tokenizer.bos_token is None:
+                self.tokenizer.add_special_tokens({'bos_token': '[BOS]'})
+            if self.tokenizer.eos_token is None:
+                self.tokenizer.add_special_tokens({'eos_token': '[EOS]'})
+            self.model.resize_token_embeddings(len(self.tokenizer))
+
+            warnings.warn("BOS and EOS tokens were not found in the tokenizer; they have been added because add_bos_eos is set to True.")
+
+        self.collator = transformers.DataCollatorWithPadding(self.tokenizer, padding=True, return_tensors='pt')
+        self.is_fitted_ = True
         return self
 
-    def encode(self, X: List[str], return_type: Literal["np", "pt"] = "pt") -> Union[np.ndarray, torch.Tensor]:
+    def encode(self, X: List[str], return_type: Literal["np", "pt"] = "pt", add_bos_eos: Optional[bool] = None) -> Union[np.ndarray, torch.Tensor]:
         """Encode molecules into vector representations.
 
         Parameters
@@ -75,22 +190,62 @@ def encode(self, X: List[str], return_type: Literal["np", "pt"] = "pt") -> Union
             List of SMILES strings
         return_type : Literal["np", "pt"], default="pt"
             Return type of the representations
+        add_bos_eos : Optional[bool], default=None
+            Whether to add BOS and EOS tokens. If None, will be determined based on model type.
 
         Returns
         -------
         representations : ndarray or torch.Tensor
             Molecular representations
         """
+        self._require_transformers()
         self._check_is_fitted()
-        X, _ = self._validate_inputs(X, return_rdkit_mol=True)
-        raise NotImplementedError("Implements this.")
-
-        # Placeholder for transformer-based encodings
-        # Replace with actual encoding logic when integrating the transformer model
-        encodings = [X]  # dummy list to allow concat
-
-        encodings = torch.cat(encodings, dim=0)
-        return encodings if return_type == "pt" else encodings.numpy()
+        X, _ = self._validate_inputs(X, return_rdkit_mol=False)
+        if add_bos_eos is None:
+            # Fall back to the value resolved in fit() from the model type
+            add_bos_eos = self.add_bos_eos
+
+        # Process in batches
+        all_embeddings = []
+        iterator = tqdm(range(0, len(X), self.batch_size), desc="Encoding molecules",
+                        total=(len(X) + self.batch_size - 1) // self.batch_size, disable=not self.verbose)
+        for i in iterator:
+            batch_X = X[i:i + self.batch_size]
+
+            if add_bos_eos:
+                # For decoder-style models (e.g. GPT-2), manually add BOS and EOS tokens
+                processed_batch = [self.tokenizer.bos_token + x + self.tokenizer.eos_token for x in batch_X]
+                inputs = self.collator(self.tokenizer(processed_batch))
+            else:
+                inputs = self.collator(self.tokenizer(batch_X))
+
+            # Move inputs to the same device as the model
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            # Get model outputs
+            with torch.no_grad():
+                outputs = self.model(**inputs, output_hidden_states=True)
+
+            # Extract embeddings based on the model's output type
+            if hasattr(outputs, 'hidden_states'):
+                # For models that return a named tuple with hidden states
+                full_embeddings = outputs.hidden_states[-1]
+            elif isinstance(outputs, tuple) and len(outputs) > 1:
+                # For models that return a tuple with hidden states
+                full_embeddings = outputs[-1][-1]
+            else:
+                # For models that return last_hidden_state directly
+                full_embeddings = outputs.last_hidden_state
+
+            # Masked mean pooling: average token embeddings, ignoring padding positions
+            mask = inputs['attention_mask']
+            batch_embeddings = ((full_embeddings * mask.unsqueeze(-1)).sum(1) /
+                                mask.sum(-1).unsqueeze(-1))
+
+            all_embeddings.append(batch_embeddings)
+
+        # Concatenate all batch embeddings
+        embeddings = torch.cat(all_embeddings, dim=0)
+
+        return embeddings if return_type == "pt" else embeddings.cpu().numpy()
 
     @staticmethod
     def _require_transformers():