OpenProteinAI
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎openprotein/embeddings/poet.py‎
Lines changed: 29 additions & 11 deletions b/‎openprotein/embeddings/poet.py‎
Lines changed: 29 additions & 11 deletions
diff --git a/‎openprotein/embeddings/poet2.py‎
Lines changed: 12 additions & 0 deletions b/‎openprotein/embeddings/poet2.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎openprotein/fold/boltz.py‎
Lines changed: 112 additions & 28 deletions b/‎openprotein/fold/boltz.py‎
Lines changed: 112 additions & 28 deletions
@@ -367,3 +367,6 @@ __marimo__/
 
 .envrc
 /.direnv/
+
+scratch.org
+gptel
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 
 from openprotein.base import APISession
-from openprotein.common import ModelMetadata, ReductionType
+from openprotein.common import ModelMetadata, Reduction, ReductionType
 from openprotein.data import AssayDataset, AssayMetadata
 from openprotein.prompt import Prompt
 
@@ -51,9 +51,9 @@ def __init__(
 
     def embed(
         self,
-        sequences: list[bytes],
-        prompt: str | Prompt | None = None,
+        sequences: list[bytes] | list[str],
         reduction: ReductionType | None = ReductionType.MEAN,
+        prompt: str | Prompt | None = None,
         **kwargs,
     ) -> EmbeddingsResultFuture:
         """
@@ -74,6 +74,12 @@ def embed(
         -------
         EmbeddingsResultFuture
             Future object that returns the embeddings of the submitted sequences.
+
+            Note: The embeddings for PoET can have an extra first dimension if using ensemble
+            prompts, where the first dimension is the number of replicates in the ensemble
+            prompt, i.e. the shape is ``(N, L, D)`` if ``N`` > 1 else ``(L, D)``, where ``N`` is
+            the number of replicates in the prompt, ``L`` is the length of the sequence, and
+            ``D`` is the embedding dimension.
         """
         if prompt is None:
             prompt_id = None
@@ -88,7 +94,7 @@ def embed(
 
     def logits(
         self,
-        sequences: list[bytes],
+        sequences: list[bytes] | list[str],
         prompt: str | Prompt | None = None,
         **kwargs,
     ) -> EmbeddingsResultFuture:
@@ -108,6 +114,12 @@ def logits(
         -------
         EmbeddingsResultFuture
             Future object that returns the logits of the submitted sequences.
+
+            Note: The logits for PoET can have an extra first dimension if using ensemble
+            prompts, where the first dimension is the number of replicates in the ensemble
+            prompt, i.e. the shape is ``(N, L, D)`` if ``N`` > 1 else ``(L, D)``, where ``N`` is
+            the number of replicates in the prompt, ``L`` is the length of the sequence, and
+            ``D`` is the size of the vocabulary.
         """
         if prompt is None:
             prompt_id = None
@@ -317,11 +329,11 @@ def generate(
 
     def fit_svd(
         self,
-        prompt: str | Prompt | None = None,
         sequences: list[bytes] | list[str] | None = None,
-        assay: AssayDataset | None = None,
+        assay: AssayDataset | AssayMetadata | None = None,
         n_components: int = 1024,
-        reduction: ReductionType | None = None,
+        reduction: Reduction | ReductionType | None = None,
+        prompt: str | Prompt | None = None,
         **kwargs,
     ) -> "SVDModel":
         """
@@ -365,11 +377,11 @@ def fit_svd(
 
     def fit_umap(
         self,
-        prompt: str | Prompt | None = None,
         sequences: list[bytes] | list[str] | None = None,
-        assay: AssayDataset | None = None,
+        assay: AssayDataset | AssayMetadata | None = None,
         n_components: int = 2,
-        reduction: ReductionType = ReductionType.MEAN,
+        reduction: Reduction | ReductionType = ReductionType.MEAN,
+        prompt: str | Prompt | None = None,
         **kwargs,
     ) -> "UMAPModel":
         """
@@ -413,8 +425,11 @@ def fit_umap(
 
     def fit_gp(
         self,
-        assay: AssayMetadata | AssayDataset | str,
+        assay: AssayDataset | AssayMetadata | str,
         properties: list[str],
+        reduction: ReductionType,
+        name: str | None = None,
+        description: str | None = None,
         prompt: str | Prompt | None = None,
         **kwargs,
     ) -> "PredictorModel":
@@ -444,6 +459,9 @@ def fit_gp(
         return super().fit_gp(
             assay=assay,
             properties=properties,
+            reduction=reduction,
+            name=name,
+            description=description,
             prompt_id=prompt_id,
             **kwargs,
         )
@@ -86,6 +86,12 @@ def embed(
         -------
         EmbeddingsResultFuture
             A future object that returns the embeddings of the submitted sequences.
+
+            Note: The embeddings for PoET can have an extra first dimension if using ensemble
+            prompts, where the first dimension is the number of replicates in the ensemble
+            prompt, i.e. the shape is ``(N, L, D)`` if ``N`` > 1 else ``(L, D)``, where ``N`` is
+            the number of replicates in the prompt, ``L`` is the length of the sequence, and
+            ``D`` is the embedding dimension.
         """
         prompt_api = getattr(self.session, "prompt", None)
         assert isinstance(prompt_api, PromptAPI)
@@ -127,6 +133,12 @@ def logits(
         -------
         EmbeddingsResultFuture
             A future object that returns the logits of the submitted sequences.
+
+            Note: The logits for PoET can have an extra first dimension if using ensemble
+            prompts, where the first dimension is the number of replicates in the ensemble
+            prompt, i.e. the shape is ``(N, L, D)`` if ``N`` > 1 else ``(L, D)``, where ``N`` is
+            the number of replicates in the prompt, ``L`` is the length of the sequence, and
+            ``D`` is the size of the vocabulary.
         """
         prompt_api = getattr(self.session, "prompt", None)
         assert isinstance(prompt_api, PromptAPI)
 
@@ -1,7 +1,7 @@
 """Community-based Boltz models for complex structure prediction with ligands/dna/rna."""
 
 import warnings
-from typing import Sequence
+from typing import Mapping, Sequence, cast
 
 from pydantic import BaseModel, Field, TypeAdapter, model_validator
 
@@ -10,6 +10,8 @@
 from openprotein.common import ModelMetadata
 from openprotein.fold.common import normalize_inputs, serialize_input
 from openprotein.molecules import Complex, Ligand, Protein
+from openprotein.molecules.template import Template
+from openprotein.prompt import PromptAPI
 
 from . import api
 from .complex import id_generator
@@ -40,7 +42,7 @@ def fold(
         num_steps: int = 200,
         step_scale: float = 1.638,
         use_potentials: bool = False,
-        constraints: list[dict] | None = None,
+        constraints: Sequence[Mapping] | None = None,
         **kwargs,
     ) -> FoldResultFuture:
         """
@@ -83,19 +85,9 @@ def fold(
 
         # build the normalized_models from msa
         if isinstance(sequences, MSAFuture):
-            id_gen = id_generator()
-            align_api = getattr(self.session, "align", None)
-            assert isinstance(align_api, AlignAPI)
-            msa = sequences  # rename
-            seed = align_api.get_seed(job_id=msa.job.job_id)
-            _proteins: dict[str, Protein] = {}
-            for seq in seed.split(":"):
-                protein = Protein(sequence=seq)
-                id = next(id_gen)
-                protein.msa = msa.id
-                _proteins[id] = protein
-            normalized_complexes = [Complex(chains=_proteins)]
-
+            normalized_complexes = [
+                _msa_future_to_complex(session=self.session, msa=sequences)
+            ]
         else:
             normalized_complexes = normalize_inputs(sequences)
 
@@ -139,9 +131,9 @@ def fold(
         num_steps: int = 200,
         step_scale: float = 1.638,
         use_potentials: bool = False,
-        constraints: list[dict] | None = None,
-        templates: list[dict] | None = None,
-        properties: list[dict] | None = None,
+        constraints: Sequence[Mapping] | None = None,
+        templates: Sequence[Protein | Complex | Template] | None = None,
+        properties: Sequence[Mapping] | None = None,
         method: str | None = None,
     ) -> FoldResultFuture:
         """
@@ -163,7 +155,7 @@ def fold(
             Whether or not to use potentials.
         constraints : list[dict] | None = None
             List of constraints.
-        templates: list[dict] | None = None
+        templates: list[Protein | Complex | Template] | None = None
             List of templates to use for structure prediction.
         properties: list[dict] | None = None
             List of additional properties to predict. Should match the `BoltzProperties`
@@ -180,24 +172,98 @@ def fold(
         Returns
         -------
         FoldResultFuture
-            Future for the folding result.
+             Future for the folding result.
         """
-
+        prompt_api = getattr(self.session, "prompt", None)
+        assert isinstance(prompt_api, PromptAPI)
+
+        # validate templates
+        # mapping chain_id (to predict) to template
+        # needs to be consistent
+        templates_: list[Template] = []
+        if not isinstance(sequences, MSAFuture):
+            first_chain_id_to_template = {}
+            for batch_idx, seq in enumerate(sequences):
+                # validate templates and normalize to complex
+                if isinstance(seq, str) or isinstance(seq, bytes):
+                    seq = Protein(seq)
+                seq._assert_valid_templates()
+                if isinstance(seq, Protein):
+                    complex = Complex({"A": seq})
+                else:
+                    complex = seq
+                # resolve chain-level templates
+                for chain_id, protein in complex.get_proteins().items():
+                    # Verify same chain_id should have same templates
+                    if batch_idx == 0:
+                        first_chain_id_to_template[chain_id] = protein.templates
+                        for template in protein.templates:
+                            templates_.append(_to_template(template, chain_id=chain_id))
+                    elif first_chain_id_to_template[chain_id] != protein.templates:
+                        raise ValueError(
+                            "Expected same chain across batches to have the same templates"
+                        )
+                # resolve complex-level templates
+                if batch_idx == 0:
+                    first_templates = complex.templates
+                    for template in complex.templates:
+                        templates_.append(_to_template(template))
+                elif first_templates != complex.templates:
+                    raise ValueError(
+                        "Expected templates across complexes in batch to be the same"
+                    )
+        # method level argument
         if templates is not None:
-            raise ValueError("`templates` not yet supported!")
+            if isinstance(sequences, MSAFuture):
+                # need to convert to complex for template validation
+                sequences = [
+                    _msa_future_to_complex(session=self.session, msa=sequences)
+                ]
+            for template in templates:
+                template = _to_template(template)
+                # validate the template for all sequences before accepting it
+                for seq in sequences:
+                    if isinstance(seq, str) or isinstance(seq, bytes):
+                        seq = Protein(seq)
+                    template.validate_for_target(seq)
+                templates_.append(template)
+
+        # resolve list of Templates into expected dict arg
+        template_dicts: list[dict] = []
+        # track resolved queries to reduce network calls - use id() for identity-based caching
+        struct_id_to_query_id = {}
+
+        for template in templates_:
+            # Use id() for caching - only resolve each unique structure once
+            struct_id = id(template.template)
+            if struct_id not in struct_id_to_query_id:
+                struct_id_to_query_id[struct_id] = prompt_api._resolve_query(
+                    query=template.template
+                )
+
+            template_dict = {"query_id": struct_id_to_query_id[struct_id]}
+
+            if template.mapping is not None:
+                if isinstance(template.mapping, str):
+                    template_dict["chain_id"] = template.mapping
+                else:
+                    template_dict["chain_id"] = list(template.mapping.values())
+                    template_dict["template_id"] = list(template.mapping.keys())
+
+            template_dicts.append(template_dict)
 
         # validate properties
         if properties is not None:
             props = TypeAdapter(list[BoltzProperty]).validate_python(properties)
             # Only allow affinity for ligands, and check binder refers to a ligand chain_id (str, not list)
             ligand_chain_ids = set()
-            if isinstance(sequences, list):
+            if not isinstance(sequences, MSAFuture):
                 for protein in sequences:
                     if isinstance(protein, Complex):
                         complex = protein
-                        for id, chain in complex.get_chains().items():
+                        for chain_id, chain in complex.get_chains().items():
                             if isinstance(chain, Ligand):
-                                ligand_chain_ids.add(id)
+                                ligand_chain_ids.add(chain_id)
             for prop in props:
                 if hasattr(prop, "affinity") and prop.affinity is not None:
                     binder_id = prop.affinity.binder
@@ -214,7 +280,7 @@ def fold(
             step_scale=step_scale,
             use_potentials=use_potentials,
             constraints=constraints,
-            templates=templates,
+            templates=template_dicts or None,
             properties=properties,
             method=method,
         )
@@ -235,7 +301,7 @@ def fold(
         num_steps: int = 200,
         step_scale: float = 1.638,
         use_potentials: bool = False,
-        constraints: list[dict] | None = None,
+        constraints: Sequence[Mapping] | None = None,
     ) -> FoldResultFuture:
         """
         Request structure prediction with Boltz-1 model.
@@ -305,7 +371,7 @@ def fold(
         num_recycles: int = 3,
         num_steps: int = 200,
         step_scale: float = 1.638,
-        constraints: list[dict] | None = None,
+        constraints: Sequence[Mapping] | None = None,
     ) -> FoldResultFuture:
         """
         Request structure prediction with Boltz-1x model. Uses potentials with Boltz-1 model.
@@ -516,3 +582,21 @@ class BoltzAffinity(BaseModel):
 
     class Config:
         extra = "allow"  # Allow extra fields
+
+
+def _msa_future_to_complex(session: APISession, msa: MSAFuture) -> Complex:
+    """
+    Build a ``Complex`` from the seed of an MSA job.
+
+    The seed is fetched through the session's align API and split on ``":"``;
+    each piece becomes one ``Protein`` chain whose ``msa`` is set to ``msa.id``.
+
+    Parameters
+    ----------
+    session : APISession
+        Session whose ``align`` attribute must be an ``AlignAPI``.
+    msa : MSAFuture
+        MSA future providing the job id used to look up the seed.
+
+    Returns
+    -------
+    Complex
+        Complex with one protein chain per seed segment, keyed by the ids
+        yielded by ``id_generator()``.
+    """
+    align_api = getattr(session, "align", None)
+    assert isinstance(align_api, AlignAPI)
+    seed = align_api.get_seed(job_id=msa.job.job_id)
+    proteins: dict[str, Protein] = {}
+    # pair each ":"-separated seed segment with the next generated chain id
+    for chain_id, seq in zip(id_generator(), seed.split(":")):
+        protein = Protein(sequence=seq)
+        protein.msa = msa.id
+        proteins[chain_id] = protein
+    return Complex(chains=proteins)
+
+
+def _to_template(obj, chain_id: str | None = None):
+    """
+    Return ``obj`` as a ``Template``.
+
+    Parameters
+    ----------
+    obj
+        A ``Template``, which is returned unchanged, or any other object,
+        which is wrapped as ``Template(template=obj, mapping=chain_id)``.
+    chain_id : str | None
+        Mapping passed to a newly created ``Template``. Ignored when ``obj``
+        is already a ``Template``.
+
+    Returns
+    -------
+    Template
+        ``obj`` itself, or a new ``Template`` wrapping it.
+    """
+    if not isinstance(obj, Template):
+        obj = Template(template=obj, mapping=chain_id)
+    return obj