Open-Athena
diff --git a/‎README.md‎
Lines changed: 5 additions & 5 deletions b/‎README.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎biofoundation/data.py‎
Lines changed: 27 additions & 11 deletions b/‎biofoundation/data.py‎
Lines changed: 27 additions & 11 deletions
diff --git a/‎biofoundation/inference.py‎
Lines changed: 98 additions & 3 deletions b/‎biofoundation/inference.py‎
Lines changed: 98 additions & 3 deletions
@@ -13,6 +13,9 @@ uv venv
 source .venv/bin/activate
 ```
 
+Pre-requisites:
+- PyTorch (e.g. `uv pip install torch`)
+
 Install the package:
 
 ```bash
@@ -32,7 +35,7 @@ Run the example script:
 
 ```bash
 source .venv/bin/activate
-python examples/plantcad_evolutionary_constraint.py
+python examples/marin_evolutionary_constraint.py
 ```
 
 ## Development Setup
@@ -42,9 +45,6 @@ To set up the development environment with linting, formatting, type checking, a
 ```bash
 # Install development dependencies
 uv pip install --group dev
-
-# Install both main package and dev tools
-uv pip install -e . --group dev
 ```
 
 ## Development Tools
@@ -53,7 +53,7 @@ uv pip install -e . --group dev
 
 ```bash
 # Run all pre-commit hooks (linting, formatting, type checking)
-pre-commit run --all-files
+pre-commit run
 ```
 
 ### Running Tests
 
@@ -2,11 +2,12 @@
 from typing import Any
 
 
+NUCLEOTIDES = list("ACGT")
+
+
 def transform_reflogprob_mlm(
     example: dict[str, Any],
     tokenizer: PreTrainedTokenizerBase,
-    pos: int,
-    seq_col: str = "seq",
 ) -> dict[str, Any]:
     """Transform a sequence example for reference log probability MLM inference.
 
@@ -19,24 +20,39 @@ def transform_reflogprob_mlm(
         example: Dictionary containing the sequence data. Must have a key matching
             `seq_col` that contains the input sequence.
         tokenizer: HuggingFace tokenizer for converting text to token IDs.
-        pos: Position in the sequence to mask (0-indexed).
-        seq_col: Key in the example dictionary that contains the sequence.
-            Defaults to "seq".
 
     Returns:
         Dictionary with three keys:
-        - input_ids_BL: Token IDs with the specified position masked
-        - pos_B: The masked position
-        - ref_B: The reference token ID that was at the masked position
+        - input_ids: Token IDs with the specified position masked
+        - pos: The masked position
+        - ref: The reference token ID that was at the masked position
 
     Example:
         >>> example = {"seq": "ATCG"}
         >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
         >>> result = transform_reflogprob_mlm(example, tokenizer, 1)
         >>> print(result)
-        {'input_ids_BL': tensor([...]), 'pos_B': 1, 'ref_B': 3}
+        {'input_ids': tensor([...]), 'pos': 1, 'ref': 3}
     """
-    input_ids = tokenizer(example[seq_col], return_tensors="pt")["input_ids"][0]
+    pos = example["pos"]
+    assert example["seq"][pos] in NUCLEOTIDES
+    input_ids = tokenizer(example["seq"], return_tensors="pt")["input_ids"][0]
     ref = input_ids[pos].item()
     input_ids[pos] = tokenizer.mask_token_id
-    return dict(input_ids_BL=input_ids, pos_B=pos, ref_B=ref)
+    return dict(input_ids=input_ids, pos=pos, ref=ref)
+
+
+def transform_reflogprob_clm(
+    example: dict[str, Any],
+    tokenizer: PreTrainedTokenizerBase,
+) -> dict[str, Any]:
+    """Transform a sequence example for reference log probability CLM inference.
+
+    Tokenizes the sequence and stacks one copy of the token IDs per entry of
+    ``NUCLEOTIDES``; in copy ``i`` the token at ``pos`` is replaced by the
+    first token ID that ``tokenizer.encode`` returns for ``NUCLEOTIDES[i]``.
+
+    Args:
+        example: Dictionary containing the sequence data. Must have a "seq"
+            key (the input sequence) and a "pos" key (0-indexed position in
+            the sequence; the character there must be one of ``NUCLEOTIDES``).
+        tokenizer: HuggingFace tokenizer for converting text to token IDs.
+
+    Returns:
+        Dictionary with two keys:
+        - input_ids: 2-D tensor with ``len(NUCLEOTIDES)`` rows, one per
+          candidate nucleotide substituted at ``pos``
+        - ref: Index into ``NUCLEOTIDES`` of the nucleotide found at ``pos``
+          in the original sequence (a row index, not a token ID)
+
+    Raises:
+        AssertionError: If the character at ``pos`` is not in ``NUCLEOTIDES``.
+    """
+    pos = example["pos"]
+    ref_nuc = example["seq"][pos]
+    assert ref_nuc in NUCLEOTIDES
+    input_ids = tokenizer(example["seq"], return_tensors="pt")["input_ids"][0]
+    # NOTE(review): `pos` indexes the string above and the token IDs below, so
+    # this presumably relies on one token per character with no leading
+    # special tokens -- confirm against the tokenizers used.
+    # Create one copy of the input sequence per candidate nucleotide
+    new_input_ids = input_ids.unsqueeze(0).repeat(len(NUCLEOTIDES), 1)
+    for i, nuc in enumerate(NUCLEOTIDES):
+        new_input_ids[i, pos] = tokenizer.encode(nuc)[0]
+    ref = NUCLEOTIDES.index(ref_nuc)
+    return dict(input_ids=new_input_ids, ref=ref)
@@ -1,11 +1,58 @@
 import datasets
 import tempfile
 import torch.nn as nn
-from transformers import Trainer, TrainingArguments
-from typing import Any
+from transformers import Trainer, TrainingArguments, PreTrainedTokenizerBase
+from typing import Any, Callable
+from functools import partial
+
+from .data import (
+    transform_reflogprob_mlm,
+    transform_reflogprob_clm,
+)
+from .model import (
+    compute_reflogprob_mlm,
+    compute_reflogprob_clm,
+)
 
 
 def run_inference(
+    model: nn.Module,
+    tokenizer: PreTrainedTokenizerBase,  # TODO: create an adapter for this
+    dataset: datasets.Dataset,
+    compute_fn: Callable[..., Any],
+    data_transform_fn: Callable[..., dict[str, Any]] | None = None,
+    data_transform_on_the_fly: bool = False,
+    data_transform_kwargs: dict[str, Any] | None = None,
+    inference_kwargs: dict[str, Any] | None = None,
+) -> Any:
+    """Prepare a dataset and run ``compute_fn`` over it with ``model``.
+
+    Args:
+        model: Model handed to ``compute_fn`` as its first argument.
+        tokenizer: Tokenizer passed to ``data_transform_fn``.
+        dataset: Dataset to run inference on.
+        compute_fn: Callable invoked as ``compute_fn(model, *args, **kwargs)``
+            in place of the model's own forward pass.
+        data_transform_fn: Optional per-example transform; when ``None`` the
+            dataset is used unchanged.
+        data_transform_on_the_fly: Forwarded to ``_process_dataset`` to choose
+            between a lazy transform and an eager ``map``.
+        data_transform_kwargs: Extra keyword arguments forwarded to
+            ``_process_dataset``.
+        inference_kwargs: Extra keyword arguments forwarded to
+            ``_run_inference``; ``None`` means no extra arguments.
+
+    Returns:
+        Whatever ``_run_inference`` returns for the wrapped model and the
+        prepared dataset.
+    """
+    prepared_dataset = _process_dataset(
+        dataset=dataset,
+        tokenizer=tokenizer,
+        data_transform_fn=data_transform_fn,
+        data_transform_on_the_fly=data_transform_on_the_fly,
+        data_transform_kwargs=data_transform_kwargs,
+    )
+    wrapped_model = _ModelComputeFnWrapper(model, compute_fn)
+    extra_args = inference_kwargs if inference_kwargs else {}
+    return _run_inference(wrapped_model, prepared_dataset, **extra_args)
+
+
+# Ready-made entry point: `run_inference` with the MLM reference
+# log-probability compute function and data transform pre-bound. Callers
+# supply the remaining arguments (model, tokenizer, dataset, ...).
+run_reflogprob_mlm = partial(
+    run_inference,
+    compute_fn=compute_reflogprob_mlm,
+    data_transform_fn=transform_reflogprob_mlm,
+)
+
+# Same as above, with the CLM compute function and data transform pre-bound.
+run_reflogprob_clm = partial(
+    run_inference,
+    compute_fn=compute_reflogprob_clm,
+    data_transform_fn=transform_reflogprob_clm,
+)
+
+
+def _run_inference(
     model: nn.Module,
     dataset: datasets.Dataset,
     **kwargs: Any,
@@ -29,7 +76,55 @@ def run_inference(
     """
     training_args = TrainingArguments(
         output_dir=tempfile.TemporaryDirectory().name,
-        **kwargs,
+        **(kwargs or {}),
     )
     trainer = Trainer(model=model, args=training_args)
     return trainer.predict(test_dataset=dataset).predictions
+
+
+class _ModelComputeFnWrapper(nn.Module):
+    """Module whose forward pass delegates to ``compute_fn(model, ...)``.
+
+    Pairs a model with a compute function so the two can be passed around as
+    a single ``nn.Module``.
+    """
+
+    def __init__(self, model: nn.Module, compute_fn: Callable[..., Any]):
+        """Store the wrapped model and the function that will be called with it."""
+        super().__init__()
+        self.model = model
+        self.compute_fn = compute_fn
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        """Call ``compute_fn`` with the wrapped model followed by the given arguments."""
+        compute = self.compute_fn
+        wrapped = self.model
+        return compute(wrapped, *args, **kwargs)
+
+
+def _process_dataset(
+    dataset: datasets.Dataset,
+    tokenizer: PreTrainedTokenizerBase,
+    data_transform_fn: Callable[..., dict[str, Any]] | None = None,
+    data_transform_on_the_fly: bool = False,
+    data_transform_kwargs: dict[str, Any] | None = None,
+) -> datasets.Dataset:
+    """Apply an optional per-example transform to a dataset.
+
+    Args:
+        dataset: Dataset to transform.
+        tokenizer: Bound to ``data_transform_fn`` as its ``tokenizer`` keyword.
+        data_transform_fn: Per-example transform. When ``None`` the dataset is
+            returned unchanged.
+        data_transform_on_the_fly: If true, register the transform with
+            ``dataset.with_transform`` (wrapped to operate on batches);
+            otherwise apply it with ``dataset.map``.
+        data_transform_kwargs: Extra keyword arguments forwarded to
+            ``with_transform`` or ``map``. ``None`` means no extra arguments.
+
+    Returns:
+        The original dataset when no transform is given, otherwise the result
+        of ``with_transform`` or ``map``.
+    """
+    if data_transform_fn is None:
+        return dataset
+    # The default is None, and unpacking `**None` raises TypeError, so fall
+    # back to an empty mapping.
+    extra_kwargs = data_transform_kwargs or {}
+    bound_transform_fn = partial(data_transform_fn, tokenizer=tokenizer)
+    if data_transform_on_the_fly:
+        return dataset.with_transform(
+            _make_batch_transform(bound_transform_fn),
+            **extra_kwargs,
+        )
+    return dataset.map(
+        bound_transform_fn,
+        **extra_kwargs,
+    )
+
+
+def _make_batch_transform(
+    transform_fn: Callable[[dict[str, Any]], dict[str, Any]],
+) -> Callable[[dict[str, list[Any]]], dict[str, list[Any]]]:
+    """Lift a per-example transform into one that operates on column batches.
+
+    Args:
+        transform_fn: Function mapping a single example dictionary to a
+            transformed example dictionary.
+
+    Returns:
+        A function that takes a batch (column name -> list of values), applies
+        ``transform_fn`` to every row, and returns the results in the same
+        column-oriented layout. The output columns are the keys of the first
+        transformed example.
+    """
+
+    def batch_transform_fn(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
+        # Columns -> rows: one tuple of values per example
+        column_names = list(batch.keys())
+        rows = list(zip(*batch.values()))
+        # Rebuild each example as a dict and transform it
+        outputs = [transform_fn(dict(zip(column_names, row))) for row in rows]
+        # Rows -> columns, keyed by the first output's keys
+        return {name: [out[name] for out in outputs] for name in outputs[0]}
+
+    return batch_transform_fn