fix(vllm_inference): use local tokenizer for nvidia Hub reference model

svc-bionemo · claude · svc-bionemo · commit 02bc46d879dd · 2026-04-18T07:27:49.000-07:00
The nvidia/esm2_t6_8M_UR50D Hub tokenizer_config.json references
TokenizersBackend, which was removed in transformers 5.x, causing
AutoTokenizer.from_pretrained() to raise ValueError.

Load the reference model's tokenizer from the local esm_fast_tokenizer
directory (PreTrainedTokenizerFast) instead of from the Hub config.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
diff --git a/bionemo-recipes/recipes/vllm_inference/esm2/tests/test_vllm.py b/bionemo-recipes/recipes/vllm_inference/esm2/tests/test_vllm.py
@@ -61,12 +61,14 @@ def _last_token_l2(hidden_state: torch.Tensor) -> np.ndarray:
     return vec
 
 
-def _hf_embed(model_id: str, sequences: list[str], dtype=torch.float32) -> np.ndarray:
+def _hf_embed(model_id: str, sequences: list[str], dtype=torch.float32, tokenizer_id: str | None = None) -> np.ndarray:
     """Run HuggingFace inference and return last-token L2-normalised embeddings."""
     torch.manual_seed(42)
     torch.cuda.manual_seed_all(42)
     model = AutoModel.from_pretrained(model_id, trust_remote_code=True).to("cuda", dtype=dtype).eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_id if tokenizer_id is not None else model_id, trust_remote_code=True
+    )
 
     vecs = []
     with torch.no_grad():
@@ -133,7 +135,9 @@ def hf_exported_embeddings(exported_checkpoint):
 @pytest.fixture(scope="session")
 def hf_reference_embeddings():
     """Embeddings from HuggingFace on the nvidia Hub model (ground truth)."""
-    return _hf_embed(REFERENCE_MODEL_ID, SEQUENCES)
+    # The nvidia Hub tokenizer_config.json references TokenizersBackend which was removed in
+    # transformers 5.x. Use the local PreTrainedTokenizerFast implementation instead.
+    return _hf_embed(REFERENCE_MODEL_ID, SEQUENCES, tokenizer_id=str(ESM2_MODEL_DIR / "esm_fast_tokenizer"))
 
 
 # ---- Tests ----