Skip to content

Commit 9b3f34b

Browse files
nv-alicheng authored and claude committed
test(templates): unblock TestTemplateIntegration without HF_TOKEN
The 6 generated-template integration tests were skipped unconditionally in CI/dev because the template placeholders default to gated meta-llama/Llama-3.1-* repos that require HF_TOKEN to fetch the tokenizer. Substitute TinyLlama/TinyLlama-1.1B-Chat-v1.0 for the model name in _resolve_template after placeholder expansion. TinyLlama is non-gated (~1MB tokenizer download), shares the Llama-family tokenizer the templates were written against, and the echo-server path doesn't care about model identity — only that AutoTokenizer.from_pretrained succeeds for the metrics aggregator's ISL/OSL/TPOT triggers. Drops the @pytest.mark.skipif(not HF_TOKEN) decorator, removes the now- unused os import. Effect: integration suite goes from 20 passed / 8 skipped to 26 passed / 2 skipped. The remaining 2 skips need real LLM servers (vLLM/SGLang) which aren't in scope. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 78d2573 commit 9b3f34b

1 file changed

Lines changed: 20 additions & 7 deletions

File tree

tests/integration/commands/test_benchmark_command.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
"""Integration tests for benchmark commands against echo server."""
1717

1818
import json
19-
import os
2019
import re
2120
from pathlib import Path
2221

@@ -184,11 +183,22 @@ def test_mode_logging(self, mock_http_echo_server, ds_dataset_path, caplog):
184183
)
185184

186185

186+
# Non-gated tokenizer model used in place of the templates' default
187+
# (which references gated meta-llama/Llama-3.1-*). The echo-server e2e
188+
# path doesn't care about the model identity, only that the tokenizer
189+
# exists for the metrics aggregator's ISL/OSL/TPOT triggers. TinyLlama's
190+
# tokenizer is ~1MB and matches the Llama-family tokenizer the templates
191+
# were written against.
192+
_TEST_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
193+
194+
187195
def _resolve_template(template_path: Path, server_url: str) -> dict:
188196
"""Load a template YAML, strip <PLACEHOLDER> wrappers, and patch for testing.
189197
190-
Only replaces placeholders with working values and caps n_samples_to_issue.
191-
Everything else stays as the template defines it.
198+
Replaces placeholders with working values, swaps the gated default
199+
model for a non-gated tokenizer (so tests run without ``HF_TOKEN``),
200+
and caps ``n_samples_to_issue``. Everything else stays as the template
201+
defines it.
192202
"""
193203
raw = template_path.read_text()
194204
# Strip <PLACEHOLDER eg: value> → value (all templates use eg: form)
@@ -197,6 +207,13 @@ def _resolve_template(template_path: Path, server_url: str) -> dict:
197207
raw = re.sub(r"http://localhost:\d+", server_url, raw)
198208
data = yaml.safe_load(raw)
199209

210+
# Swap any gated default model name for a non-gated tokenizer. The
211+
# generated templates' "eg: meta-llama/Llama-3.1-8B-Instruct" placeholder
212+
# points at a gated repo; substituting TinyLlama lets these tests run in CI
213+
# without HF_TOKEN.
214+
if "model_params" in data and isinstance(data["model_params"], dict):
215+
data["model_params"]["name"] = _TEST_MODEL_NAME
216+
200217
# Cap total samples so test finishes in seconds
201218
data.setdefault("settings", {})
202219
data["settings"].setdefault("runtime", {})
@@ -213,10 +230,6 @@ class TestTemplateIntegration:
213230
"""Verify generated templates run end-to-end against a local server."""
214231

215232
@pytest.mark.integration
216-
@pytest.mark.skipif(
217-
not os.environ.get("HF_TOKEN"),
218-
reason="Templates reference gated HF models; requires HF_TOKEN to fetch tokenizer",
219-
)
220233
@pytest.mark.parametrize("template", _GENERATED_TEMPLATES)
221234
def test_template_runs(self, mock_http_echo_server, tmp_path, caplog, template):
222235
data = _resolve_template(TEMPLATE_DIR / template, mock_http_echo_server.url)

0 commit comments

Comments
 (0)