Skip to content

Commit d4d5081

Browse files
authored
Fix smoketests (#1161)
* remove manage_test_secrets
* remove unauthorized meta llama model
* Test private downloads
1 parent a5a6f9a commit d4d5081

3 files changed

Lines changed: 5 additions & 59 deletions

File tree

plugins/validation_tests/test_object_creation.py

Lines changed: 3 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,22 +1,19 @@
11
import os
2-
import re
32

43
import pytest
54
from flaky import flaky # type: ignore
5+
66
from modelgauge.base_test import PromptResponseTest
77
from modelgauge.caching import SqlDictCache
88
from modelgauge.config import load_secrets_from_config
99
from modelgauge.dependency_helper import FromSourceDependencyHelper
10-
from modelgauge.external_data import WebData
1110
from modelgauge.load_plugins import load_plugins
1211
from modelgauge.locales import EN_US # see "workaround" below
1312
from modelgauge.prompt import TextPrompt
14-
from modelgauge.prompt_sets import demo_prompt_set_url
1513
from modelgauge.record_init import InitializationRecord
1614
from modelgauge.sut import PromptResponseSUT, SUTOptions, SUTResponse
1715
from modelgauge.sut_capabilities import AcceptsTextPrompt
1816
from modelgauge.sut_registry import SUTS
19-
2017
from modelgauge.suts.baseten_api import BasetenSUT
2118
from modelgauge.suts.huggingface_chat_completion import HuggingFaceChatCompletionDedicatedSUT
2219
from modelgauge.suts.together_client import TogetherDedicatedChatSUT
@@ -32,21 +29,6 @@
3229
TIMEOUT = 25 * 60 # 25 minutes. Baseten is the slowest to start up.
3330

3431

35-
def ensure_public_dependencies(dependencies):
36-
"""Some tests are defined with dependencies that require an auth token to download them.
37-
In this test context, we substitute public files instead."""
38-
for k, d in dependencies.items():
39-
if isinstance(d, WebData):
40-
new_dependency = WebData(
41-
source_url=demo_prompt_set_url(d.source_url),
42-
headers=None,
43-
decompressor=d.decompressor,
44-
unpacker=d.unpacker,
45-
)
46-
dependencies[k] = new_dependency
47-
return dependencies
48-
49-
5032
@pytest.fixture(scope="session")
5133
def shared_run_dir(tmp_path_factory):
5234
# Create a single tmpdir and have all `make_test_items` share it.
@@ -64,7 +46,7 @@ def shared_run_dir(tmp_path_factory):
6446
@flaky
6547
@pytest.mark.parametrize("test_name", [key for key, _ in TESTS.items() if key not in TOO_SLOW])
6648
def test_all_tests_make_test_items(test_name, shared_run_dir):
67-
test = TESTS.make_instance(test_name, secrets=_FAKE_SECRETS)
49+
test = TESTS.make_instance(test_name, secrets=load_secrets_from_config())
6850

6951
# TODO remove when localized files are handled better
7052
# workaround
@@ -73,10 +55,9 @@ def test_all_tests_make_test_items(test_name, shared_run_dir):
7355

7456
if isinstance(test, PromptResponseTest):
7557
test_data_path = os.path.join(shared_run_dir, test.__class__.__name__)
76-
dependencies = ensure_public_dependencies(test.get_dependencies())
7758
dependency_helper = FromSourceDependencyHelper(
7859
test_data_path,
79-
dependencies,
60+
test.get_dependencies(),
8061
required_versions={},
8162
)
8263
test_items = test.make_test_items(dependency_helper)

src/modelgauge/suts/meta_llama_client.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,8 @@ def translate_response(self, request: MetaLlamaChatRequest, response: MetaLlamaM
115115
return SUTResponse(text=text)
116116

117117

118-
CHAT_MODELS = ["Llama-4-Scout-17B-16E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-3.3-8B-Instruct"]
118+
# Unauthorized models: ["Llama-4-Scout-17B-16E-Instruct-FP8"]
119+
CHAT_MODELS = ["Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-3.3-8B-Instruct"]
119120

120121
for model_name in CHAT_MODELS:
121122
SUTS.register(MetaLlamaSUT, "meta-" + model_name.lower() + "-llama", model_name, InjectSecret(MetaLlamaApiKey))

tests/modelbench_tests/test_run.py

Lines changed: 0 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,4 @@
1-
import functools
21
import math
3-
import os
42
import pathlib
53
from datetime import datetime
64
from typing import List, Mapping, Sequence
@@ -23,7 +21,6 @@
2321
from modelbench.scoring import ValueEstimate
2422
from modelgauge.base_test import PromptResponseTest
2523
from modelgauge.preflight import make_sut
26-
from modelgauge.config import SECRETS_PATH
2724
from modelgauge.dynamic_sut_factory import ModelNotSupportedError, ProviderNotFoundError, UnknownSUTMakerError
2825
from modelgauge.locales import DEFAULT_LOCALE, EN_US, FR_FR, LOCALES
2926
from modelgauge.prompt_sets import PROMPT_SETS
@@ -32,8 +29,6 @@
3229
from modelgauge.sut import PromptResponseSUT
3330
from modelgauge_tests.fake_sut import FakeSUT
3431

35-
TEST_SECRETS_PATH = os.path.join("tests", "config", "secrets.toml")
36-
3732

3833
class AHazard(HazardDefinition):
3934
def tests(self, secrets: RawSecrets) -> List[PromptResponseTest]:
@@ -111,33 +106,6 @@ def mock_score(
111106
datetime.now(),
112107
)
113108

114-
def manage_test_secrets(func):
115-
"""Decorator that manages test secrets during test execution.
116-
117-
1. If a secrets file exists, it's backed up
118-
2. The test secrets file is copied to the expected location
119-
3. After the test completes, the original state is restored
120-
"""
121-
122-
@functools.wraps(func)
123-
def wrapper(*args, **kwargs):
124-
secrets_src = pathlib.Path(TEST_SECRETS_PATH)
125-
secrets_dst = pathlib.Path(SECRETS_PATH)
126-
backup_dst = secrets_dst.with_suffix(".bak")
127-
128-
if secrets_dst.exists():
129-
secrets_dst.replace(backup_dst)
130-
secrets_src.replace(secrets_dst)
131-
132-
try:
133-
return func(*args, **kwargs)
134-
finally:
135-
secrets_dst.replace(secrets_src)
136-
if backup_dst.exists():
137-
backup_dst.replace(secrets_dst)
138-
139-
return wrapper
140-
141109
@pytest.fixture(autouse=False)
142110
def mock_run_benchmarks(self, sut, monkeypatch, tmp_path):
143111
mock = MagicMock(return_value=fake_benchmark_run(AHazard(), sut, tmp_path))
@@ -168,7 +136,6 @@ def runner(self):
168136
],
169137
# TODO add more locales as we add support for them
170138
)
171-
@manage_test_secrets
172139
def test_benchmark_basic_run_produces_json(
173140
self, runner, mock_run_benchmarks, mock_score_benchmarks, sut_uid, version, locale, prompt_set, tmp_path
174141
):
@@ -232,7 +199,6 @@ def test_security_benchmark_basic_run_produces_json(
232199
],
233200
# TODO add more locales as we add support for them
234201
)
235-
@manage_test_secrets
236202
def test_benchmark_multiple_suts_produces_json(
237203
self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, tmp_path, monkeypatch
238204
):
@@ -368,7 +334,6 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben
368334
# benchmark_arg = mock_score_benchmarks.call_args.args[0][0]
369335
# assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark)
370336

371-
@manage_test_secrets
372337
def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid):
373338
result = runner.invoke(cli, ["benchmark", "--sut", sut_uid])
374339

@@ -383,7 +348,6 @@ def test_nonexistent_benchmark_prompt_sets_can_not_be_called(self, runner, sut_u
383348
assert "Invalid value for '--prompt-set'" in result.output
384349

385350
@pytest.mark.parametrize("prompt_set", PROMPT_SETS.keys())
386-
@manage_test_secrets
387351
def test_calls_score_benchmark_with_correct_prompt_set(self, runner, mock_run_benchmarks, prompt_set, sut_uid):
388352
result = runner.invoke(cli, ["benchmark", "--prompt-set", prompt_set, "--sut", sut_uid])
389353

0 commit comments

Comments
 (0)