From 3c8096e16ab01630c5ffb3d6c4c36b19340fb172 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Tue, 10 Mar 2026 12:59:16 -0700 Subject: [PATCH 01/15] auto collect factories --- src/modelgauge/dynamic_sut_factory.py | 3 +++ src/modelgauge/sut_factory.py | 26 +++---------------- src/modelgauge/suts/anthropic_sut_factory.py | 2 ++ src/modelgauge/suts/google_sut_factory.py | 4 +-- .../suts/huggingface_sut_factory.py | 6 ++++- src/modelgauge/suts/indirect_sut.py | 1 + src/modelgauge/suts/mistral_sut_factory.py | 2 ++ src/modelgauge/suts/modelship_sut.py | 2 ++ src/modelgauge/suts/openai_sut_factory.py | 2 +- src/modelgauge/suts/together_sut_factory.py | 6 ++--- 10 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/modelgauge/dynamic_sut_factory.py b/src/modelgauge/dynamic_sut_factory.py index 4511385c4..892af64ed 100644 --- a/src/modelgauge/dynamic_sut_factory.py +++ b/src/modelgauge/dynamic_sut_factory.py @@ -28,8 +28,11 @@ class UnknownSUTMakerError(Exception): class DynamicSUTFactory(ABC): + DRIVER_NAME: str # Must be set by subclasses. 
+ def __init__(self, raw_secrets: RawSecrets): self.raw_secrets = raw_secrets + # assert self.DRIVER_NAME, "DynamicSUTFactory subclasses must set DRIVER_NAME" def injected_secrets(self): """Return the injected secrets as specified by `get_secrets`.""" diff --git a/src/modelgauge/sut_factory.py b/src/modelgauge/sut_factory.py index 2581cab03..771f65c9e 100644 --- a/src/modelgauge/sut_factory.py +++ b/src/modelgauge/sut_factory.py @@ -2,18 +2,11 @@ from modelgauge.config import load_secrets_from_config from modelgauge.dynamic_sut_factory import DynamicSUTFactory, UnknownSUTMakerError +from modelgauge.general import get_concrete_subclasses from modelgauge.secret_values import RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.sut_registry import SUTS -from modelgauge.suts.anthropic_sut_factory import AnthropicSUTFactory -from modelgauge.suts.google_sut_factory import GoogleSUTFactory -from modelgauge.suts.huggingface_sut_factory import HuggingFaceSUTFactory -from modelgauge.suts.indirect_sut import IndirectSUTFactory -from modelgauge.suts.mistral_sut_factory import MistralSUTFactory -from modelgauge.suts.modelship_sut import ModelShipSUTFactory -from modelgauge.suts.openai_sut_factory import OpenAICompatibleSUTFactory -from modelgauge.suts.together_sut_factory import TogetherSUTFactory class SUTNotFoundException(Exception): @@ -26,21 +19,8 @@ class SUTType(Enum): UNKNOWN = "unknown" -# TODO: Auto-collect? -# Make sure the factory module includes the matching key as a constant. 
-# Maps a string to the module and factory function in that module -# that can be used to create a dynamic sut -DYNAMIC_SUT_FACTORIES: dict = { - "anthropic": AnthropicSUTFactory, - "google": GoogleSUTFactory, - "hf": HuggingFaceSUTFactory, - "hfrelay": HuggingFaceSUTFactory, - "indirect": IndirectSUTFactory, - "openai": OpenAICompatibleSUTFactory, - "mistral": MistralSUTFactory, - "modelship": ModelShipSUTFactory, - "together": TogetherSUTFactory, -} +DYNAMIC_SUT_FACTORIES: dict = {cls.DRIVER_NAME: cls for cls in get_concrete_subclasses(DynamicSUTFactory)} # type: ignore + LEGACY_SUT_MODULE_MAP = { # HuggingFaceChatCompletionDedicatedSUT and HuggingFaceChatCompletionServerlessSUT diff --git a/src/modelgauge/suts/anthropic_sut_factory.py b/src/modelgauge/suts/anthropic_sut_factory.py index 582b438b4..d989110c7 100644 --- a/src/modelgauge/suts/anthropic_sut_factory.py +++ b/src/modelgauge/suts/anthropic_sut_factory.py @@ -12,6 +12,8 @@ class AnthropicSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "anthropic" + def get_secrets(self) -> list[InjectSecret]: api_key = InjectSecret(AnthropicApiKey) return [api_key] diff --git a/src/modelgauge/suts/google_sut_factory.py b/src/modelgauge/suts/google_sut_factory.py index 588105c47..34f06b0c2 100644 --- a/src/modelgauge/suts/google_sut_factory.py +++ b/src/modelgauge/suts/google_sut_factory.py @@ -8,10 +8,10 @@ from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.google_genai import GoogleGenAiSUT, GoogleAiApiKey -DRIVER_NAME = "google" - class GoogleSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "google" + def get_secrets(self) -> list[InjectSecret]: api_key = InjectSecret(GoogleAiApiKey) return [api_key] diff --git a/src/modelgauge/suts/huggingface_sut_factory.py b/src/modelgauge/suts/huggingface_sut_factory.py index e84f9577b..3c4b53b65 100644 --- a/src/modelgauge/suts/huggingface_sut_factory.py +++ b/src/modelgauge/suts/huggingface_sut_factory.py @@ -13,7 +13,7 @@ 
HuggingFaceChatCompletionServerlessSUT, ) -DRIVER_NAME = "hfrelay" +HF_DRIVER_NAME = "hf" logger = get_logger(__name__) # Set HF logging to ERROR because its default logger level is DEBUG. @@ -22,6 +22,8 @@ class HuggingFaceSUTFactory(DynamicSUTFactory): + DRIVER_NAME = HF_DRIVER_NAME + def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) self.serverless_factory = HuggingFaceChatCompletionServerlessSUTFactory(raw_secrets) @@ -45,6 +47,7 @@ def make_sut(self, sut_definition: SUTDefinition) -> BaseHuggingFaceChatCompleti class HuggingFaceChatCompletionServerlessSUTFactory(DynamicSUTFactory): + DRIVER_NAME = HF_DRIVER_NAME def get_secrets(self) -> list[InjectSecret]: hf_token = InjectSecret(HuggingFaceInferenceToken) @@ -89,6 +92,7 @@ def make_sut(self, sut_definition: SUTDefinition) -> HuggingFaceChatCompletionSe class HuggingFaceChatCompletionDedicatedSUTFactory(DynamicSUTFactory): + DRIVER_NAME = HF_DRIVER_NAME def get_secrets(self) -> list[InjectSecret]: hf_token = InjectSecret(HuggingFaceInferenceToken) diff --git a/src/modelgauge/suts/indirect_sut.py b/src/modelgauge/suts/indirect_sut.py index e27408ff1..da2a98225 100644 --- a/src/modelgauge/suts/indirect_sut.py +++ b/src/modelgauge/suts/indirect_sut.py @@ -138,6 +138,7 @@ def start(): class IndirectSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "indirect" def get_secrets(self) -> list[InjectSecret]: return [] diff --git a/src/modelgauge/suts/mistral_sut_factory.py b/src/modelgauge/suts/mistral_sut_factory.py index e79cd8c2a..bdb2db592 100644 --- a/src/modelgauge/suts/mistral_sut_factory.py +++ b/src/modelgauge/suts/mistral_sut_factory.py @@ -7,6 +7,8 @@ class MistralSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "mistral" + def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) self._client = None # Lazy load. 
diff --git a/src/modelgauge/suts/modelship_sut.py b/src/modelgauge/suts/modelship_sut.py index 3d10551e6..b25eaf27c 100644 --- a/src/modelgauge/suts/modelship_sut.py +++ b/src/modelgauge/suts/modelship_sut.py @@ -35,6 +35,8 @@ def request_as_dict_for_client(self, request: OpenAIChatRequest) -> dict[str, An class ModelShipSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "modelship" + def get_secrets(self) -> list[InjectSecret]: api_key = InjectSecret(ModelShipSecret) return [api_key] diff --git a/src/modelgauge/suts/openai_sut_factory.py b/src/modelgauge/suts/openai_sut_factory.py index cf24045bc..12ddbcacd 100644 --- a/src/modelgauge/suts/openai_sut_factory.py +++ b/src/modelgauge/suts/openai_sut_factory.py @@ -6,11 +6,11 @@ from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.openai_client import OpenAIChat -DRIVER_NAME = "openai" NUM_RETRIES = 7 class OpenAICompatibleSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "openai" def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) diff --git a/src/modelgauge/suts/together_sut_factory.py b/src/modelgauge/suts/together_sut_factory.py index 0e2292870..c46711796 100644 --- a/src/modelgauge/suts/together_sut_factory.py +++ b/src/modelgauge/suts/together_sut_factory.py @@ -7,10 +7,10 @@ from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.together_client import TogetherChatSUT -DRIVER_NAME = "together" - class TogetherSUTFactory(DynamicSUTFactory): + DRIVER_NAME = "together" + def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) self._client = None # Lazy load. @@ -50,7 +50,7 @@ def make_sut(self, sut_definition: SUTDefinition) -> TogetherChatSUT: f"Model {sut_metadata.external_model_name()} not found or not available on together." 
) - assert sut_metadata.driver == DRIVER_NAME + assert sut_metadata.driver == self.DRIVER_NAME return TogetherChatSUT( sut_definition.dynamic_uid, sut_metadata.external_model_name(), From caf625376d0eff9feb21b71d858438b70071103c Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Tue, 10 Mar 2026 13:03:02 -0700 Subject: [PATCH 02/15] remove comment --- src/modelgauge/dynamic_sut_factory.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/modelgauge/dynamic_sut_factory.py b/src/modelgauge/dynamic_sut_factory.py index 892af64ed..6be26b64e 100644 --- a/src/modelgauge/dynamic_sut_factory.py +++ b/src/modelgauge/dynamic_sut_factory.py @@ -28,11 +28,10 @@ class UnknownSUTMakerError(Exception): class DynamicSUTFactory(ABC): - DRIVER_NAME: str # Must be set by subclasses. + DRIVER_NAME: str # Must be set by subclasses in order for auto collection to work. def __init__(self, raw_secrets: RawSecrets): self.raw_secrets = raw_secrets - # assert self.DRIVER_NAME, "DynamicSUTFactory subclasses must set DRIVER_NAME" def injected_secrets(self): """Return the injected secrets as specified by `get_secrets`.""" From 465448fb4e0648acefb3993e20a876fe31dbefd3 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Tue, 10 Mar 2026 13:14:11 -0700 Subject: [PATCH 03/15] dont use real suts in unit tests --- tests/modelbench_tests/test_run.py | 48 ++++-------------------------- 1 file changed, 5 insertions(+), 43 deletions(-) diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index b842d55f4..09ffaa68d 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -316,7 +316,7 @@ def invoke(command, args=None, **kwargs): ], # TODO add more locales as we add support for them ) - @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"]) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_benchmark_basic_run_produces_json( self, monkeypatch, @@ -396,7 +396,7 @@ def 
test_benchmark_basic_run_produces_json( ], # TODO add more locales as we add support for them ) - @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay;mt=500;t=0.3"]) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_benchmark_multiple_suts_produces_json( self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, run_dir, monkeypatch ): @@ -488,44 +488,6 @@ def test_benchmark_bad_sut_errors_out(self, runner): catch_exceptions=False, ) - with patch( - "modelgauge.suts.huggingface_sut_factory.HuggingFaceChatCompletionServerlessSUTFactory._find", - side_effect=ProviderNotFoundError("bad provider"), - ): - with pytest.raises(ModelNotSupportedError): - _ = runner( - cli, - [ - "benchmark", - "general", - "-m", - "1", - "--sut", - "meta/llama:notreal:hfrelay", - *benchmark_options, - ], - catch_exceptions=False, - ) - - with patch( - "modelgauge.suts.huggingface_sut_factory.hfh.model_info", - side_effect=ModelNotSupportedError("bad model"), - ): - with pytest.raises(ModelNotSupportedError): - _ = runner( - cli, - [ - "benchmark", - "general", - "-m", - "1", - "--sut", - "google/bogus:cohere:hfrelay", - *benchmark_options, - ], - catch_exceptions=False, - ) - @pytest.mark.parametrize("version", ["0.0", "0.5"]) def test_invalid_benchmark_versions_can_not_be_called(self, version, runner): result = runner(cli, ["benchmark", "general", "--version", "0.0"]) @@ -546,7 +508,7 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben # # benchmark_arg = mock_score_benchmarks.call_args.args[0][0] # assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark) - @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"]) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid): _ = runner(cli, ["benchmark", "general", "--sut", sut_uid]) @@ -555,14 +517,14 @@ def 
test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid): assert benchmark_arg.locale == EN_US assert benchmark_arg.prompt_set == "demo" - @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"]) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_nonexistent_benchmark_prompt_sets_can_not_be_called(self, runner, sut_uid): result = runner(cli, ["benchmark", "general", "--prompt-set", "fake", "--sut", sut_uid]) assert result.exit_code == 2 assert "Invalid value for '--prompt-set'" in result.output @pytest.mark.parametrize("prompt_set", GENERAL_PROMPT_SETS.keys()) - @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"]) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_calls_score_benchmark_with_correct_prompt_set(self, runner, mock_run_benchmarks, prompt_set, sut_uid): _ = runner(cli, ["benchmark", "general", "--prompt-set", prompt_set, "--sut", sut_uid]) From 78f71f0c215b067ff455ac2191e5f39856b3584f Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Wed, 11 Mar 2026 12:42:08 -0700 Subject: [PATCH 04/15] actually collect factories --- src/modelgauge/sut_factory.py | 10 ++++++---- src/modelgauge/suts/together_cli.py | 30 ----------------------------- 2 files changed, 6 insertions(+), 34 deletions(-) delete mode 100644 src/modelgauge/suts/together_cli.py diff --git a/src/modelgauge/sut_factory.py b/src/modelgauge/sut_factory.py index 771f65c9e..d6a4ca650 100644 --- a/src/modelgauge/sut_factory.py +++ b/src/modelgauge/sut_factory.py @@ -19,9 +19,6 @@ class SUTType(Enum): UNKNOWN = "unknown" -DYNAMIC_SUT_FACTORIES: dict = {cls.DRIVER_NAME: cls for cls in get_concrete_subclasses(DynamicSUTFactory)} # type: ignore - - LEGACY_SUT_MODULE_MAP = { # HuggingFaceChatCompletionDedicatedSUT and HuggingFaceChatCompletionServerlessSUT "nvidia-llama-3-1-nemotron-nano-8b-v1": "huggingface_chat_completion", @@ -130,8 +127,13 @@ def __init__(self, sut_registry): 
self.dynamic_sut_factories = self._load_dynamic_sut_factories(load_secrets_from_config()) def _load_dynamic_sut_factories(self, secrets: RawSecrets) -> dict[str, DynamicSUTFactory]: + from modelgauge.load_namespaces import load_namespace + + load_namespace("suts") + dynamic_sut_factories: dict = {cls.DRIVER_NAME: cls for cls in get_concrete_subclasses(DynamicSUTFactory)} # type: ignore factories: dict[str, DynamicSUTFactory] = {} - for driver, factory_class in DYNAMIC_SUT_FACTORIES.items(): + + for driver, factory_class in dynamic_sut_factories.items(): factories[driver] = factory_class(secrets) return factories diff --git a/src/modelgauge/suts/together_cli.py b/src/modelgauge/suts/together_cli.py deleted file mode 100644 index 73335cfd7..000000000 --- a/src/modelgauge/suts/together_cli.py +++ /dev/null @@ -1,30 +0,0 @@ -import together # type: ignore -from collections import defaultdict -from modelgauge.command_line import display_header, display_list_item, cli -from modelgauge.config import load_secrets_from_config -from modelgauge.suts.together_client import TogetherApiKey - - -@cli.command() -def list_together(): - """List all models available in together.ai.""" - - secrets = load_secrets_from_config() - together.api_key = TogetherApiKey.make(secrets).value - model_list = together.Models.list() - - # Group by display_type, which seems to be the model's style. 
- by_display_type = defaultdict(list) - for model in model_list: - try: - display_type = model["display_type"] - except KeyError: - display_type = "unknown" - display_name = model["display_name"] - by_display_type[display_type].append(f"{display_name}: {model['name']}") - - for display_name, models in by_display_type.items(): - display_header(f"{display_name}: {len(models)}") - for model in sorted(models): - display_list_item(model) - display_header(f"Total: {len(model_list)}") From f7bd679cebdd17cd6b2e658ecc6110f8eabc763c Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 12 Mar 2026 12:11:56 -0700 Subject: [PATCH 05/15] New type of factory object for auto collection --- src/modelgauge/dynamic_sut_factory.py | 14 ++++++++-- src/modelgauge/sut_factory.py | 18 ++++++------- src/modelgauge/suts/anthropic_sut_factory.py | 4 +-- src/modelgauge/suts/google_sut_factory.py | 4 +-- .../suts/huggingface_sut_factory.py | 17 ++++++------ src/modelgauge/suts/mistral_sut_factory.py | 4 +-- src/modelgauge/suts/openai_sut_factory.py | 19 ++++++++----- src/modelgauge/suts/together_sut_factory.py | 4 +-- .../sut_tests/test_openai_sut_factory.py | 2 +- .../test_dynamic_sut_factory.py | 27 ++++++++++++++++++- tests/modelgauge_tests/test_sut_factory.py | 7 +++++ 11 files changed, 84 insertions(+), 36 deletions(-) diff --git a/src/modelgauge/dynamic_sut_factory.py b/src/modelgauge/dynamic_sut_factory.py index 6be26b64e..70052de1d 100644 --- a/src/modelgauge/dynamic_sut_factory.py +++ b/src/modelgauge/dynamic_sut_factory.py @@ -28,8 +28,6 @@ class UnknownSUTMakerError(Exception): class DynamicSUTFactory(ABC): - DRIVER_NAME: str # Must be set by subclasses in order for auto collection to work. 
- def __init__(self, raw_secrets: RawSecrets): self.raw_secrets = raw_secrets @@ -44,3 +42,15 @@ def get_secrets(self) -> list[InjectSecret]: @abstractmethod def make_sut(self, sut_definition: SUTDefinition) -> SUT: pass + + +class DynamicSUTFactoryDriver(DynamicSUTFactory, ABC): + """These classes will be collected as driver factories for dynamic SUTs. They may call regular DynamicSUTFactories.""" + + DRIVER_NAME: str + + def __init__(self, raw_secrets: RawSecrets): + super().__init__(raw_secrets) + assert hasattr(self, "DRIVER_NAME") and isinstance( + self.DRIVER_NAME, str + ), "DynamicSUTFactoryDriver subclasses must have a DRIVER_NAME attribute" diff --git a/src/modelgauge/sut_factory.py b/src/modelgauge/sut_factory.py index d6a4ca650..a05a62ec8 100644 --- a/src/modelgauge/sut_factory.py +++ b/src/modelgauge/sut_factory.py @@ -1,8 +1,9 @@ from enum import Enum from modelgauge.config import load_secrets_from_config -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, UnknownSUTMakerError +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, UnknownSUTMakerError from modelgauge.general import get_concrete_subclasses +from modelgauge.load_namespaces import load_namespace from modelgauge.secret_values import RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition @@ -126,16 +127,15 @@ def __init__(self, sut_registry): self.sut_registry = sut_registry self.dynamic_sut_factories = self._load_dynamic_sut_factories(load_secrets_from_config()) - def _load_dynamic_sut_factories(self, secrets: RawSecrets) -> dict[str, DynamicSUTFactory]: - from modelgauge.load_namespaces import load_namespace - + def _load_dynamic_sut_factories(self, secrets: RawSecrets) -> dict[str, DynamicSUTFactoryDriver]: load_namespace("suts") - dynamic_sut_factories: dict = {cls.DRIVER_NAME: cls for cls in get_concrete_subclasses(DynamicSUTFactory)} # type: ignore - factories: dict[str, DynamicSUTFactory] = {} + dynamic_sut_factories = 
{} + for cls in get_concrete_subclasses(DynamicSUTFactoryDriver): # type: ignore + if cls.DRIVER_NAME in dynamic_sut_factories: + raise ValueError(f"Multiple DynamicSUTFactoryDrivers have the same DRIVER_NAME '{cls.DRIVER_NAME}'.") + dynamic_sut_factories[cls.DRIVER_NAME] = cls(secrets) - for driver, factory_class in dynamic_sut_factories.items(): - factories[driver] = factory_class(secrets) - return factories + return dynamic_sut_factories def knows(self, uid: str) -> bool: """Check if the registry knows about a given SUT UID. Dynamic SUTs are always considered known.""" diff --git a/src/modelgauge/suts/anthropic_sut_factory.py b/src/modelgauge/suts/anthropic_sut_factory.py index d989110c7..991230f3d 100644 --- a/src/modelgauge/suts/anthropic_sut_factory.py +++ b/src/modelgauge/suts/anthropic_sut_factory.py @@ -4,14 +4,14 @@ from anthropic import Anthropic -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError from modelgauge.secret_values import RawSecrets, InjectSecret from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.anthropic_api import AnthropicApiKey, AnthropicSUT -class AnthropicSUTFactory(DynamicSUTFactory): +class AnthropicSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "anthropic" def get_secrets(self) -> list[InjectSecret]: diff --git a/src/modelgauge/suts/google_sut_factory.py b/src/modelgauge/suts/google_sut_factory.py index 34f06b0c2..bfb65fe54 100644 --- a/src/modelgauge/suts/google_sut_factory.py +++ b/src/modelgauge/suts/google_sut_factory.py @@ -2,14 +2,14 @@ from google import genai -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError from modelgauge.secret_values import RawSecrets, InjectSecret from modelgauge.sut import SUT from 
modelgauge.sut_definition import SUTDefinition from modelgauge.suts.google_genai import GoogleGenAiSUT, GoogleAiApiKey -class GoogleSUTFactory(DynamicSUTFactory): +class GoogleSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "google" def get_secrets(self) -> list[InjectSecret]: diff --git a/src/modelgauge/suts/huggingface_sut_factory.py b/src/modelgauge/suts/huggingface_sut_factory.py index 3c4b53b65..34b2db4f9 100644 --- a/src/modelgauge/suts/huggingface_sut_factory.py +++ b/src/modelgauge/suts/huggingface_sut_factory.py @@ -4,7 +4,12 @@ from airrlogger.log_config import get_logger from modelgauge.auth.huggingface_inference_token import HuggingFaceInferenceToken -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError, ProviderNotFoundError +from modelgauge.dynamic_sut_factory import ( + DynamicSUTFactory, + DynamicSUTFactoryDriver, + ModelNotSupportedError, + ProviderNotFoundError, +) from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.huggingface_chat_completion import ( @@ -13,16 +18,14 @@ HuggingFaceChatCompletionServerlessSUT, ) -HF_DRIVER_NAME = "hf" - logger = get_logger(__name__) # Set HF logging to ERROR because its default logger level is DEBUG. # There are also many warnings which are not really actionable and very repetitive. 
logging.getLogger("huggingface_hub").setLevel(logging.ERROR) -class HuggingFaceSUTFactory(DynamicSUTFactory): - DRIVER_NAME = HF_DRIVER_NAME +class HuggingFaceSUTFactory(DynamicSUTFactoryDriver): + DRIVER_NAME = "hf" def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) @@ -47,8 +50,6 @@ def make_sut(self, sut_definition: SUTDefinition) -> BaseHuggingFaceChatCompleti class HuggingFaceChatCompletionServerlessSUTFactory(DynamicSUTFactory): - DRIVER_NAME = HF_DRIVER_NAME - def get_secrets(self) -> list[InjectSecret]: hf_token = InjectSecret(HuggingFaceInferenceToken) return [hf_token] @@ -92,8 +93,6 @@ def make_sut(self, sut_definition: SUTDefinition) -> HuggingFaceChatCompletionSe class HuggingFaceChatCompletionDedicatedSUTFactory(DynamicSUTFactory): - DRIVER_NAME = HF_DRIVER_NAME - def get_secrets(self) -> list[InjectSecret]: hf_token = InjectSecret(HuggingFaceInferenceToken) return [hf_token] diff --git a/src/modelgauge/suts/mistral_sut_factory.py b/src/modelgauge/suts/mistral_sut_factory.py index bdb2db592..0b0fe0aa0 100644 --- a/src/modelgauge/suts/mistral_sut_factory.py +++ b/src/modelgauge/suts/mistral_sut_factory.py @@ -1,4 +1,4 @@ -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition @@ -6,7 +6,7 @@ from modelgauge.suts.mistral_sut import MistralAISut -class MistralSUTFactory(DynamicSUTFactory): +class MistralSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "mistral" def __init__(self, raw_secrets: RawSecrets): diff --git a/src/modelgauge/suts/openai_sut_factory.py b/src/modelgauge/suts/openai_sut_factory.py index 12ddbcacd..9deb3cdcf 100644 --- a/src/modelgauge/suts/openai_sut_factory.py +++ b/src/modelgauge/suts/openai_sut_factory.py @@ -1,7 +1,12 @@ from openai import 
OpenAI, NotFoundError from modelgauge.auth.openai_compatible_secrets import OpenAICompatibleApiKey -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError, ProviderNotFoundError +from modelgauge.dynamic_sut_factory import ( + DynamicSUTFactory, + DynamicSUTFactoryDriver, + ModelNotSupportedError, + ProviderNotFoundError, +) from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.openai_client import OpenAIChat @@ -9,9 +14,7 @@ NUM_RETRIES = 7 -class OpenAICompatibleSUTFactory(DynamicSUTFactory): - DRIVER_NAME = "openai" - +class BaseOpenAISUTFactory(DynamicSUTFactory): def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) self.provider = None # must be set in child classes and match name of section (scope) in secrets.toml @@ -32,6 +35,10 @@ def _make_client(self) -> OpenAI: _client = OpenAI(api_key=api_key.value, max_retries=NUM_RETRIES) return _client + +class OpenAICompatibleSUTFactory(BaseOpenAISUTFactory, DynamicSUTFactoryDriver): + DRIVER_NAME = "openai" + def make_sut(self, sut_definition: SUTDefinition) -> OpenAIChat: factory = factory_class = None self.provider = sut_definition.get("provider") # type: ignore @@ -54,7 +61,7 @@ def make_sut(self, sut_definition: SUTDefinition) -> OpenAIChat: return factory.make_sut(sut_definition) -class OpenAISUTFactory(OpenAICompatibleSUTFactory): +class OpenAISUTFactory(BaseOpenAISUTFactory): """OpenAI SUT hosted by OpenAI""" def __init__(self, raw_secrets: RawSecrets): @@ -76,7 +83,7 @@ def make_sut(self, sut_definition: SUTDefinition) -> OpenAIChat: return OpenAIChat(sut_definition.uid, sut_definition.get("model"), client=self.client) # type: ignore -class OpenAIGenericSUTFactory(OpenAICompatibleSUTFactory): +class OpenAIGenericSUTFactory(BaseOpenAISUTFactory): """A SUT that uses the OpenAI client, not hosted by OpenAI""" def __init__(self, raw_secrets: RawSecrets, base_url: str | None = 
None): diff --git a/src/modelgauge/suts/together_sut_factory.py b/src/modelgauge/suts/together_sut_factory.py index c46711796..45381be5a 100644 --- a/src/modelgauge/suts/together_sut_factory.py +++ b/src/modelgauge/suts/together_sut_factory.py @@ -1,14 +1,14 @@ from together import Together # type: ignore from modelgauge.auth.together_key import TogetherApiKey -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError from modelgauge.dynamic_sut_metadata import DynamicSUTMetadata from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.together_client import TogetherChatSUT -class TogetherSUTFactory(DynamicSUTFactory): +class TogetherSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "together" def __init__(self, raw_secrets: RawSecrets): diff --git a/tests/modelgauge_tests/sut_tests/test_openai_sut_factory.py b/tests/modelgauge_tests/sut_tests/test_openai_sut_factory.py index 5c1449589..487c4b5ae 100644 --- a/tests/modelgauge_tests/sut_tests/test_openai_sut_factory.py +++ b/tests/modelgauge_tests/sut_tests/test_openai_sut_factory.py @@ -120,7 +120,7 @@ def test_factory_tries_to_make_a_generic_sut(factory, sut_definition): def test_factory_makes_the_right_openai_sut(factory): - with patch("modelgauge.suts.openai_sut_factory.OpenAICompatibleSUTFactory._make_client"): + with patch("modelgauge.suts.openai_sut_factory.BaseOpenAISUTFactory._make_client"): sut_definition = SUTDefinition(model="gpt-5", maker="openai", driver="openai") sut = factory.make_sut(sut_definition) assert sut.uid == "openai/gpt-5:openai" diff --git a/tests/modelgauge_tests/test_dynamic_sut_factory.py b/tests/modelgauge_tests/test_dynamic_sut_factory.py index 09fb9ab8a..98642e69c 100644 --- a/tests/modelgauge_tests/test_dynamic_sut_factory.py +++ b/tests/modelgauge_tests/test_dynamic_sut_factory.py @@ 
-1,6 +1,6 @@ import pytest -from modelgauge.dynamic_sut_factory import DynamicSUTFactory +from modelgauge.dynamic_sut_factory import DynamicSUTFactory, DynamicSUTFactoryDriver from modelgauge.sut_definition import SUTDefinition from modelgauge.secret_values import InjectSecret from modelgauge_tests.fake_sut import FakeSUT @@ -41,3 +41,28 @@ def test_injected_secrets_missing_required(): factory = FakeDynamicFactory({"optional-scope": {"optional-key": "optional-value"}}) with pytest.raises(MissingSecretValues): factory.injected_secrets() + + +def test_dynamic_sut_factory_driver_instantiation(): + class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + pass + + with pytest.raises(AssertionError): + MyDriverFactory({}) + + class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + DRIVER_NAME: str + + with pytest.raises(AssertionError): + MyDriverFactory({}) + + class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + DRIVER_NAME = None + + with pytest.raises(AssertionError): + MyDriverFactory({}) + + class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + DRIVER_NAME = "driver" + + factory = MyDriverFactory({}) diff --git a/tests/modelgauge_tests/test_sut_factory.py b/tests/modelgauge_tests/test_sut_factory.py index 6bba93593..a628cce84 100644 --- a/tests/modelgauge_tests/test_sut_factory.py +++ b/tests/modelgauge_tests/test_sut_factory.py @@ -56,6 +56,13 @@ def test_make_instance_preregistered(sut_factory): assert isinstance(sut, SUT) +# def test_load_dynamic_sut_factories(): +# class MainFactory(SUTFactory): +# with patch("modelgauge.general.get_concrete_subclasses", return_value={TestFactory}): +# factory = SUTFactory() +# assert factory.dynamic_sut_factories == {} + + def test_make_instance_dynamic(sut_factory_dynamic): sut = sut_factory_dynamic.make_instance("google/gemma:driver1", secrets={}) assert isinstance(sut, FakeSUT) From 2dca4e8bb36905001f0c617ccd6fe1845a05e6e0 Mon Sep 17 00:00:00 2001 From: Barbara 
Korycki Date: Thu, 12 Mar 2026 12:21:11 -0700 Subject: [PATCH 06/15] update readme --- docs/add-a-new-sut-driver.md | 35 +++++++---------------------------- docs/suts-how-to.md | 21 --------------------- 2 files changed, 7 insertions(+), 49 deletions(-) diff --git a/docs/add-a-new-sut-driver.md b/docs/add-a-new-sut-driver.md index 853a07400..f7699c759 100644 --- a/docs/add-a-new-sut-driver.md +++ b/docs/add-a-new-sut-driver.md @@ -7,19 +7,8 @@ Most providers need their own driver. We provide several drivers that can be use ### Does an Existing Driver Exist? -If your SUT provider is listed as a key in the `DYNAMIC_SUT_FACTORIES` in -[sut_factory](../src/modelgauge/sut_factory.py), you don't need to write any code. - -```python -DYNAMIC_SUT_FACTORIES: dict = { - "hf": HuggingFaceSUTFactory, - "hfrelay": HuggingFaceSUTFactory, - "openai": OpenAICompatibleSUTFactory, - "together": TogetherSUTFactory, - "modelship": ModelShipSUTFactory, -} -``` -Please refer to [suts-how-to.md](./suts-how-to.md#existing) for details. +Search the existing `DynamicSUTFactoryDriver` classes; +if one already exists for your provider, you don't need to write any code. ### Is Your SUT Already Pre-Defined? @@ -79,13 +68,11 @@ class MySUT(PromptResponseSUT): return MySUTResponse(**response_json) ``` -2. Create a factory class that creates an instance of your SUT from its UID. Look at [TogetherSUTFactory](../src/modelgauge/suts/together_sut_factory.py) for inspiration. +2. Create a factory class that creates an instance of your SUT from its UID. Look at [TogetherSUTFactoryDriver](../src/modelgauge/suts/together_sut_factory.py) for inspiration. The `DRIVER_NAME` constant must be unique to your driver. It will be a key in a dict. 
```python -DRIVER_NAME = "my_sut" - class MySUTApiKey(RequiredSecret): # adjust this to your specific provider @classmethod @@ -95,7 +82,9 @@ class MySUTApiKey(RequiredSecret): key="api_key" ) -class MySUTFactory(DynamicSUTFactory): +class MySUTFactory(DynamicSUTFactoryDriver): + DRIVER_NAME = "my_sut" + def __init__(self, raw_secrets: RawSecrets): # RawSecrets is a dict of secrets super().__init__(raw_secrets) @@ -112,17 +101,7 @@ class MySUTFactory(DynamicSUTFactory): ) ``` -3. Add an entry for your new factory class in the `DYNAMIC_SUT_FACTORIES` dict in [sut_factory](../src/modelgauge/sut_factory.py). - -```python -DYNAMIC_SUT_FACTORIES: dict = { - ... - "my_sut": MySUTFactory, - ... -} -``` - -4. Add a scope to [config/secrets.toml](../config/secrets.toml) for your provider, using the `scope` you defined in the `Secret` class(es) for your SUT: +3. Add a scope to [config/secrets.toml](../config/secrets.toml) for your provider, using the `scope` you defined in the `Secret` class(es) for your SUT: ```toml [my_host] diff --git a/docs/suts-how-to.md b/docs/suts-how-to.md index b0bfd0b43..02df7ff40 100644 --- a/docs/suts-how-to.md +++ b/docs/suts-how-to.md @@ -32,27 +32,6 @@ A lot of new SUTs will require no code if your model is hosted on one of the pro Factory classes are used to create SUT objects for you, including their driver and model name, based the elements in the SUT UID. -Available drivers are identified in `DYNAMIC_SUT_FACTORIES` in -[sut_factory](../src/modelgauge/sut_factory.py). The keys correspond to the `driver` string in the SUT UID. - -We may add more drivers from time to time. 
- -```python -DYNAMIC_SUT_FACTORIES: dict = { - "hf": HuggingFaceSUTFactory, - "hfrelay": HuggingFaceSUTFactory, - "openai": OpenAICompatibleSUTFactory, - "together": TogetherSUTFactory, - "modelship": ModelShipSUTFactory, -} -``` - -* "hf" is used for models hosted by Huggingface -* "hfrelay" is used for models hosted by one of Huggingface's inference provider partners (e.g. nebius, sambanova) and proxied by Huggingface ([more info](https://huggingface.co/docs/inference-providers/en/index)) -* "openai" is a model hosted by OpenAI -* "together" is a model hosted by together.ai -* "modelship" is internal to MLCommons - #### Usage For models using one of those drivers, all you need is to add your credentials to [config/secrets.toml](../config/secrets.toml) in a section named after the driver name string, e.g. for together.ai: From 3ac320b8f8330368728dcb7ce92993918353067d Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 12 Mar 2026 12:22:54 -0700 Subject: [PATCH 07/15] remove commented out code --- tests/modelgauge_tests/test_sut_factory.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/modelgauge_tests/test_sut_factory.py b/tests/modelgauge_tests/test_sut_factory.py index a628cce84..6bba93593 100644 --- a/tests/modelgauge_tests/test_sut_factory.py +++ b/tests/modelgauge_tests/test_sut_factory.py @@ -56,13 +56,6 @@ def test_make_instance_preregistered(sut_factory): assert isinstance(sut, SUT) -# def test_load_dynamic_sut_factories(): -# class MainFactory(SUTFactory): -# with patch("modelgauge.general.get_concrete_subclasses", return_value={TestFactory}): -# factory = SUTFactory() -# assert factory.dynamic_sut_factories == {} - - def test_make_instance_dynamic(sut_factory_dynamic): sut = sut_factory_dynamic.make_instance("google/gemma:driver1", secrets={}) assert isinstance(sut, FakeSUT) From 33e5f1ce5de4d7eb8b4b420f193469fae6c219d9 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 12 Mar 2026 12:27:47 -0700 Subject: [PATCH 08/15] update 
llama factory --- src/modelgauge/suts/meta_llama_factory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/modelgauge/suts/meta_llama_factory.py b/src/modelgauge/suts/meta_llama_factory.py index 4bf6558be..fc728d810 100644 --- a/src/modelgauge/suts/meta_llama_factory.py +++ b/src/modelgauge/suts/meta_llama_factory.py @@ -1,13 +1,15 @@ from llama_api_client import LlamaAPIClient -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.meta_llama_client import MetaLlamaApiKey, MetaLlamaSUT -class LlamaSUTFactory(DynamicSUTFactory): +class LlamaSUTFactory(DynamicSUTFactoryDriver): + DRIVER_NAME = "llama" + def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) self._client = None From 23a3a31fb7a535222d087afc290ddc598178a578 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 12 Mar 2026 12:30:53 -0700 Subject: [PATCH 09/15] add modelship --- src/modelgauge/suts/modelship_sut.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modelgauge/suts/modelship_sut.py b/src/modelgauge/suts/modelship_sut.py index b25eaf27c..24d9aae99 100644 --- a/src/modelgauge/suts/modelship_sut.py +++ b/src/modelgauge/suts/modelship_sut.py @@ -1,7 +1,7 @@ from typing import Optional, Mapping, Any from modelgauge.auth.openai_compatible_secrets import OpenAICompatibleApiKey -from modelgauge.dynamic_sut_factory import DynamicSUTFactory +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.openai_client import OpenAIChat, OpenAIChatRequest @@ -34,7 +34,7 @@ def 
request_as_dict_for_client(self, request: OpenAIChatRequest) -> dict[str, An return request_as_dict -class ModelShipSUTFactory(DynamicSUTFactory): +class ModelShipSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "modelship" def get_secrets(self) -> list[InjectSecret]: From 873fac215ccc22d2a21e431fc58d56e6913686d7 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 12 Mar 2026 12:31:24 -0700 Subject: [PATCH 10/15] add indirect sut --- src/modelgauge/suts/indirect_sut.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modelgauge/suts/indirect_sut.py b/src/modelgauge/suts/indirect_sut.py index da2a98225..f3688be97 100644 --- a/src/modelgauge/suts/indirect_sut.py +++ b/src/modelgauge/suts/indirect_sut.py @@ -5,7 +5,7 @@ import uvicorn from pydantic import BaseModel -from modelgauge.dynamic_sut_factory import DynamicSUTFactory +from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver from modelgauge.prompt import TextPrompt from modelgauge.ready import ReadyResponse from modelgauge.secret_values import InjectSecret @@ -137,7 +137,7 @@ def start(): thread.start() -class IndirectSUTFactory(DynamicSUTFactory): +class IndirectSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "indirect" def get_secrets(self) -> list[InjectSecret]: From a1bf68d5cf95c6a6ac26773452f4487c7fe64276 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 12 Mar 2026 16:59:49 -0700 Subject: [PATCH 11/15] update aws factory --- src/modelgauge/suts/aws_bedrock_sut_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modelgauge/suts/aws_bedrock_sut_factory.py b/src/modelgauge/suts/aws_bedrock_sut_factory.py index 774c90d76..df85bf809 100644 --- a/src/modelgauge/suts/aws_bedrock_sut_factory.py +++ b/src/modelgauge/suts/aws_bedrock_sut_factory.py @@ -2,14 +2,14 @@ import boto3 -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import 
DynamicSUTFactoryDriver, ModelNotSupportedError from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.aws_bedrock_client import AmazonBedrockSut, AwsAccessKeyId, AwsSecretAccessKey -class AWSBedrockSUTFactory(DynamicSUTFactory): +class AWSBedrockSUTFactory(DynamicSUTFactoryDriver): DRIVER_NAME = "aws" def __init__(self, raw_secrets: RawSecrets): From 27952d46175a79efb1560da32dab719d224f35cc Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Mon, 16 Mar 2026 09:28:00 -0700 Subject: [PATCH 12/15] rename to DynamicDriverSUTFactory --- src/modelgauge/dynamic_sut_factory.py | 4 ++-- src/modelgauge/sut_factory.py | 6 +++--- src/modelgauge/suts/anthropic_sut_factory.py | 4 ++-- src/modelgauge/suts/aws_bedrock_sut_factory.py | 4 ++-- src/modelgauge/suts/google_sut_factory.py | 4 ++-- src/modelgauge/suts/huggingface_sut_factory.py | 4 ++-- src/modelgauge/suts/indirect_sut.py | 4 ++-- src/modelgauge/suts/meta_llama_factory.py | 4 ++-- src/modelgauge/suts/mistral_sut_factory.py | 4 ++-- src/modelgauge/suts/modelship_sut.py | 4 ++-- src/modelgauge/suts/openai_sut_factory.py | 4 ++-- src/modelgauge/suts/together_sut_factory.py | 4 ++-- tests/modelgauge_tests/test_dynamic_sut_factory.py | 10 +++++----- 13 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/modelgauge/dynamic_sut_factory.py b/src/modelgauge/dynamic_sut_factory.py index 510bbd891..62f37c840 100644 --- a/src/modelgauge/dynamic_sut_factory.py +++ b/src/modelgauge/dynamic_sut_factory.py @@ -45,7 +45,7 @@ def make_sut(self, sut_definition: SUTDefinition) -> SUT: pass -class DynamicSUTFactoryDriver(DynamicSUTFactory, ABC): +class DynamicDriverSUTFactory(DynamicSUTFactory, ABC): """These classes will be collected as driver factories for dynamic SUTs. 
They may call regular DynamicSUTFactories.""" DRIVER_NAME: str @@ -54,4 +54,4 @@ def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) assert hasattr(self, "DRIVER_NAME") and isinstance( self.DRIVER_NAME, str - ), "DynamicSUTFactoryDriver subclasses must have a DRIVER_NAME attribute" + ), "DynamicDriverSUTFactory subclasses must have a DRIVER_NAME attribute" diff --git a/src/modelgauge/sut_factory.py b/src/modelgauge/sut_factory.py index 29c94cd2e..0ab096793 100644 --- a/src/modelgauge/sut_factory.py +++ b/src/modelgauge/sut_factory.py @@ -1,7 +1,7 @@ from enum import Enum from modelgauge.config import load_secrets_from_config -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, UnknownSUTMakerError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, UnknownSUTMakerError from modelgauge.general import get_concrete_subclasses from modelgauge.load_namespaces import load_namespace from modelgauge.secret_values import RawSecrets @@ -131,10 +131,10 @@ def __init__(self, sut_registry): self.sut_registry = sut_registry self.dynamic_sut_factories = self._load_dynamic_sut_factories(load_secrets_from_config()) - def _load_dynamic_sut_factories(self, secrets: RawSecrets) -> dict[str, DynamicSUTFactoryDriver]: + def _load_dynamic_sut_factories(self, secrets: RawSecrets) -> dict[str, DynamicDriverSUTFactory]: load_namespace("suts") dynamic_sut_factories = {} - for cls in get_concrete_subclasses(DynamicSUTFactoryDriver): # type: ignore + for cls in get_concrete_subclasses(DynamicDriverSUTFactory): # type: ignore if cls.DRIVER_NAME in dynamic_sut_factories: raise ValueError(f"Multiple DynamicSUTFactoryDrivers have the same DRIVER_NAME '{cls.DRIVER_NAME}'.") dynamic_sut_factories[cls.DRIVER_NAME] = cls(secrets) diff --git a/src/modelgauge/suts/anthropic_sut_factory.py b/src/modelgauge/suts/anthropic_sut_factory.py index 991230f3d..948ffc5e8 100644 --- a/src/modelgauge/suts/anthropic_sut_factory.py +++ 
b/src/modelgauge/suts/anthropic_sut_factory.py @@ -4,14 +4,14 @@ from anthropic import Anthropic -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, ModelNotSupportedError from modelgauge.secret_values import RawSecrets, InjectSecret from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.anthropic_api import AnthropicApiKey, AnthropicSUT -class AnthropicSUTFactory(DynamicSUTFactoryDriver): +class AnthropicSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "anthropic" def get_secrets(self) -> list[InjectSecret]: diff --git a/src/modelgauge/suts/aws_bedrock_sut_factory.py b/src/modelgauge/suts/aws_bedrock_sut_factory.py index df85bf809..3cbc04912 100644 --- a/src/modelgauge/suts/aws_bedrock_sut_factory.py +++ b/src/modelgauge/suts/aws_bedrock_sut_factory.py @@ -2,14 +2,14 @@ import boto3 -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, ModelNotSupportedError from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.aws_bedrock_client import AmazonBedrockSut, AwsAccessKeyId, AwsSecretAccessKey -class AWSBedrockSUTFactory(DynamicSUTFactoryDriver): +class AWSBedrockSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "aws" def __init__(self, raw_secrets: RawSecrets): diff --git a/src/modelgauge/suts/google_sut_factory.py b/src/modelgauge/suts/google_sut_factory.py index bfb65fe54..a30b7075d 100644 --- a/src/modelgauge/suts/google_sut_factory.py +++ b/src/modelgauge/suts/google_sut_factory.py @@ -2,14 +2,14 @@ from google import genai -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, 
ModelNotSupportedError from modelgauge.secret_values import RawSecrets, InjectSecret from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.google_genai import GoogleGenAiSUT, GoogleAiApiKey -class GoogleSUTFactory(DynamicSUTFactoryDriver): +class GoogleSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "google" def get_secrets(self) -> list[InjectSecret]: diff --git a/src/modelgauge/suts/huggingface_sut_factory.py b/src/modelgauge/suts/huggingface_sut_factory.py index 34b2db4f9..d6774493f 100644 --- a/src/modelgauge/suts/huggingface_sut_factory.py +++ b/src/modelgauge/suts/huggingface_sut_factory.py @@ -6,7 +6,7 @@ from modelgauge.auth.huggingface_inference_token import HuggingFaceInferenceToken from modelgauge.dynamic_sut_factory import ( DynamicSUTFactory, - DynamicSUTFactoryDriver, + DynamicDriverSUTFactory, ModelNotSupportedError, ProviderNotFoundError, ) @@ -24,7 +24,7 @@ logging.getLogger("huggingface_hub").setLevel(logging.ERROR) -class HuggingFaceSUTFactory(DynamicSUTFactoryDriver): +class HuggingFaceSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "hf" def __init__(self, raw_secrets: RawSecrets): diff --git a/src/modelgauge/suts/indirect_sut.py b/src/modelgauge/suts/indirect_sut.py index f3688be97..2ced6c5f0 100644 --- a/src/modelgauge/suts/indirect_sut.py +++ b/src/modelgauge/suts/indirect_sut.py @@ -5,7 +5,7 @@ import uvicorn from pydantic import BaseModel -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory from modelgauge.prompt import TextPrompt from modelgauge.ready import ReadyResponse from modelgauge.secret_values import InjectSecret @@ -137,7 +137,7 @@ def start(): thread.start() -class IndirectSUTFactory(DynamicSUTFactoryDriver): +class IndirectSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "indirect" def get_secrets(self) -> list[InjectSecret]: diff --git a/src/modelgauge/suts/meta_llama_factory.py 
b/src/modelgauge/suts/meta_llama_factory.py index c745df3d7..623ab16db 100644 --- a/src/modelgauge/suts/meta_llama_factory.py +++ b/src/modelgauge/suts/meta_llama_factory.py @@ -1,13 +1,13 @@ from llama_api_client import LlamaAPIClient -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, ModelNotSupportedError from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.meta_llama_client import MetaLlamaApiKey, MetaLlamaModeratedSUT, MetaLlamaSUT -class LlamaSUTFactory(DynamicSUTFactoryDriver): +class LlamaSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "llama" def __init__(self, raw_secrets: RawSecrets): diff --git a/src/modelgauge/suts/mistral_sut_factory.py b/src/modelgauge/suts/mistral_sut_factory.py index 0b0fe0aa0..bbc7c21cd 100644 --- a/src/modelgauge/suts/mistral_sut_factory.py +++ b/src/modelgauge/suts/mistral_sut_factory.py @@ -1,4 +1,4 @@ -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, ModelNotSupportedError from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut import SUT from modelgauge.sut_definition import SUTDefinition @@ -6,7 +6,7 @@ from modelgauge.suts.mistral_sut import MistralAISut -class MistralSUTFactory(DynamicSUTFactoryDriver): +class MistralSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "mistral" def __init__(self, raw_secrets: RawSecrets): diff --git a/src/modelgauge/suts/modelship_sut.py b/src/modelgauge/suts/modelship_sut.py index 24d9aae99..4b5ae4714 100644 --- a/src/modelgauge/suts/modelship_sut.py +++ b/src/modelgauge/suts/modelship_sut.py @@ -1,7 +1,7 @@ from typing import Optional, Mapping, Any from modelgauge.auth.openai_compatible_secrets import OpenAICompatibleApiKey -from 
modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.openai_client import OpenAIChat, OpenAIChatRequest @@ -34,7 +34,7 @@ def request_as_dict_for_client(self, request: OpenAIChatRequest) -> dict[str, An return request_as_dict -class ModelShipSUTFactory(DynamicSUTFactoryDriver): +class ModelShipSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "modelship" def get_secrets(self) -> list[InjectSecret]: diff --git a/src/modelgauge/suts/openai_sut_factory.py b/src/modelgauge/suts/openai_sut_factory.py index 9deb3cdcf..e99792777 100644 --- a/src/modelgauge/suts/openai_sut_factory.py +++ b/src/modelgauge/suts/openai_sut_factory.py @@ -3,7 +3,7 @@ from modelgauge.auth.openai_compatible_secrets import OpenAICompatibleApiKey from modelgauge.dynamic_sut_factory import ( DynamicSUTFactory, - DynamicSUTFactoryDriver, + DynamicDriverSUTFactory, ModelNotSupportedError, ProviderNotFoundError, ) @@ -36,7 +36,7 @@ def _make_client(self) -> OpenAI: return _client -class OpenAICompatibleSUTFactory(BaseOpenAISUTFactory, DynamicSUTFactoryDriver): +class OpenAICompatibleSUTFactory(BaseOpenAISUTFactory, DynamicDriverSUTFactory): DRIVER_NAME = "openai" def make_sut(self, sut_definition: SUTDefinition) -> OpenAIChat: diff --git a/src/modelgauge/suts/together_sut_factory.py b/src/modelgauge/suts/together_sut_factory.py index 45381be5a..951b55404 100644 --- a/src/modelgauge/suts/together_sut_factory.py +++ b/src/modelgauge/suts/together_sut_factory.py @@ -1,14 +1,14 @@ from together import Together # type: ignore from modelgauge.auth.together_key import TogetherApiKey -from modelgauge.dynamic_sut_factory import DynamicSUTFactoryDriver, ModelNotSupportedError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, ModelNotSupportedError from 
modelgauge.dynamic_sut_metadata import DynamicSUTMetadata from modelgauge.secret_values import InjectSecret, RawSecrets from modelgauge.sut_definition import SUTDefinition from modelgauge.suts.together_client import TogetherChatSUT -class TogetherSUTFactory(DynamicSUTFactoryDriver): +class TogetherSUTFactory(DynamicDriverSUTFactory): DRIVER_NAME = "together" def __init__(self, raw_secrets: RawSecrets): diff --git a/tests/modelgauge_tests/test_dynamic_sut_factory.py b/tests/modelgauge_tests/test_dynamic_sut_factory.py index 08bbb0bc3..d06ffa223 100644 --- a/tests/modelgauge_tests/test_dynamic_sut_factory.py +++ b/tests/modelgauge_tests/test_dynamic_sut_factory.py @@ -1,6 +1,6 @@ import pytest -from modelgauge.dynamic_sut_factory import DynamicSUTFactory, DynamicSUTFactoryDriver +from modelgauge.dynamic_sut_factory import DynamicSUTFactory, DynamicDriverSUTFactory from modelgauge.sut_definition import SUTDefinition from modelgauge.secret_values import InjectSecret from modelgauge_tests.fake_sut import FakeSUT @@ -49,25 +49,25 @@ def test_injected_secrets_missing_required(): def test_dynamic_sut_factory_driver_instantiation(): - class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): pass with pytest.raises(AssertionError): MyDriverFactory({}) - class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): DRIVER_NAME: str with pytest.raises(AssertionError): MyDriverFactory({}) - class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): DRIVER_NAME = None with pytest.raises(AssertionError): MyDriverFactory({}) - class MyDriverFactory(FakeDynamicFactory, DynamicSUTFactoryDriver): + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): DRIVER_NAME = "driver" factory = MyDriverFactory({}) From 
85da9e48ecaa31271cc798a123437092d91a7c9e Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Mon, 16 Mar 2026 09:34:01 -0700 Subject: [PATCH 13/15] enforce non-empty driver name --- src/modelgauge/dynamic_sut_factory.py | 6 +++--- tests/modelgauge_tests/test_dynamic_sut_factory.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/modelgauge/dynamic_sut_factory.py b/src/modelgauge/dynamic_sut_factory.py index 62f37c840..f5cabbdca 100644 --- a/src/modelgauge/dynamic_sut_factory.py +++ b/src/modelgauge/dynamic_sut_factory.py @@ -52,6 +52,6 @@ class DynamicDriverSUTFactory(DynamicSUTFactory, ABC): def __init__(self, raw_secrets: RawSecrets): super().__init__(raw_secrets) - assert hasattr(self, "DRIVER_NAME") and isinstance( - self.DRIVER_NAME, str - ), "DynamicDriverSUTFactory subclasses must have a DRIVER_NAME attribute" + assert ( + hasattr(self, "DRIVER_NAME") and isinstance(self.DRIVER_NAME, str) and len(self.DRIVER_NAME) > 0 + ), "DynamicDriverSUTFactory subclasses must have a str DRIVER_NAME attribute" diff --git a/tests/modelgauge_tests/test_dynamic_sut_factory.py b/tests/modelgauge_tests/test_dynamic_sut_factory.py index d06ffa223..06f553b42 100644 --- a/tests/modelgauge_tests/test_dynamic_sut_factory.py +++ b/tests/modelgauge_tests/test_dynamic_sut_factory.py @@ -67,6 +67,12 @@ class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): with pytest.raises(AssertionError): MyDriverFactory({}) + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): + DRIVER_NAME = "" + + with pytest.raises(AssertionError): + MyDriverFactory({}) + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): DRIVER_NAME = "driver" From ade54fec047a8ac634509cf5e2a7dfd891e6c989 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Mon, 16 Mar 2026 09:54:39 -0700 Subject: [PATCH 14/15] add unit test for dynamic factory collection --- tests/modelgauge_tests/test_sut_factory.py | 18 +++++++++++++++++- 1 file changed, 17 
insertions(+), 1 deletion(-) diff --git a/tests/modelgauge_tests/test_sut_factory.py b/tests/modelgauge_tests/test_sut_factory.py index e0a6f541d..88bf43562 100644 --- a/tests/modelgauge_tests/test_sut_factory.py +++ b/tests/modelgauge_tests/test_sut_factory.py @@ -1,7 +1,7 @@ from unittest.mock import patch import pytest -from modelgauge.dynamic_sut_factory import UnknownSUTMakerError +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, UnknownSUTMakerError from modelgauge.instance_factory import InstanceFactory from modelgauge.sut import SUT from modelgauge.sut_factory import IncompatibleSUTParamsError, SUTFactory, SUTNotFoundException, SUTType @@ -51,6 +51,22 @@ def test_knows(sut_factory): assert sut_factory.knows(UNKNOWN_UID) is False +def test_load_dynamic_sut_factories(): + class MyDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): + DRIVER_NAME = "driver-1" + + class OtherDriverFactory(FakeDynamicFactory, DynamicDriverSUTFactory): + DRIVER_NAME = "driver-2" + + with patch("modelgauge.sut_factory.get_concrete_subclasses", return_value=[MyDriverFactory, OtherDriverFactory]): + sut_factory = SUTFactory({}) + + assert sut_factory.dynamic_sut_factories is not None + assert len(sut_factory.dynamic_sut_factories) == 2 + assert isinstance(sut_factory.dynamic_sut_factories["driver-1"], MyDriverFactory) + assert isinstance(sut_factory.dynamic_sut_factories["driver-2"], OtherDriverFactory) + + def test_get_missing_dependencies_dynamic(sut_factory): assert sut_factory.get_missing_dependencies(DYNAMIC_UID, secrets={}) == [] From 293dd29678373575667ed6b5bfc9487004962728 Mon Sep 17 00:00:00 2001 From: Barbara Korycki Date: Thu, 19 Mar 2026 09:24:22 -0700 Subject: [PATCH 15/15] Dynamic nvidia nim sut factory (#1507) * remove deprecated models * nvidia nim sut factory --- src/modelgauge/suts/nvidia_nim_api_client.py | 22 ------------ src/modelgauge/suts/nvidia_nim_sut_factory.py | 36 +++++++++++++++++++ .../sut_tests/test_nvidia_nim_sut_factory.py | 
32 +++++++++++++++++ 3 files changed, 68 insertions(+), 22 deletions(-) create mode 100644 src/modelgauge/suts/nvidia_nim_sut_factory.py create mode 100644 tests/modelgauge_tests/sut_tests/test_nvidia_nim_sut_factory.py diff --git a/src/modelgauge/suts/nvidia_nim_api_client.py b/src/modelgauge/suts/nvidia_nim_api_client.py index 351c2f43a..7ff4adc8e 100644 --- a/src/modelgauge/suts/nvidia_nim_api_client.py +++ b/src/modelgauge/suts/nvidia_nim_api_client.py @@ -58,28 +58,6 @@ def _translate_request(self, messages, options: ModelOptions) -> NIMOpenAIChatRe ) -SUTS.register( - NvidiaNIMApiClient, - "nvidia-llama-3.1-nemotron-70b-instruct", - "nvidia/llama-3.1-nemotron-70b-instruct", - InjectSecret(NvidiaNIMApiKey), -) - - -SUTS.register( - NvidiaNIMApiClient, - "nvidia-nemotron-4-340b-instruct", - "nvidia/nemotron-4-340b-instruct", - InjectSecret(NvidiaNIMApiKey), -) - -SUTS.register( - NvidiaNIMApiClient, - "nvidia-mistral-nemo-minitron-8b-8k-instruct", - "nvidia/mistral-nemo-minitron-8b-8k-instruct", - InjectSecret(NvidiaNIMApiKey), -) - SUTS.register( NvidiaNIMApiClient, "nvidia-nemotron-mini-4b-instruct", diff --git a/src/modelgauge/suts/nvidia_nim_sut_factory.py b/src/modelgauge/suts/nvidia_nim_sut_factory.py new file mode 100644 index 000000000..78c0c3c89 --- /dev/null +++ b/src/modelgauge/suts/nvidia_nim_sut_factory.py @@ -0,0 +1,36 @@ +from openai import OpenAI + +from modelgauge.dynamic_sut_factory import DynamicDriverSUTFactory, ModelNotSupportedError +from modelgauge.secret_values import InjectSecret +from modelgauge.sut_definition import SUTDefinition +from modelgauge.suts.nvidia_nim_api_client import BASE_URL, NvidiaNIMApiKey, NvidiaNIMApiClient + + +class NvidiaNIMSUTFactory(DynamicDriverSUTFactory): + DRIVER_NAME = "nvidia-nim" + + def __init__(self, raw_secrets): + super().__init__(raw_secrets) + self._client = None + + @property + def client(self) -> OpenAI: + if self._client is None: + self._client = OpenAI(api_key=self.injected_secrets()[0].value, 
base_url=BASE_URL) + return self._client + + def get_secrets(self) -> list[InjectSecret]: + return [InjectSecret(NvidiaNIMApiKey)] + + def _model_exists(self, model_name: str) -> bool: + try: + self.client.models.retrieve(model_name) # type: ignore + return True + except: + return False + + def make_sut(self, sut_definition: SUTDefinition) -> NvidiaNIMApiClient: + model_name = sut_definition.external_model_name() + if not self._model_exists(model_name): + raise ModelNotSupportedError(f"Model {model_name} not found or not available on nvidia NIM.") + return NvidiaNIMApiClient(sut_definition.uid, model_name, *self.injected_secrets()) diff --git a/tests/modelgauge_tests/sut_tests/test_nvidia_nim_sut_factory.py b/tests/modelgauge_tests/sut_tests/test_nvidia_nim_sut_factory.py new file mode 100644 index 000000000..ddd37451e --- /dev/null +++ b/tests/modelgauge_tests/sut_tests/test_nvidia_nim_sut_factory.py @@ -0,0 +1,32 @@ +import pytest +from unittest.mock import MagicMock + +from modelgauge.dynamic_sut_factory import ModelNotSupportedError +from modelgauge.sut_definition import SUTDefinition +from modelgauge.suts.nvidia_nim_api_client import NvidiaNIMApiClient +from modelgauge.suts.nvidia_nim_sut_factory import NvidiaNIMSUTFactory + + +@pytest.fixture +def factory(): + return NvidiaNIMSUTFactory({"nvidia-nim-api": {"api_key": "value"}}) + + +def test_make_sut(factory): + factory._client = MagicMock() + factory._client.models.retrieve.return_value = "model exists" + + sut_definition = SUTDefinition(model="bar", maker="foo", driver="nvidia-nim") + sut = factory.make_sut(sut_definition) + + assert isinstance(sut, NvidiaNIMApiClient) + assert sut.uid == "foo/bar:nvidia-nim" + assert sut.model == "foo/bar" + + +def test_make_sut_bad_model(factory): + sut_definition = SUTDefinition(model="bogus", maker="fake", driver="nvidia-nim") + factory._client = MagicMock() + factory._client.models.retrieve.side_effect = Exception() + with pytest.raises(ModelNotSupportedError): + 
factory.make_sut(sut_definition)