Make the Together client retry more. (#978)

wpietri · web-flow · commit 40bbaba191f9 · 2025-04-14T16:26:19.000-05:00
Allow changing number of threads when running benchmarks.
Speed up `retry` tests and other sleeping tests.
diff --git a/src/modelbench/run.py b/src/modelbench/run.py
@@ -84,7 +84,7 @@ def cli() -> None:
 @click.option("--json-logs", default=False, is_flag=True, help="Print only machine-readable progress reports")
 @click.option("sut_uids", "--sut", "-s", multiple=True, help="SUT uid(s) to run", required=True, callback=validate_uid)
 @click.option("--anonymize", type=int, help="Random number seed for consistent anonymization of SUTs")
-@click.option("--parallel", default=False, help="Obsolete flag, soon to be removed")
+@click.option("--threads", default=32, help="How many threads to use per stage")
 @click.option(
     "--version",
     "-v",
@@ -125,12 +125,10 @@ def benchmark(
     json_logs: bool,
     sut_uids: List[str],
     anonymize=None,
-    parallel=False,
+    threads=32,
     prompt_set="demo",
     evaluator="default",
 ) -> None:
-    if parallel:
-        click.echo("--parallel option unnecessary; benchmarks are now always run in parallel")
     start_time = datetime.now(timezone.utc)
     if locale == "all":
         locales = LOCALES
@@ -143,7 +141,9 @@ def benchmark(
     suts = get_suts(sut_uids)
     benchmarks = [get_benchmark(version, l, prompt_set, evaluator) for l in locales]
 
-    run = run_benchmarks_for_suts(benchmarks, suts, max_instances, debug=debug, json_logs=json_logs)
+    run = run_benchmarks_for_suts(
+        benchmarks, suts, max_instances, debug=debug, json_logs=json_logs, thread_count=threads
+    )
     benchmark_scores = score_benchmarks(run)
     output_dir.mkdir(exist_ok=True, parents=True)
     for b in benchmarks:
diff --git a/src/modelgauge/annotation_pipeline.py b/src/modelgauge/annotation_pipeline.py
@@ -112,6 +112,7 @@ def handle_item(self, item: SutInteraction):
 
 class AnnotatorWorkers(CachingPipe):
     def __init__(self, annotators: dict[str, Annotator], workers=None, cache_path=None):
+        self.sleep_time = 10
         if workers is None:
             workers = 8
         super().__init__(thread_count=workers, cache_path=cache_path)
@@ -140,7 +141,7 @@ def handle_uncached_item(self, item):
                 logger.warning(
                     f"Exception calling annotator {annotator_uid} on attempt {tries}: {e}\nRetrying.....", exc_info=True
                 )
-                time.sleep(10)
+                time.sleep(self.sleep_time)
         result = annotator.translate_response(request, response)
         self.annotation_counts[annotator_uid] += 1
         return sut_interaction, annotator_uid, result
diff --git a/src/modelgauge/api_server.py b/src/modelgauge/api_server.py
diff --git a/src/modelgauge/prompt_pipeline.py b/src/modelgauge/prompt_pipeline.py
@@ -150,6 +150,7 @@ def handle_item(self, item):
 
 class PromptSutWorkers(CachingPipe):
     def __init__(self, suts: dict[str, SUT], sut_options: Optional[SUTOptions] = None, workers=None, cache_path=None):
+        self.sleep_time = 10
         if workers is None:
             workers = 8
         super().__init__(thread_count=workers, cache_path=cache_path)
@@ -178,7 +179,7 @@ def call_sut(self, prompt_text: TextPrompt, sut: PromptResponseSUT) -> SUTRespon
                 break
             except Exception as e:
                 logger.warning(f"Exception calling SUT {sut.uid} on attempt {tries}: {e}\nRetrying.....", exc_info=True)
-                time.sleep(10)
+                time.sleep(self.sleep_time)
         result = sut.translate_response(request, response)
         self.sut_response_counts[sut.uid] += 1
         return result
diff --git a/src/modelgauge/suts/together_client.py b/src/modelgauge/suts/together_client.py
@@ -32,7 +32,7 @@ def _retrying_post(url, headers, json_payload):
     """HTTP Post with retry behavior."""
     session = requests.Session()
     retries = Retry(
-        total=7,
+        total=15,
         backoff_factor=2,
         status_forcelist=[
             408,  # Request Timeout
diff --git a/tests/modelgauge_tests/test_annotation_pipeline.py b/tests/modelgauge_tests/test_annotation_pipeline.py
@@ -251,6 +251,7 @@ def test_annotator_worker_retries_until_success():
     annotator.annotate = mock
 
     w = AnnotatorWorkers({"fake-annotator": annotator})
+    w.sleep_time = 0.01
     sut_interaction = make_sut_interaction("1", "prompt", "sut", "response")
     result = w.handle_item((sut_interaction, "fake-annotator"))
 
diff --git a/tests/modelgauge_tests/test_api_server.py b/tests/modelgauge_tests/test_api_server.py
diff --git a/tests/modelgauge_tests/test_prompt_pipeline.py b/tests/modelgauge_tests/test_prompt_pipeline.py
@@ -198,6 +198,7 @@ def test_prompt_sut_worker_retries_until_success(suts):
     prompt_with_context = TestItem(source_id="1", prompt=TextPrompt(text="a prompt"))
 
     w = PromptSutWorkers(suts)
+    w.sleep_time = 0.01
     result = w.handle_item((prompt_with_context, "fake1"))
     assert result == SutInteraction(prompt_with_context, "fake1", SUTResponse(text="a response"))
     assert mock.call_count == num_exceptions + 1
diff --git a/tests/modelgauge_tests/test_retry_decorator.py b/tests/modelgauge_tests/test_retry_decorator.py
@@ -1,3 +1,5 @@
+from unittest.mock import patch
+
 import pytest
 import time
 
@@ -28,7 +30,8 @@ def always_fail():
         raise KeyError("Intentional failure")
 
     with pytest.raises(KeyError):
-        always_fail()
+        with patch("time.sleep") as patched_sleep:
+            always_fail()
 
     assert attempt_counter == BASE_RETRY_COUNT
 
@@ -44,7 +47,8 @@ def succeed_before_base_retry_total():
             raise ValueError("Intentional failure")
         return "success"
 
-    assert succeed_before_base_retry_total() == "success"
+    with patch("time.sleep") as patched_sleep:
+        assert succeed_before_base_retry_total() == "success"
     assert attempt_counter == BASE_RETRY_COUNT