ENG-3697: Improve eval sample upload progress (#670)

d42me · web-flow · commit 5715bca6f535 · 2026-05-20T08:58:05.000+02:00
diff --git a/packages/prime-evals/src/prime_evals/evals.py b/packages/prime-evals/src/prime_evals/evals.py
@@ -3,7 +3,7 @@
 import sys
 import warnings
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import httpx
 from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
@@ -25,6 +25,16 @@ def _build_user_agent() -> str:
     return f"prime-evals/{__version__} python/{python_version}"
 
 
+def _samples_upload_headers(api_key: Optional[str]) -> Dict[str, str]:
+    headers = {
+        "Content-Type": "application/json",
+        "User-Agent": _build_user_agent(),
+    }
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    return headers
+
+
 class EvalsClient:
     """
     Client for the Prime Evals API
@@ -214,6 +224,7 @@ def push_samples(
         samples: List[Dict[str, Any]],
         max_payload_bytes: int = 25 * 1024 * 1024,
         max_workers: int = 4,
+        progress_callback: Optional[Callable[[int], None]] = None,
     ) -> Dict[str, Any]:
         """Push evaluation samples in adaptive batches with concurrent uploads."""
         if not samples:
@@ -222,34 +233,41 @@ def push_samples(
             raise ValueError("max_workers must be at least 1")
 
         batches, skipped_count = self._build_batches(samples, max_payload_bytes)
+        if skipped_count and progress_callback is not None:
+            progress_callback(skipped_count)
+
         total_samples_pushed = 0
         errors = []
-
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            futures = {
-                executor.submit(self._upload_batch, evaluation_id, b): i
-                for i, b in enumerate(batches)
-            }
-            for future in as_completed(futures):
-                try:
-                    total_samples_pushed += future.result()
-                except Exception as e:
-                    errors.append(f"Batch {futures[future] + 1}: {e}")
+        headers = _samples_upload_headers(self.client.api_key)
+
+        with httpx.Client(headers=headers, timeout=300.0) as http_client:
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = {
+                    executor.submit(self._upload_batch, http_client, evaluation_id, b): i
+                    for i, b in enumerate(batches)
+                }
+                for future in as_completed(futures):
+                    try:
+                        uploaded_count = future.result()
+                        total_samples_pushed += uploaded_count
+                        if progress_callback is not None:
+                            progress_callback(uploaded_count)
+                    except Exception as e:
+                        errors.append(f"Batch {futures[future] + 1}: {e}")
 
         if errors:
             raise EvalsAPIError(f"Failed to push samples: {'; '.join(errors)}")
 
         return {"samples_pushed": total_samples_pushed, "samples_skipped": skipped_count}
 
-    def _upload_batch(self, evaluation_id: str, batch: List[Dict[str, Any]]) -> int:
+    def _upload_batch(
+        self,
+        http_client: httpx.Client,
+        evaluation_id: str,
+        batch: List[Dict[str, Any]],
+    ) -> int:
         """Upload a single batch of samples with retry on rate limit."""
         url = f"{self.client.base_url}/api/v1/evaluations/{evaluation_id}/samples"
-        headers: Dict[str, str] = {
-            "Content-Type": "application/json",
-            "User-Agent": _build_user_agent(),
-        }
-        if self.client.api_key:
-            headers["Authorization"] = f"Bearer {self.client.api_key}"
 
         @retry(
             retry=retry_if_exception(_is_retryable),
@@ -258,7 +276,7 @@ def _upload_batch(self, evaluation_id: str, batch: List[Dict[str, Any]]) -> int:
             reraise=True,
         )
         def do_upload() -> int:
-            response = httpx.post(url, json={"samples": batch}, headers=headers, timeout=300.0)
+            response = http_client.post(url, json={"samples": batch})
             response.raise_for_status()
             return len(batch)
 
@@ -564,6 +582,7 @@ async def push_samples(
         samples: List[Dict[str, Any]],
         max_payload_bytes: int = 25 * 1024 * 1024,
         max_concurrent: int = 4,
+        progress_callback: Optional[Callable[[int], None]] = None,
     ) -> Dict[str, Any]:
         """Push evaluation samples in adaptive batches with concurrent uploads."""
         if not samples:
@@ -572,18 +591,18 @@ async def push_samples(
             raise ValueError("max_concurrent must be at least 1")
 
         batches, skipped_count = self._build_batches(samples, max_payload_bytes)
+        if skipped_count and progress_callback is not None:
+            progress_callback(skipped_count)
+
         semaphore = asyncio.Semaphore(max_concurrent)
         errors: List[str] = []
 
         base_url = self.client.base_url
-        headers: Dict[str, str] = {
-            "Content-Type": "application/json",
-            "User-Agent": _build_user_agent(),
-        }
-        if self.client.api_key:
-            headers["Authorization"] = f"Bearer {self.client.api_key}"
+        headers = _samples_upload_headers(self.client.api_key)
 
-        async def upload_batch(idx: int, batch: List[Dict[str, Any]]) -> int:
+        async def upload_batch(
+            http_client: httpx.AsyncClient, idx: int, batch: List[Dict[str, Any]]
+        ) -> int:
             url = f"{base_url}/api/v1/evaluations/{evaluation_id}/samples"
 
             @retry(
@@ -593,22 +612,27 @@ async def upload_batch(idx: int, batch: List[Dict[str, Any]]) -> int:
                 reraise=True,
             )
             async def do_upload() -> int:
-                async with httpx.AsyncClient(timeout=300.0) as client:
-                    response = await client.post(url, json={"samples": batch}, headers=headers)
-                    response.raise_for_status()
-                    return len(batch)
+                response = await http_client.post(url, json={"samples": batch})
+                response.raise_for_status()
+                return len(batch)
 
             async with semaphore:
                 try:
-                    return await do_upload()
+                    uploaded_count = await do_upload()
+                    if progress_callback is not None:
+                        progress_callback(uploaded_count)
+                    return uploaded_count
                 except httpx.HTTPStatusError as e:
                     errors.append(f"Batch {idx + 1}: HTTP {e.response.status_code}")
                     return 0
                 except httpx.RequestError as e:
                     errors.append(f"Batch {idx + 1}: {e}")
                     return 0
 
-        results = await asyncio.gather(*[upload_batch(i, b) for i, b in enumerate(batches)])
+        async with httpx.AsyncClient(headers=headers, timeout=300.0) as http_client:
+            results = await asyncio.gather(
+                *[upload_batch(http_client, i, b) for i, b in enumerate(batches)]
+            )
 
         if errors:
             raise EvalsAPIError(f"Failed to push samples: {'; '.join(errors)}")
diff --git a/packages/prime-evals/tests/test_evals.py b/packages/prime-evals/tests/test_evals.py
@@ -1,8 +1,11 @@
 """Tests for Prime Evals SDK"""
 
+import asyncio
+from types import SimpleNamespace
+
 import pytest
 
-from prime_evals.evals import EvalsClient
+from prime_evals.evals import AsyncEvalsClient, EvalsClient
 from prime_evals.models import (
     CreateEvaluationRequest,
     Evaluation,
@@ -122,6 +125,101 @@ def test_sample_model_with_metadata():
     assert sample.info == {"batch": 1}
 
 
+def test_push_samples_reports_progress_and_reuses_http_client(monkeypatch):
+    posts = []
+    created_clients = []
+
+    class FakeResponse:
+        def raise_for_status(self):
+            return None
+
+    class FakeHttpClient:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+            created_clients.append(self)
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_args):
+            return None
+
+        def post(self, url, json):
+            posts.append({"url": url, "json": json, "headers": self.kwargs["headers"]})
+            return FakeResponse()
+
+    monkeypatch.setattr("prime_evals.evals.httpx.Client", FakeHttpClient)
+    api_client = SimpleNamespace(
+        base_url="https://api.example",
+        api_key="secret-token",
+    )
+    client = EvalsClient(api_client)
+    progress = []
+
+    with pytest.warns(UserWarning, match="exceeds maximum payload size"):
+        result = client.push_samples(
+            "eval-1",
+            [{"x": "a"}, {"x": "b" * 50}, {"x": "c"}],
+            max_payload_bytes=35,
+            max_workers=1,
+            progress_callback=progress.append,
+        )
+
+    assert result == {"samples_pushed": 2, "samples_skipped": 1}
+    assert progress == [1, 1, 1]
+    assert len(posts) == 2
+    assert len(created_clients) == 1
+    assert posts[0]["headers"]["Authorization"] == "Bearer secret-token"
+
+
+def test_async_push_samples_reports_progress_and_reuses_http_client(monkeypatch):
+    posts = []
+    created_clients = []
+
+    class FakeResponse:
+        def raise_for_status(self):
+            return None
+
+    class FakeAsyncHttpClient:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+            created_clients.append(self)
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, *_args):
+            return None
+
+        async def post(self, url, json):
+            posts.append({"url": url, "json": json, "headers": self.kwargs["headers"]})
+            return FakeResponse()
+
+    monkeypatch.setattr("prime_evals.evals.httpx.AsyncClient", FakeAsyncHttpClient)
+    client = AsyncEvalsClient.__new__(AsyncEvalsClient)
+    client.client = SimpleNamespace(
+        base_url="https://api.example",
+        api_key="secret-token",
+    )
+    progress = []
+
+    result = asyncio.run(
+        client.push_samples(
+            "eval-1",
+            [{"x": "a"}, {"x": "b"}],
+            max_payload_bytes=35,
+            max_concurrent=1,
+            progress_callback=progress.append,
+        )
+    )
+
+    assert result == {"samples_pushed": 2, "samples_skipped": 0}
+    assert progress == [1, 1]
+    assert len(posts) == 2
+    assert len(created_clients) == 1
+    assert posts[0]["headers"]["Authorization"] == "Bearer secret-token"
+
+
 def test_evals_client_context_manager():
     """Test EvalsClient can be used as context manager"""
     try:
diff --git a/packages/prime/src/prime_cli/commands/evals.py b/packages/prime/src/prime_cli/commands/evals.py
@@ -9,6 +9,7 @@
 import typer
 from click.core import ParameterSource
 from prime_evals import EvalsAPIError, EvalsClient, InvalidEvaluationError
+from rich.progress import Progress
 from rich.syntax import Syntax
 from rich.table import Table
 
@@ -1008,6 +1009,22 @@ def _resolve_eval_viewer_url(evaluation_id: str, response: Optional[dict[str, An
     return get_eval_viewer_url(evaluation_id)
 
 
+def _push_samples_with_progress(
+    client: EvalsClient, evaluation_id: str, samples: list[dict[str, Any]]
+) -> None:
+    if not console.is_terminal:
+        client.push_samples(evaluation_id, samples)
+        return
+
+    with Progress(console=console, transient=True) as progress:
+        task_id = progress.add_task("Uploading samples", total=len(samples))
+        client.push_samples(
+            evaluation_id,
+            samples,
+            progress_callback=lambda uploaded: progress.update(task_id, advance=uploaded),
+        )
+
+
 def _require_published_environment_for_eval_push(env_name: str, eval_path: Path) -> None:
     console.print("[red]Error:[/red] Evaluation uploads require a pushed environment.")
     console.print(
@@ -1095,7 +1112,7 @@ def _push_single_eval(
     results = eval_data.get("results", [])
     if results:
         console.print(f"[blue]Pushing {len(results)} samples...[/blue]")
-        client.push_samples(eval_id, results)
+        _push_samples_with_progress(client, eval_id, results)
         console.print("[green]✓ Samples pushed successfully[/green]")
         console.print()