
Commit d4d5e16

viraatc and Copilot authored

fix: sampling-param defaults, and make futures-http-client a testing util (#32)

* fix: futures client, sampling-params updates

* Update src/inference_endpoint/config/schema.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

1 parent 86d62dc commit d4d5e16

16 files changed: 292 additions & 666 deletions


pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,8 @@ where = ["src"]
 [tool.setuptools.package-dir]
 "" = "src"
 
-
+[tool.autopep8]
+max_line_length = 88
 
 [tool.ruff]
 target-version = "py312"
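autopep8 reads the [tool.autopep8] table from pyproject.toml, so the new limit applies repo-wide. A quick sketch of the same option through the library API (the input string is made up; complex statements may still need an aggressive level to be rewrapped):

    # Sketch: exercise the 88-column limit via autopep8's Python API.
    import autopep8

    long_line = "x = {" + ", ".join(f"'k{i}': {i}" for i in range(20)) + "}\n"
    fixed = autopep8.fix_code(long_line, options={"max_line_length": 88})
    print(fixed)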

src/inference_endpoint/commands/benchmark.py

Lines changed: 4 additions & 0 deletions
@@ -513,6 +513,10 @@ def _run_benchmark(
             "model": model_name,
             "stream": enable_streaming,
             "max_completion_tokens": max_tokens,
+            "temperature": config.model_params.temperature,
+            "top_p": config.model_params.top_p,
+            "top_k": config.model_params.top_k,
+            "repetition_penalty": config.model_params.repetition_penalty,
         },
     )
     dataloader.load()
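The benchmark payload now forwards the four sampling parameters straight from ModelParams. Since they default to None (see the schema.py change below), unset parameters can be dropped before the request leaves the client. A sketch of that effect; drop_unset is a hypothetical stand-in for the adapter's omit-defaults encoding:

    # Sketch: sampling params flow from config into the request payload;
    # None entries fall away downstream (here via a hypothetical helper).
    def drop_unset(payload: dict) -> dict:
        return {k: v for k, v in payload.items() if v is not None}

    payload = {
        "model": "my-model",
        "temperature": None,  # unset in config -> endpoint default applies
        "top_p": 0.9,
        "top_k": None,
        "repetition_penalty": None,
    }
    print(drop_unset(payload))  # {'model': 'my-model', 'top_p': 0.9}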

src/inference_endpoint/commands/probe.py

Lines changed: 79 additions & 34 deletions
@@ -23,13 +23,13 @@
 import time
 from urllib.parse import urljoin
 
-from inference_endpoint.core.types import Query
+from inference_endpoint.core.types import Query, QueryResult
 from inference_endpoint.endpoint_client.configs import (
     AioHttpConfig,
     HTTPClientConfig,
     ZMQConfig,
 )
-from inference_endpoint.endpoint_client.futures_client import FuturesHttpClient
+from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
 from inference_endpoint.exceptions import (
     ExecutionError,
     InputValidationError,
@@ -80,75 +80,120 @@ async def run_probe_command(args: argparse.Namespace) -> None:
         zmq_readiness_queue_addr=f"ipc://{tmp_dir}/ready",
     )
 
-    client = FuturesHttpClient(http_config, aiohttp_config, zmq_config)
+    client = HTTPEndpointClient(http_config, aiohttp_config, zmq_config)
     await client.async_start()
 
     logger.info(f"Sending {num_requests} requests...")
 
-    # Send test requests and collect futures
-    futures = []
-    start_times = {}
+    # Send test requests
+    start_times: dict[str, float] = {}
+    sent_query_ids: list[str] = []
+    issue_errors: list[str] = []
 
     # TODO: this might not work with a real vLLM/SGLang endpoint, fix this.
     for i in range(num_requests):
+        query_id = f"probe-{i}"
         query = Query(
-            id=f"probe-{i}",
+            id=query_id,
             data={
                 "prompt": test_prompt,
                 "model": model_name,
                 "max_tokens": 50,
                 "stream": False,
             },
         )
-        start_times[f"probe-{i}"] = time.time()
 
         try:
-            future = await client.issue_query(query)
-            futures.append((f"probe-{i}", future))
-            # Simple progress indicator
-            if (i + 1) % max(1, num_requests // 10) == 0 or i == num_requests - 1:
-                logger.info(f"  Issued {i + 1}/{num_requests} requests")
+            start_times[query_id] = time.time()
+            await client.issue_query_async(query)
+            # Only track successfully issued queries
+            sent_query_ids.append(query_id)
         except Exception as e:
+            issue_errors.append(f"{query_id}: Failed to issue - {str(e)[:50]}")
            logger.warning(f"Failed to issue request {i}: {str(e)[:50]}")
+            continue
+
+        # Simple progress indicator
+        if (i + 1) % max(1, num_requests // 10) == 0 or i == num_requests - 1:
+            logger.info(f"  Issued {i + 1}/{num_requests} requests")
 
     # Wait for all responses
-    latencies = []
-    errors = []
-    responses = []
+    latencies: list[float] = []
+    errors: list[str] = issue_errors  # Include any issue errors
+    responses: list[tuple[str, str]] = []
+
+    # Only count successfully issued queries
+    num_expected = len(sent_query_ids)
+    if num_expected == 0:
+        logger.error("✗ No queries were successfully issued")
+        raise ExecutionError("Probe failed: no queries could be issued")
 
     # Wait for all responses with generous timeout (probe queries can be slow)
-    # Default HTTP client timeout is 30s, give extra buffer for processing
-    probe_timeout = 60.0  # 60 seconds per query
+    probe_timeout = 60.0  # 60 seconds total
+    start_wait = time.time()
+
+    logger.info(f"Waiting for {num_expected} responses...")
 
-    logger.info(f"Waiting for {len(futures)} responses...")
+    received_ids: set[str] = set()
 
-    for idx, (query_id, future) in enumerate(futures):
+    while (
+        len(received_ids) < num_expected
+        and (time.time() - start_wait) < probe_timeout
+    ):
         try:
-            result = await asyncio.wait_for(future, timeout=probe_timeout)
-            # Calculate latency - should always be in start_times
-            assert (
-                query_id in start_times
-            ), f"Query {query_id} not found in start_times"
+            result = await client.get_ready_responses_async()
+
+            if result is None:
+                await asyncio.sleep(0.01)
+                continue
+
+            # Skip non-final streaming chunks
+            if not isinstance(result, QueryResult):
+                continue
+
+            query_id = result.id
+
+            if query_id in received_ids:
+                logger.warning(f"Received duplicate response for {query_id}")
+                continue
+
+            received_ids.add(query_id)
+
+            # Calculate latency - should always be in start_times for issued queries
+            if query_id not in start_times:
+                logger.warning(
+                    f"Received response for unknown query_id: {query_id}, skipping"
+                )
+                continue
             latency_ms = (time.time() - start_times[query_id]) * 1000
-            latencies.append(latency_ms)
 
             if result.error:
                 errors.append(f"{query_id}: {result.error}")
             else:
-                # Store successful response for sanity check
+                latencies.append(latency_ms)
                 responses.append((query_id, result.response_output))
-        except TimeoutError:
-            errors.append(f"{query_id}: Timeout (>{probe_timeout}s)")
+
+            # Simple progress indicator
+            if (
+                len(received_ids) % max(1, num_expected // 10) == 0
+                or len(received_ids) == num_expected
+            ):
+                logger.info(
+                    f"  Processed {len(received_ids)}/{num_expected} responses"
+                )
+
         except Exception as e:
-            errors.append(f"{query_id}: {str(e)[:50]}")
+            logger.warning(f"Error receiving response: {str(e)[:50]}")
+            await asyncio.sleep(0.01)
 
-        # Simple progress indicator
-        if (idx + 1) % max(1, len(futures) // 10) == 0 or idx == len(futures) - 1:
-            logger.info(f"  Processed {idx + 1}/{len(futures)} responses")
+    # Mark any issued but not received as timeout
+    for query_id in sent_query_ids:
+        if query_id not in received_ids:
+            errors.append(f"{query_id}: Timeout (>{probe_timeout}s)")
 
     # Report results
     success_count = len(latencies)
-    logger.info(f"✓ Completed: {success_count}/{num_requests} successful")
+    logger.info(f"✓ Completed: {success_count}/{num_expected} successful")
 
     if latencies:
         avg_latency = sum(latencies) / len(latencies)
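The probe no longer awaits per-query futures; it polls the client and drains responses until a deadline. Reduced to its essentials, the pattern looks like this, assuming a client whose get_ready_responses_async returns None when nothing is pending:

    # Sketch: drain-until-timeout loop underlying the new probe logic.
    import asyncio
    import time

    async def drain(client, expected_ids: set[str], timeout: float = 60.0):
        # Poll until every expected response arrives or the deadline passes.
        received: dict[str, object] = {}
        deadline = time.time() + timeout
        while len(received) < len(expected_ids) and time.time() < deadline:
            result = await client.get_ready_responses_async()
            if result is None:  # nothing ready yet; yield briefly
                await asyncio.sleep(0.01)
                continue
            received[result.id] = result
        missing = expected_ids - received.keys()
        return received, missing  # missing ids are treated as timeouts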

src/inference_endpoint/config/schema.py

Lines changed: 2 additions & 1 deletion
@@ -136,9 +136,10 @@ class ModelParams(BaseModel):
     """Model generation parameters."""
 
     name: str | None = None
-    temperature: float = 0.7
+    temperature: float | None = None
     top_k: int | None = None
     top_p: float | None = None
+    repetition_penalty: float | None = None
     max_new_tokens: int = 1024
     osl_distribution: OSLDistribution | None = None
     streaming: StreamingMode = StreamingMode.AUTO
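With temperature defaulting to None instead of 0.7, an unset value no longer overrides the serving endpoint's own default. A sketch of the effect, assuming ModelParams stays a plain pydantic BaseModel (the class below is a trimmed stand-in):

    # Sketch: None-valued sampling params can be excluded from a payload,
    # so the endpoint's own defaults win for anything the caller left unset.
    from pydantic import BaseModel

    class ModelParams(BaseModel):
        temperature: float | None = None
        top_k: int | None = None
        top_p: float | None = None
        repetition_penalty: float | None = None
        max_new_tokens: int = 1024

    params = ModelParams(top_p=0.9)
    print(params.model_dump(exclude_none=True))
    # {'top_p': 0.9, 'max_new_tokens': 1024}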

src/inference_endpoint/dataset_manager/dataloader.py

Lines changed: 2 additions & 1 deletion
@@ -325,7 +325,8 @@ def default_parser(x):
     def load(self):
         with open(self.file_path) as file:
             for line in file:
-                self.data.append(self.parser(json.loads(line)))
+                if line := line.strip():
+                    self.data.append(self.parser(json.loads(line)))
 
     def load_sample(self, index: int) -> Any:
         return self.data[index]
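The walrus-guarded strip makes the JSONL loader tolerant of blank lines, which previously reached json.loads and raised. A minimal before/after sketch:

    # Sketch: blank lines in a JSONL stream.
    import json

    lines = ['{"prompt": "hi"}\n', '\n', '{"prompt": "bye"}\n']

    # Before: json.loads("\n") raises json.JSONDecodeError on the blank line.
    # After: blank lines are skipped via the walrus-guarded strip.
    data = []
    for line in lines:
        if line := line.strip():
            data.append(json.loads(line))
    print(data)  # [{'prompt': 'hi'}, {'prompt': 'bye'}]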

src/inference_endpoint/endpoint_client/configs.py

Lines changed: 9 additions & 7 deletions
@@ -205,20 +205,22 @@ class ZMQConfig:
     """Configuration for ZMQ sockets and communication."""
 
     # Main ZMQ settings
-    zmq_io_threads: int = 4  # Number of ZMQ IO threads
-    zmq_high_water_mark: int = 10_000  # max msg queue size
+    zmq_io_threads: int = 4  # Number of ZMQ IO threads; TODO(vir): needs to scale?
+    zmq_high_water_mark: int = 0  # Max queue size per socket (0 = unlimited)
 
     # ZMQ addresses (use None for auto-generated prefixes using PID)
     zmq_request_queue_prefix: str | None = None
     zmq_response_queue_addr: str | None = None
     zmq_readiness_queue_addr: str | None = None
 
     # ZMQ socket options
-    zmq_linger: int = 0  # Don't block on close
-    zmq_send_timeout: int = -1  # Non-blocking send
-    zmq_recv_timeout: int = 100  # Timeout on receive() call
-    zmq_recv_buffer_size: int = 10 * 1024 * 1024  # 10MB receive buffer
-    zmq_send_buffer_size: int = 10 * 1024 * 1024  # 10MB send buffer
+    zmq_linger: int = 0  # 0 = don't block on close
+    zmq_immediate: int = 1  # Only enqueue messages on READY connections
+    zmq_send_timeout: int = -1  # -1 = non-blocking send
+    zmq_recv_timeout: int = 1  # Timeout on receive() in ms
+
+    zmq_recv_buffer_size: int = 10 * 1024 * 1024  # 10MB receive buffer (OS level)
+    zmq_send_buffer_size: int = 10 * 1024 * 1024  # 10MB send buffer (OS level)
 
     def __post_init__(self):
         """Generate portable ZMQ socket paths if not provided."""

src/inference_endpoint/endpoint_client/http_sample_issuer.py

Lines changed: 12 additions & 7 deletions
@@ -18,7 +18,6 @@
 import asyncio
 import logging
 import threading
-from typing import Any
 
 from inference_endpoint.core.types import Query, QueryResult, StreamChunk
 from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
@@ -103,7 +102,18 @@ def issue(self, sample: Sample):
         if self.n_inflight == 0:
             self._client_idle_event.clear()
         self.n_inflight += 1
-        self.http_client.issue_query(Query(id=sample.uuid, data=sample.data))
+        self.http_client.issue_query(
+            Query(
+                id=sample.uuid,
+                data=sample.data,
+                headers={
+                    "Content-Type": "application/json",
+                    "Accept": "text/event-stream"
+                    if sample.data.get("stream", False)
+                    else "application/json",
+                },
+            )
+        )
 
     def wait_for_all_complete(self, timeout: float | None = None):
         """Wait (blocking) for all pending queries to complete.
@@ -123,8 +133,3 @@ def shutdown(self):
 
         if self.response_task:
             self.response_task.cancel()
-
-    def process_sample_data(self, s_uuid: int, sample_data: Any):
-        raise NotImplementedError(
-            "HttpClientSampleIssuer does not implement process_sample_data"
-        )
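The issuer now negotiates the response format per query: SSE for streaming samples, plain JSON otherwise. The selection logic in isolation, with a hypothetical build_headers helper:

    # Sketch: per-query header selection (hypothetical helper).
    def build_headers(sample_data: dict) -> dict[str, str]:
        streaming = sample_data.get("stream", False)
        return {
            "Content-Type": "application/json",
            # SSE for streaming responses, plain JSON otherwise
            "Accept": "text/event-stream" if streaming else "application/json",
        }

    print(build_headers({"stream": True}))   # Accept: text/event-stream
    print(build_headers({"prompt": "hi"}))   # Accept: application/json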

src/inference_endpoint/endpoint_client/worker.py

Lines changed: 2 additions & 4 deletions
@@ -266,18 +266,16 @@ async def _make_http_request(self, query: Query):
             return
 
         url = self.http_config.endpoint_url
-        headers = query.headers if hasattr(query, "headers") else {}
-
         logging.debug(
-            f"Making HTTP request to {url} with payload: {query} and headers: {headers}"
+            f"Making HTTP request to {url} with query: {query} and headers: {query.headers}"
         )
 
         # Encode query to bytes using adapter
         payload_bytes = self._adapter.encode_query(query)
 
         # Issue the request with pre-encoded bytes
         async with self._session.post(
-            url, data=payload_bytes, headers=headers
+            url, data=payload_bytes, headers=query.headers
         ) as response:
             if response.status != 200:
                 error_text = await response.text()
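The hasattr guard could go because Query is expected to always carry a headers field. A stand-in sketch of that assumption (the real Query lives in inference_endpoint.core.types):

    # Sketch: Query always exposes .headers, so callers need no hasattr guard.
    from dataclasses import dataclass, field
    from typing import Any

    @dataclass
    class Query:
        id: str
        data: dict[str, Any]
        headers: dict[str, str] = field(default_factory=dict)

    q = Query(id="probe-0", data={"prompt": "hi"})
    print(q.headers)  # {} -- always present, never AttributeError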

src/inference_endpoint/endpoint_client/zmq_utils.py

Lines changed: 13 additions & 3 deletions
@@ -76,27 +76,37 @@ def close(self, linger_ms: int | None = None) -> None:
 class ZMQPushSocket(ZMQSocket):
     """Async wrapper for ZMQ PUSH socket."""
 
-    def __init__(self, context: zmq.asyncio.Context, address: str, config: ZMQConfig):
+    def __init__(
+        self,
+        context: zmq.asyncio.Context,
+        address: str,
+        config: ZMQConfig,
+        bind: bool = False,
+    ):
         """
         Initialize ZMQ push socket.
 
         Args:
             context: ZMQ context
             address: Socket address
             config: ZMQ configuration
+            bind: Whether to bind (True) or connect (False) to the address
         """
-        super().__init__(context, zmq.PUSH, address, config, bind=False)
+        super().__init__(context, zmq.PUSH, address, config, bind=bind)
         self._encoder = msgspec.msgpack.Encoder()
 
     def _set_socket_options(self, config: ZMQConfig) -> None:
         """Set PUSH socket specific options."""
         self.socket.setsockopt(zmq.SNDHWM, config.zmq_high_water_mark)
         self.socket.setsockopt(zmq.SNDBUF, config.zmq_send_buffer_size)
         self.socket.setsockopt(zmq.SNDTIMEO, config.zmq_send_timeout)
+        self.socket.setsockopt(zmq.IMMEDIATE, config.zmq_immediate)
 
     @profile
     async def send(self, data: Any) -> None:
-        """Serialize to msgspec and send data through push socket."""
+        """
+        Serialize to msgspec and send data through push socket.
+        """
         serialized = self._encoder.encode(data)
         await self.socket.send(serialized, flags=zmq.NOBLOCK, copy=False)
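The new bind flag lets a PUSH socket own its endpoint instead of dialing one. In ZMQ either side of a PUSH/PULL pair may bind; conventionally the stable, long-lived side does. A plain pyzmq sketch of the two roles (the address is illustrative):

    # Sketch: bind vs connect on a PUSH/PULL pair in raw pyzmq.
    import zmq

    ctx = zmq.Context()

    # Stable side: owns the endpoint (bind=True in the wrapper above).
    pull = ctx.socket(zmq.PULL)
    pull.bind("ipc:///tmp/example-queue")

    # Transient side: dials in (the wrapper's default, bind=False).
    push = ctx.socket(zmq.PUSH)
    push.connect("ipc:///tmp/example-queue")

    push.send(b"hello")
    print(pull.recv())  # b"hello"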

src/inference_endpoint/openai/openai_msgspec_adapter.py

Lines changed: 15 additions & 11 deletions
@@ -37,24 +37,26 @@ class ChatMessage(msgspec.Struct, kw_only=True, omit_defaults=True):
 
     role: str
     content: str
-    name: str
+    name: str | None = None
 
 
 class ChatCompletionRequest(msgspec.Struct, kw_only=True, omit_defaults=True):
     """OpenAI chat completion request."""
 
     model: str
     messages: list[ChatMessage]
-    temperature: float
-    max_completion_tokens: int
-    stream: bool
-    top_p: float
-    n: int
-    stop: str | list[str]
-    presence_penalty: float
-    frequency_penalty: float
-    logit_bias: dict[str, float]
-    user: str
+    temperature: float | None = None
+    max_completion_tokens: int | None = None
+    stream: bool | None = None
+    top_p: float | None = None
+    top_k: int | None = None
+    repetition_penalty: float | None = None
+    n: int | None = None
+    stop: str | list[str] | None = None
+    presence_penalty: float | None = None
+    frequency_penalty: float | None = None
+    logit_bias: dict[str, float] | None = None
+    user: str | None = None
 
 
 class ChatCompletionResponseMessage(msgspec.Struct, kw_only=True, omit_defaults=True):
@@ -158,6 +160,8 @@ def to_endpoint_request(cls, query: Query) -> ChatCompletionRequest:
             max_completion_tokens=query.data.get("max_completion_tokens"),
             temperature=query.data.get("temperature"),
             top_p=query.data.get("top_p"),
+            top_k=query.data.get("top_k"),
+            repetition_penalty=query.data.get("repetition_penalty"),
             n=query.data.get("n"),
             presence_penalty=query.data.get("presence_penalty"),
             frequency_penalty=query.data.get("frequency_penalty"),
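Because these structs set omit_defaults=True, any field left at its None default disappears from the encoded request, which is what makes the relaxed defaults behave as "defer to the server". A self-contained msgspec sketch with a trimmed stand-in struct:

    # Sketch: omit_defaults=True drops fields still at their default, so a
    # request only carries the sampling params the caller actually set.
    import msgspec

    class ChatCompletionRequest(msgspec.Struct, kw_only=True, omit_defaults=True):
        model: str
        temperature: float | None = None
        top_k: int | None = None

    req = ChatCompletionRequest(model="my-model", top_k=40)
    print(msgspec.json.encode(req))
    # b'{"model":"my-model","top_k":40}'  -- temperature omitted entirely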
