Skip to content

Commit 1527c3c

Browse files
committed
wip added inference streaming option, improve example
1 parent ea069f9 commit 1527c3c

File tree

3 files changed

+127
-27
lines changed

3 files changed

+127
-27
lines changed

datacrunch/InferenceClient/inference_client.py

Lines changed: 76 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from dataclasses_json import dataclass_json, Undefined # type: ignore
33
import requests
44
from requests.structures import CaseInsensitiveDict
5-
from typing import Optional, Dict, Any, Union
5+
from typing import Optional, Dict, Any, Union, Generator
66
from urllib.parse import urlparse
77

88

@@ -14,10 +14,76 @@ class InferenceClientError(Exception):
1414
@dataclass_json(undefined=Undefined.EXCLUDE)
1515
@dataclass
1616
class InferenceResponse:
17-
body: Any
1817
headers: CaseInsensitiveDict[str]
1918
status_code: int
2019
status_text: str
20+
_original_response: requests.Response
21+
_stream: bool = False
22+
23+
def _is_stream_response(self, headers: CaseInsensitiveDict[str]) -> bool:
24+
"""Check if the response headers indicate a streaming response.
25+
26+
Args:
27+
headers: The response headers to check
28+
29+
Returns:
30+
bool: True if the response is likely a stream, False otherwise
31+
"""
32+
# Standard chunked transfer encoding
33+
is_chunked_transfer = headers.get(
34+
'Transfer-Encoding', '').lower() == 'chunked'
35+
# Server-Sent Events content type
36+
is_event_stream = headers.get(
37+
'Content-Type', '').lower() == 'text/event-stream'
38+
# NDJSON
39+
is_ndjson = headers.get(
40+
'Content-Type', '').lower() == 'application/x-ndjson'
41+
# Stream JSON
42+
is_stream_json = headers.get(
43+
'Content-Type', '').lower() == 'application/stream+json'
44+
# Keep-alive
45+
is_keep_alive = headers.get(
46+
'Connection', '').lower() == 'keep-alive'
47+
# No content length
48+
has_no_content_length = 'Content-Length' not in headers
49+
50+
# No Content-Length with keep-alive often suggests streaming (though not definitive)
51+
is_keep_alive_and_no_content_length = is_keep_alive and has_no_content_length
52+
53+
return (self._stream or is_chunked_transfer or is_event_stream or is_ndjson or
54+
is_stream_json or is_keep_alive_and_no_content_length)
55+
56+
def output(self, is_text: bool = False) -> Any:
57+
try:
58+
if is_text:
59+
return self._original_response.text
60+
return self._original_response.json()
61+
except Exception as e:
62+
# if the response is a stream (check headers), raise relevant error
63+
if self._is_stream_response(self._original_response.headers):
64+
raise InferenceClientError(
65+
f"Response might be a stream, use the stream method instead")
66+
raise InferenceClientError(
67+
f"Failed to parse response as JSON: {str(e)}")
68+
69+
def stream(self, chunk_size: int = 512, as_text: bool = True) -> Generator[Any, None, None]:
70+
"""Stream the response content.
71+
72+
Args:
73+
chunk_size: Size of chunks to stream, in bytes
74+
as_text: If True, stream as text using iter_lines. If False, stream as binary using iter_content.
75+
76+
Returns:
77+
Generator yielding chunks of the response
78+
"""
79+
if as_text:
80+
for chunk in self._original_response.iter_lines(chunk_size=chunk_size):
81+
if chunk:
82+
yield chunk
83+
else:
84+
for chunk in self._original_response.iter_content(chunk_size=chunk_size):
85+
if chunk:
86+
yield chunk
2187

2288

2389
@dataclass_json(undefined=Undefined.EXCLUDE)
@@ -169,24 +235,24 @@ def _make_request(self, method: str, path: str, **kwargs) -> requests.Response:
169235
except requests.exceptions.RequestException as e:
170236
raise InferenceClientError(f"Request to {path} failed: {str(e)}")
171237

172-
def run_sync(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None):
173-
response = self.post(
174-
path, json=data, timeout_seconds=timeout_seconds, headers=headers)
238+
def run_sync(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None, http_method: str = "POST", stream: bool = False):
239+
response = self._make_request(
240+
http_method, path, json=data, timeout_seconds=timeout_seconds, headers=headers, stream=stream)
175241

176242
return InferenceResponse(
177-
body=response.json(),
178243
headers=response.headers,
179244
status_code=response.status_code,
180-
status_text=response.reason
245+
status_text=response.reason,
246+
_original_response=response
181247
)
182248

183-
def run(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None):
249+
def run(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None, http_method: str = "POST"):
184250
# Add the "Prefer: respond-async" header to the request, to indicate that the request is async
185251
headers = headers or {}
186252
headers['Prefer'] = 'respond-async'
187253

188-
response = self.post(
189-
path, json=data, timeout_seconds=timeout_seconds, headers=headers)
254+
response = self._make_request(
255+
http_method, path, json=data, timeout_seconds=timeout_seconds, headers=headers)
190256

191257
# TODO: this response format isn't final
192258
execution_id = response.json()['id']

datacrunch/containers/containers.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,14 +358,16 @@ def _validate_inference_client(self) -> None:
358358
raise ValueError(
359359
"Inference client not initialized. Use from_dict_with_inference_key or set_inference_client to initialize inference capabilities.")
360360

361-
def run_sync(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None) -> InferenceResponse:
361+
def run_sync(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None, http_method: str = "POST", stream: bool = False) -> InferenceResponse:
362362
"""Runs a synchronous inference request.
363363
364364
Args:
365365
data: The data to send in the request.
366366
path: The endpoint path to send the request to.
367367
timeout_seconds: Maximum time to wait for the response.
368368
headers: Optional headers to include in the request.
369+
http_method: The HTTP method to use for the request.
370+
stream: Whether to stream the response.
369371
370372
Returns:
371373
InferenceResponse: The response from the inference request.
@@ -374,16 +376,18 @@ def run_sync(self, data: Dict[str, Any], path: str = "", timeout_seconds: int =
374376
ValueError: If the inference client is not initialized.
375377
"""
376378
self._validate_inference_client()
377-
return self._inference_client.run_sync(data, path, timeout_seconds, headers)
379+
return self._inference_client.run_sync(data, path, timeout_seconds, headers, http_method, stream)
378380

379-
def run(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None):
381+
def run(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 * 5, headers: Optional[Dict[str, str]] = None, http_method: str = "POST", stream: bool = False):
380382
"""Runs an asynchronous inference request.
381383
382384
Args:
383385
data: The data to send in the request.
384386
path: The endpoint path to send the request to.
385387
timeout_seconds: Maximum time to wait for the response.
386388
headers: Optional headers to include in the request.
389+
http_method: The HTTP method to use for the request.
390+
stream: Whether to stream the response.
387391
388392
Returns:
389393
The response from the inference request.
@@ -392,7 +396,7 @@ def run(self, data: Dict[str, Any], path: str = "", timeout_seconds: int = 60 *
392396
ValueError: If the inference client is not initialized.
393397
"""
394398
self._validate_inference_client()
395-
return self._inference_client.run(data, path, timeout_seconds, headers)
399+
return self._inference_client.run(data, path, timeout_seconds, headers, http_method, stream)
396400

397401
def health(self):
398402
"""Checks the health of the deployed application.

examples/containers/sglang_deployment_example.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import time
99
import signal
1010
import sys
11+
import json
1112
from datetime import datetime
1213
from datacrunch import DataCrunchClient
1314
from datacrunch.exceptions import APIException
@@ -33,9 +34,9 @@
3334

3435
# Configuration constants
3536
DEPLOYMENT_NAME = f"sglang-deployment-example-{CURRENT_TIMESTAMP}"
36-
MODEL_PATH = "deepseek-ai/deepseek-llm-7b-chat"
37+
SGLANG_IMAGE_URL = "docker.io/lmsysorg/sglang:v0.4.1.post6-cu124"
38+
DEEPSEEK_MODEL_PATH = "deepseek-ai/deepseek-llm-7b-chat"
3739
HF_SECRET_NAME = "huggingface-token"
38-
IMAGE_URL = "docker.io/lmsysorg/sglang:v0.4.1.post6-cu124"
3940

4041
# Get confidential values from environment variables
4142
DATACRUNCH_CLIENT_ID = os.environ.get('DATACRUNCH_CLIENT_ID')
@@ -140,18 +141,19 @@ def graceful_shutdown(signum, frame) -> None:
140141
sys.exit(1)
141142

142143
# Create container configuration
144+
APP_PORT = 30000
143145
container = Container(
144-
image=IMAGE_URL,
145-
exposed_port=30000,
146+
image=SGLANG_IMAGE_URL,
147+
exposed_port=APP_PORT,
146148
healthcheck=HealthcheckSettings(
147149
enabled=True,
148-
port=30000,
150+
port=APP_PORT,
149151
path="/health"
150152
),
151153
entrypoint_overrides=EntrypointOverridesSettings(
152154
enabled=True,
153155
cmd=["python3", "-m", "sglang.launch_server", "--model-path",
154-
MODEL_PATH, "--host", "0.0.0.0", "--port", "30000"]
156+
DEEPSEEK_MODEL_PATH, "--host", "0.0.0.0", "--port", str(APP_PORT)]
155157
),
156158
env=[
157159
EnvVar(
@@ -162,16 +164,19 @@ def graceful_shutdown(signum, frame) -> None:
162164
]
163165
)
164166

165-
# Create scaling configuration - default values
167+
# Create scaling configuration
166168
scaling_options = ScalingOptions(
167169
min_replica_count=1,
168-
max_replica_count=2,
169-
scale_down_policy=ScalingPolicy(delay_seconds=300),
170-
scale_up_policy=ScalingPolicy(delay_seconds=300),
170+
max_replica_count=5,
171+
scale_down_policy=ScalingPolicy(delay_seconds=60 * 5),
172+
scale_up_policy=ScalingPolicy(
173+
delay_seconds=0), # No delay for scale up
171174
queue_message_ttl_seconds=500,
172-
concurrent_requests_per_replica=1,
175+
# Modern LLM engines are optimized for batching requests, with minimal performance impact. Taking advantage of batching can significantly improve throughput.
176+
concurrent_requests_per_replica=32,
173177
scaling_triggers=ScalingTriggers(
174-
queue_load=QueueLoadScalingTrigger(threshold=1),
178+
# lower value means more aggressive scaling
179+
queue_load=QueueLoadScalingTrigger(threshold=0.1),
175180
cpu_utilization=UtilizationScalingTrigger(
176181
enabled=True,
177182
threshold=90
@@ -224,7 +229,7 @@ def graceful_shutdown(signum, frame) -> None:
224229
# Test completions endpoint
225230
print("\nTesting completions API...")
226231
completions_data = {
227-
"model": MODEL_PATH,
232+
"model": DEEPSEEK_MODEL_PATH,
228233
"prompt": "Is consciousness fundamentally computational, or is there something more to subjective experience that cannot be reduced to information processing?",
229234
"max_tokens": 128,
230235
"temperature": 0.7,
@@ -239,6 +244,31 @@ def graceful_shutdown(signum, frame) -> None:
239244
print("Completions API is working!")
240245
print(f"Response: {completions_response}")
241246

247+
# Make a stream sync inference request to the SGLang server
248+
completions_response_stream = created_deployment.run_sync(
249+
completions_data,
250+
path="/v1/completions",
251+
stream=True
252+
)
253+
print("Stream completions API is working!")
254+
# Print the streamed response
255+
for line in completions_response_stream.stream(as_text=True):
256+
if line:
257+
line = line.decode('utf-8')
258+
259+
if line.startswith('data:'):
260+
data = line[5:] # Remove 'data: ' prefix
261+
if data == '[DONE]':
262+
break
263+
try:
264+
event_data = json.loads(data)
265+
token_text = event_data['choices'][0]['text']
266+
267+
# Print token immediately to show progress
268+
print(token_text, end='', flush=True)
269+
except json.JSONDecodeError:
270+
continue
271+
242272
except Exception as e:
243273
print(f"Error testing deployment: {e}")
244274

0 commit comments

Comments (0)