LCORE-1860: Add E2E tests for degraded mode startup scenarios

anik120 · anik120 · commit 24d5db91b73d · 2026-06-24T15:52:39.000-04:00
Add end-to-end tests that verify LCORE behavior when it starts without
llama-stack while allow_degraded_mode is enabled.
diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-degraded-mode.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-degraded-mode.yaml
@@ -0,0 +1,25 @@
+name: Lightspeed Core Service (LCS) - Degraded Mode Test
+service:
+  host: 0.0.0.0
+  port: 8080
+  auth_enabled: false
+  workers: 1
+  color_log: true
+  access_log: true
+llama_stack:
+  # Server mode - connects to separate llama-stack service
+  use_as_library_client: false
+  url: http://${env.E2E_LLAMA_HOSTNAME}:8321
+  api_key: xyzzy
+  # Enable degraded mode to allow startup without llama-stack
+  allow_degraded_mode: true
+user_data_collection:
+  feedback_enabled: true
+  feedback_storage: "/tmp/data/feedback"
+  transcripts_enabled: true
+  transcripts_storage: "/tmp/data/transcripts"
+authentication:
+  module: "noop"
+inference:
+  default_provider: openai
+  default_model: gpt-4o-mini
diff --git a/tests/e2e/features/degraded_mode_startup.feature b/tests/e2e/features/degraded_mode_startup.feature
@@ -0,0 +1,64 @@
+@e2e_group_3 @skip-in-library-mode @Authorized
+Feature: Degraded mode startup
+
+  End-to-end scenarios that test LCORE startup behavior when llama-stack
+  is NOT available at startup time and allow_degraded_mode is enabled.
+
+  These tests verify that LCORE can start without llama-stack and continue
+  to serve health/metrics endpoints in degraded mode.
+
+  Background:
+    Given The service is started locally
+      And The system is in default state
+      And REST API service prefix is /v1
+      And the Lightspeed stack configuration directory is "tests/e2e/configuration"
+
+  @skip-health-check
+  Scenario: Service starts in degraded mode when llama-stack is not running
+    Given The llama-stack connection is disrupted
+      And The service uses the lightspeed-stack-degraded-mode.yaml configuration
+      And The service is restarted
+    When I access endpoint "liveness" using HTTP GET method
+    Then The status code of the response is 200
+    And The body of the response is the following
+    """
+    {"alive": true}
+    """
+
+  @skip-health-check
+  Scenario: Readiness endpoint reports degraded state when started without llama-stack
+    Given The llama-stack connection is disrupted
+      And The service uses the lightspeed-stack-degraded-mode.yaml configuration
+      And The service is restarted
+    When I access endpoint "readiness" using HTTP GET method
+    Then The status code of the response is 503
+    And The body of the response, ignoring the "providers" field, is the following
+    """
+    {"ready": false, "reason": "Cannot connect to backend service", "overall_status": "unhealthy", "impacts": ["LLM inference unavailable", "Provider health checks unavailable"]}
+    """
+
+  @skip-health-check
+  Scenario: Metrics endpoint works in degraded mode
+    Given The llama-stack connection is disrupted
+      And The service uses the lightspeed-stack-degraded-mode.yaml configuration
+      And The service is restarted
+    When I access endpoint "metrics" using HTTP GET method
+    Then The status code of the response is 200
+    And The response body contains "ls_started_in_degraded_mode"
+
+  @skip-health-check
+  Scenario: Degraded mode metric is set to 1.0 when started without llama-stack
+    Given The llama-stack connection is disrupted
+      And The service uses the lightspeed-stack-degraded-mode.yaml configuration
+      And The service is restarted
+    When I access endpoint "metrics" using HTTP GET method
+    Then The status code of the response is 200
+    And The response body contains "ls_started_in_degraded_mode 1.0"
+
+  Scenario: Degraded mode metric is set to 0.0 when started with llama-stack
+    Given Llama Stack is restarted
+      And The service uses the lightspeed-stack-degraded-mode.yaml configuration
+      And The service is restarted
+    When I access endpoint "metrics" using HTTP GET method
+    Then The status code of the response is 200
+    And The response body contains "ls_started_in_degraded_mode 0.0"
diff --git a/tests/e2e/features/steps/common.py b/tests/e2e/features/steps/common.py
@@ -190,14 +190,21 @@ def restart_service(context: Context) -> None:
     ``The service uses the ... configuration`` step did not change the active
     YAML), skips the restart and clears the flag.
 
+    If the scenario has the @skip-health-check tag, skips waiting for Docker
+    health checks (used for degraded mode tests where service starts but
+    fails health checks due to llama-stack unavailability).
+
     Parameters:
     ----------
         context (Context): Behave context.
     """
     if getattr(context, "lightspeed_stack_skip_restart", False):
         context.lightspeed_stack_skip_restart = False
         return
-    restart_container("lightspeed-stack")
+
+    # Check if scenario has @skip-health-check tag
+    skip_health = "skip-health-check" in context.tags
+    restart_container("lightspeed-stack", skip_health_check=skip_health)
 
 
 @given("The system is in default state")
diff --git a/tests/e2e/features/steps/common_http.py b/tests/e2e/features/steps/common_http.py
@@ -112,6 +112,19 @@ def check_response_body_does_not_contain(context: Context, substring: str) -> No
     ), f"The response text '{context.response.text}' contains '{substring}'"
 
 
+@then('The response body contains "{substring}"')
+def check_response_contains_substring(context: Context, substring: str) -> None:
+    """Check that the response body contains the given substring.
+
+    Performs a case-sensitive substring search on the raw response text;
+    on failure, the message shows the first 200 characters of the body.
+    """
+    assert context.response is not None, "Request needs to be performed first"
+    assert (
+        substring in context.response.text
+    ), f"Expected '{substring}' in response, but got: {context.response.text[:200]}"
+
+
 @then("The body of the response is the following")
 def check_prediction_result(context: Context) -> None:
     """Check the content of the response to be exactly the same.
diff --git a/tests/e2e/utils/utils.py b/tests/e2e/utils/utils.py
@@ -181,7 +181,9 @@ def validate_json(message: Any, schema: Any) -> None:
         assert False, "The provided schema is faulty:" + str(e)
 
 
-def wait_for_container_health(container_name: str, max_attempts: int = 6) -> None:
+def wait_for_container_health(
+    container_name: str, max_attempts: int = 6, skip_health_check: bool = False
+) -> None:
     """Wait for container to be healthy.
 
     Polls a Docker container until its health status becomes `healthy` or the
@@ -200,7 +202,28 @@ def wait_for_container_health(container_name: str, max_attempts: int = 6) -> Non
     ----------
         container_name (str): Docker container name or ID to check.
         max_attempts (int): Maximum number of health check attempts (default 6).
+        skip_health_check (bool): If True, skip health check (for degraded mode tests).
     """
+    if skip_health_check:
+        print(f"Skipping health check for {container_name} (degraded mode test)")
+        # Wait for container to be responsive (check /liveness instead of Docker health)
+        import requests
+
+        for attempt in range(30):  # 30 attempts = ~30 seconds max
+            try:
+                response = requests.get("http://localhost:8080/liveness", timeout=2)
+                if response.status_code == 200:
+                    print(f"{container_name} is responsive (degraded mode)")
+                    return
+            except (requests.ConnectionError, requests.Timeout):
+                pass
+
+            if attempt < 29:
+                time.sleep(1)
+
+        print(f"{container_name} did not respond after 30 attempts (degraded mode)")
+        return
+
     if is_prow_environment():
         wait_for_pod_health(container_name, max_attempts)
         return
@@ -435,9 +458,13 @@ def clear_llama_stack_storage(container_name: str = "lightspeed-stack") -> None:
         raise
 
 
-def restart_container(container_name: str) -> None:
+def restart_container(container_name: str, skip_health_check: bool = False) -> None:
     """Restart a Docker container by name and wait until it is healthy.
 
+    Parameters:
+        container_name: Name of the Docker container to restart.
+        skip_health_check: If True, skip health check (for degraded mode tests).
+
     Returns:
         None
 
@@ -470,7 +497,9 @@ def restart_container(container_name: str) -> None:
     # Library mode embeds llama-stack, so the container takes longer to start
     # (~45-60s vs ~10s in server mode).  Use a generous attempt count so
     # MCP-auth scenarios that restart the container don't time out.
-    wait_for_container_health(container_name, max_attempts=12)
+    wait_for_container_health(
+        container_name, max_attempts=12, skip_health_check=skip_health_check
+    )
 
     if container_name == "llama-stack":
         from tests.e2e.features.steps.health import (