Skip to content

Commit 24d5db9

Browse files
committed
LCORE-1860: Add E2E tests for degraded mode startup scenarios
Add end-to-end tests to verify LCORE behavior when it starts without llama-stack and allow_degraded_mode is enabled.
1 parent ab86328 commit 24d5db9

5 files changed

Lines changed: 142 additions & 4 deletions

File tree

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
name: Lightspeed Core Service (LCS) - Degraded Mode Test
2+
service:
3+
host: 0.0.0.0
4+
port: 8080
5+
auth_enabled: false
6+
workers: 1
7+
color_log: true
8+
access_log: true
9+
llama_stack:
10+
# Server mode - connects to separate llama-stack service
11+
use_as_library_client: false
12+
url: http://${env.E2E_LLAMA_HOSTNAME}:8321
13+
api_key: xyzzy
14+
# Enable degraded mode to allow startup without llama-stack
15+
allow_degraded_mode: true
16+
user_data_collection:
17+
feedback_enabled: true
18+
feedback_storage: "/tmp/data/feedback"
19+
transcripts_enabled: true
20+
transcripts_storage: "/tmp/data/transcripts"
21+
authentication:
22+
module: "noop"
23+
inference:
24+
default_provider: openai
25+
default_model: gpt-4o-mini
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
@e2e_group_3 @skip-in-library-mode @Authorized
2+
Feature: Degraded mode startup
3+
4+
End-to-end scenarios that test LCORE startup behavior when llama-stack
5+
is NOT available at startup time and allow_degraded_mode is enabled.
6+
7+
These tests verify that LCORE can start without llama-stack and continue
8+
to serve health/metrics endpoints in degraded mode.
9+
10+
Background:
11+
Given The service is started locally
12+
And The system is in default state
13+
And REST API service prefix is /v1
14+
And the Lightspeed stack configuration directory is "tests/e2e/configuration"
15+
16+
@skip-health-check
17+
Scenario: Service starts in degraded mode when llama-stack is not running
18+
Given The llama-stack connection is disrupted
19+
And The service uses the lightspeed-stack-degraded-mode.yaml configuration
20+
And The service is restarted
21+
When I access endpoint "liveness" using HTTP GET method
22+
Then The status code of the response is 200
23+
And The body of the response is the following
24+
"""
25+
{"alive": true}
26+
"""
27+
28+
@skip-health-check
29+
Scenario: Readiness endpoint reports degraded state when started without llama-stack
30+
Given The llama-stack connection is disrupted
31+
And The service uses the lightspeed-stack-degraded-mode.yaml configuration
32+
And The service is restarted
33+
When I access endpoint "readiness" using HTTP GET method
34+
Then The status code of the response is 503
35+
And The body of the response, ignoring the "providers" field, is the following
36+
"""
37+
{"ready": false, "reason": "Cannot connect to backend service", "overall_status": "unhealthy", "impacts": ["LLM inference unavailable", "Provider health checks unavailable"]}
38+
"""
39+
40+
@skip-health-check
41+
Scenario: Metrics endpoint works in degraded mode
42+
Given The llama-stack connection is disrupted
43+
And The service uses the lightspeed-stack-degraded-mode.yaml configuration
44+
And The service is restarted
45+
When I access endpoint "metrics" using HTTP GET method
46+
Then The status code of the response is 200
47+
And The response body contains "ls_started_in_degraded_mode"
48+
49+
@skip-health-check
50+
Scenario: Degraded mode metric is set to 1.0 when started without llama-stack
51+
Given The llama-stack connection is disrupted
52+
And The service uses the lightspeed-stack-degraded-mode.yaml configuration
53+
And The service is restarted
54+
When I access endpoint "metrics" using HTTP GET method
55+
Then The status code of the response is 200
56+
And The response body contains "ls_started_in_degraded_mode 1.0"
57+
58+
Scenario: Degraded mode metric is set to 0.0 when started with llama-stack
59+
Given Llama Stack is restarted
60+
And The service uses the lightspeed-stack-degraded-mode.yaml configuration
61+
And The service is restarted
62+
When I access endpoint "metrics" using HTTP GET method
63+
Then The status code of the response is 200
64+
And The response body contains "ls_started_in_degraded_mode 0.0"

tests/e2e/features/steps/common.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,14 +190,21 @@ def restart_service(context: Context) -> None:
190190
``The service uses the ... configuration`` step did not change the active
191191
YAML), skips the restart and clears the flag.
192192
193+
If the scenario has the @skip-health-check tag, skips waiting for Docker
194+
health checks (used for degraded mode tests where service starts but
195+
fails health checks due to llama-stack unavailability).
196+
193197
Parameters:
194198
----------
195199
context (Context): Behave context.
196200
"""
197201
if getattr(context, "lightspeed_stack_skip_restart", False):
198202
context.lightspeed_stack_skip_restart = False
199203
return
200-
restart_container("lightspeed-stack")
204+
205+
# Check if scenario has @skip-health-check tag
206+
skip_health = "skip-health-check" in context.tags
207+
restart_container("lightspeed-stack", skip_health_check=skip_health)
201208

202209

203210
@given("The system is in default state")

tests/e2e/features/steps/common_http.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,19 @@ def check_response_body_does_not_contain(context: Context, substring: str) -> No
112112
), f"The response text '{context.response.text}' contains '{substring}'"
113113

114114

115+
@then('The response body contains "{substring}"')
def check_response_contains_substring(context: Context, substring: str) -> None:
    """Check that the response body contains the given substring.

    The comparison is a plain, case-sensitive containment test against the raw
    response text (no parsing or normalisation is applied), which makes it
    usable for metrics output and other plain-text responses.

    Parameters:
    ----------
        context (Context): Behave context; ``context.response`` must hold the
            response of a previously performed request.
        substring (str): Text expected to occur somewhere in the response body.
    """
    assert context.response is not None, "Request needs to be performed first"

    body = context.response.text

    # Long bodies are shortened to keep the failure output readable. Mark the
    # cut explicitly so a partial preview is not mistaken for the full body.
    preview_limit = 200
    preview = body[:preview_limit]
    if len(body) > preview_limit:
        preview += f"... (truncated, {len(body)} characters in total)"

    assert (
        substring in body
    ), f"Expected '{substring}' in response, but got: {preview}"
126+
127+
115128
@then("The body of the response is the following")
116129
def check_prediction_result(context: Context) -> None:
117130
"""Check the content of the response to be exactly the same.

tests/e2e/utils/utils.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ def validate_json(message: Any, schema: Any) -> None:
181181
assert False, "The provided schema is faulty:" + str(e)
182182

183183

184-
def wait_for_container_health(container_name: str, max_attempts: int = 6) -> None:
184+
def wait_for_container_health(
185+
container_name: str, max_attempts: int = 6, skip_health_check: bool = False
186+
) -> None:
185187
"""Wait for container to be healthy.
186188
187189
Polls a Docker container until its health status becomes `healthy` or the
@@ -200,7 +202,28 @@ def wait_for_container_health(container_name: str, max_attempts: int = 6) -> Non
200202
----------
201203
container_name (str): Docker container name or ID to check.
202204
max_attempts (int): Maximum number of health check attempts (default 6).
205+
skip_health_check (bool): If True, skip health check (for degraded mode tests).
203206
"""
207+
if skip_health_check:
208+
print(f"Skipping health check for {container_name} (degraded mode test)")
209+
# Wait for container to be responsive (check /liveness instead of Docker health)
210+
import requests
211+
212+
for attempt in range(30): # 30 attempts = ~30 seconds max
213+
try:
214+
response = requests.get("http://localhost:8080/liveness", timeout=2)
215+
if response.status_code == 200:
216+
print(f"{container_name} is responsive (degraded mode)")
217+
return
218+
except (requests.ConnectionError, requests.Timeout):
219+
pass
220+
221+
if attempt < 29:
222+
time.sleep(1)
223+
224+
print(f"{container_name} did not respond after 30 attempts (degraded mode)")
225+
return
226+
204227
if is_prow_environment():
205228
wait_for_pod_health(container_name, max_attempts)
206229
return
@@ -435,9 +458,13 @@ def clear_llama_stack_storage(container_name: str = "lightspeed-stack") -> None:
435458
raise
436459

437460

438-
def restart_container(container_name: str) -> None:
461+
def restart_container(container_name: str, skip_health_check: bool = False) -> None:
439462
"""Restart a Docker container by name and wait until it is healthy.
440463
464+
Parameters:
465+
container_name: Name of the Docker container to restart.
466+
skip_health_check: If True, skip health check (for degraded mode tests).
467+
441468
Returns:
442469
None
443470
@@ -470,7 +497,9 @@ def restart_container(container_name: str) -> None:
470497
# Library mode embeds llama-stack, so the container takes longer to start
471498
# (~45-60s vs ~10s in server mode). Use a generous attempt count so
472499
# MCP-auth scenarios that restart the container don't time out.
473-
wait_for_container_health(container_name, max_attempts=12)
500+
wait_for_container_health(
501+
container_name, max_attempts=12, skip_health_check=skip_health_check
502+
)
474503

475504
if container_name == "llama-stack":
476505
from tests.e2e.features.steps.health import (

0 commit comments

Comments
 (0)