Skip to content

Commit fe35459

Browse files
radofuchs and Radovan Fuchs authored
LCORE-1472: Add e2e retry capability (#1500)
* add retry functionality to flaky e2e tests --------- Co-authored-by: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
1 parent 05607dd commit fe35459

7 files changed

Lines changed: 57 additions & 22 deletions

File tree

tests/e2e/features/environment.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import time
1313

1414
import requests
15+
from behave.contrib.scenario_autoretry import patch_scenario_with_autoretry
1516
from behave.model import Feature, Scenario
1617
from behave.runner import Context
1718

@@ -37,6 +38,10 @@
3738
# Wall-clock start for each feature (on ``Feature``; survives Behave context resets).
3839
_E2E_FEATURE_PERF_START_ATTR = "_lightspeed_e2e_feature_perf_start"
3940

41+
# Opt-in scenario retries for infrastructure flakiness (tag scenario with ``@flaky``).
42+
_E2E_FLAKY_TAG = "flaky"
43+
_E2E_FLAKY_MAX_ATTEMPTS = 5
44+
4045

4146
def _fetch_models_from_service() -> dict:
4247
"""Query /v1/models endpoint and return first LLM model.
@@ -355,13 +360,27 @@ def before_feature(context: Context, feature: Feature) -> None:
355360
356361
Records monotonic start time on ``feature`` for duration logging in
357362
``after_feature`` (includes scenarios and feature teardown).
363+
364+
Scenarios tagged ``@flaky`` are patched to retry the full scenario up to
365+
``max_attempts`` times before accepting failure. The cap defaults to
366+
``_E2E_FLAKY_MAX_ATTEMPTS`` and can be overridden with the
367+
``E2E_FLAKY_MAX_ATTEMPTS`` environment variable.
358368
"""
359369
setattr(feature, _E2E_FEATURE_PERF_START_ATTR, time.perf_counter())
360370
reset_active_lightspeed_stack_config_basename()
361371
context.active_lightspeed_stack_config_basename = None
362372
# One real Llama disruption per feature (module-level flag; survives context resets)
363373
reset_llama_stack_disrupt_once_tracking()
364374

375+
try:
376+
max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
377+
except ValueError:
378+
max_flaky = _E2E_FLAKY_MAX_ATTEMPTS
379+
if max_flaky > 1:
380+
for scenario in feature.walk_scenarios():
381+
if _E2E_FLAKY_TAG in scenario.effective_tags:
382+
patch_scenario_with_autoretry(scenario, max_attempts=max_flaky)
383+
365384
if "Feedback" in feature.tags:
366385
context.hostname = os.getenv("E2E_LSC_HOSTNAME", "localhost")
367386
context.port = os.getenv("E2E_LSC_PORT", "8080")

tests/e2e/features/mcp.feature

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Feature: MCP tests
1919
Then The status code of the response is 200
2020
And The body of the response contains mcp-file
2121

22-
@MCPFileAuthConfig
22+
@MCPFileAuthConfig @flaky
2323
Scenario: Check if query endpoint succeeds when MCP file-based auth token is passed
2424
Given MCP toolgroups are reset for a new MCP configuration
2525
And The service uses the lightspeed-stack-mcp-file-auth.yaml configuration
@@ -36,7 +36,7 @@ Feature: MCP tests
3636
| Hello |
3737
And The token metrics should have increased
3838

39-
@MCPFileAuthConfig
39+
@MCPFileAuthConfig @flaky
4040
Scenario: Check if streaming_query endpoint succeeds when MCP file-based auth token is passed
4141
Given MCP toolgroups are reset for a new MCP configuration
4242
And The service uses the lightspeed-stack-mcp-file-auth.yaml configuration
@@ -128,7 +128,7 @@ Feature: MCP tests
128128
Then The status code of the response is 200
129129
And The body of the response contains mcp-kubernetes
130130

131-
@MCPKubernetesAuthConfig
131+
@MCPKubernetesAuthConfig @flaky
132132
Scenario: Check if query endpoint succeeds when MCP kubernetes auth token is passed
133133
Given MCP toolgroups are reset for a new MCP configuration
134134
And The service uses the lightspeed-stack-mcp-kubernetes-auth.yaml configuration
@@ -145,7 +145,7 @@ Feature: MCP tests
145145
| Hello |
146146
And The token metrics should have increased
147147

148-
@MCPKubernetesAuthConfig
148+
@MCPKubernetesAuthConfig @flaky
149149
Scenario: Check if streaming_query endpoint succeeds when MCP kubernetes auth token is passed
150150
Given MCP toolgroups are reset for a new MCP configuration
151151
And The service uses the lightspeed-stack-mcp-kubernetes-auth.yaml configuration
@@ -239,7 +239,7 @@ Feature: MCP tests
239239
Then The status code of the response is 200
240240
And The body of the response contains mcp-client
241241

242-
@MCPClientAuthConfig
242+
@MCPClientAuthConfig @flaky
243243
Scenario: Check if query endpoint succeeds when MCP client-provided auth token is passed
244244
Given MCP toolgroups are reset for a new MCP configuration
245245
And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -259,7 +259,7 @@ Feature: MCP tests
259259
| Hello |
260260
And The token metrics should have increased
261261

262-
@MCPClientAuthConfig
262+
@MCPClientAuthConfig @flaky
263263
Scenario: Check if streaming_query endpoint succeeds when MCP client-provided auth token is passed
264264
Given MCP toolgroups are reset for a new MCP configuration
265265
And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -289,7 +289,7 @@ Feature: MCP tests
289289
Then The status code of the response is 200
290290
And The body of the response does not contain mcp-client
291291

292-
@MCPClientAuthConfig
292+
@MCPClientAuthConfig @flaky
293293
Scenario: Check if query endpoint succeeds by skipping when MCP client-provided auth token is omitted
294294
Given MCP toolgroups are reset for a new MCP configuration
295295
And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -306,7 +306,7 @@ Feature: MCP tests
306306
| Hello |
307307
And The token metrics should have increased
308308

309-
@MCPClientAuthConfig
309+
@MCPClientAuthConfig @flaky
310310
Scenario: Check if streaming_query endpoint succeeds by skipping when MCP client-provided auth token is omitted
311311
Given MCP toolgroups are reset for a new MCP configuration
312312
And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -407,7 +407,7 @@ Feature: MCP tests
407407
Then The status code of the response is 200
408408
And The body of the response contains mcp-oauth
409409

410-
@MCPOAuthAuthConfig
410+
@MCPOAuthAuthConfig @flaky
411411
Scenario: Check if query endpoint succeeds when MCP OAuth auth token is passed
412412
Given MCP toolgroups are reset for a new MCP configuration
413413
And The service uses the lightspeed-stack-mcp-oauth-auth.yaml configuration
@@ -427,7 +427,7 @@ Feature: MCP tests
427427
| Hello |
428428
And The token metrics should have increased
429429

430-
@MCPOAuthAuthConfig
430+
@MCPOAuthAuthConfig @flaky
431431
Scenario: Check if streaming_query endpoint succeeds when MCP OAuth auth token is passed
432432
Given MCP toolgroups are reset for a new MCP configuration
433433
And The service uses the lightspeed-stack-mcp-oauth-auth.yaml configuration

tests/e2e/features/query.feature

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Feature: Query endpoint API tests
1010
And The service uses the lightspeed-stack-auth-noop-token.yaml configuration
1111
And The service is restarted
1212

13+
@flaky
1314
Scenario: Check if LLM responds properly to restrictive system prompt to sent question with different system prompt
1415
And I capture the current token metrics
1516
When I use "query" to ask question with authorization header
@@ -22,6 +23,7 @@ Feature: Query endpoint API tests
2223
| ask |
2324
And The token metrics should have increased
2425

26+
@flaky
2527
Scenario: Check if LLM responds properly to non-restrictive system prompt to sent question with different system prompt
2628
And I capture the current token metrics
2729
When I use "query" to ask question with authorization header

tests/e2e/features/responses.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ Feature: Responses endpoint API tests
334334
And The body of the response contains beta
335335
And The responses conversation id matches the first stored conversation
336336

337+
@flaky
337338
Scenario: Responses forks to a new conversation when previous_response_id is not the latest turn
338339
Given The system is in default state
339340
When I use "responses" to ask question with authorization header

tests/e2e/features/responses_streaming.feature

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ Feature: Responses endpoint streaming API tests
325325
}
326326
"""
327327

328+
@flaky
328329
Scenario: Streaming responses continues a thread using previous_response_id from latest turn
329330
When I use "responses" to ask question with authorization header
330331
"""
@@ -355,6 +356,7 @@ Feature: Responses endpoint streaming API tests
355356
Then The status code of the response is 200
356357
And The responses conversation id matches the multi-turn baseline
357358

359+
@flaky
358360
Scenario: Streaming responses continues a thread using conversation id
359361
When I use "responses" to ask question with authorization header
360362
"""
@@ -376,6 +378,7 @@ Feature: Responses endpoint streaming API tests
376378
And The body of the response contains beta
377379
And The responses conversation id matches the first stored conversation
378380

381+
@flaky
379382
Scenario: Streaming responses forks to a new conversation when previous_response_id is not the latest turn
380383
When I use "responses" to ask question with authorization header
381384
"""

tests/e2e/features/steps/llm_query_response.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
)
2626

2727

28+
def _response_contains_fragment(text: str, fragment: str) -> bool:
29+
"""Return whether *fragment* occurs in *text* as a substring (case-insensitive)."""
30+
return fragment.lower() in text.lower()
31+
32+
2833
def _collect_output_item_types(response_body: dict[str, Any]) -> list[str]:
2934
"""Collect ``type`` from each top-level ``output`` item in a Responses API JSON body."""
3035
output = cast(list[dict[str, Any]], response_body["output"])
@@ -238,6 +243,7 @@ def check_fragments_in_responses_output_text(context: Context) -> None:
238243
"""Check that fragments from the scenario table appear in JSON ``output_text``.
239244
240245
Used for POST ``/v1/responses`` (query endpoint uses the ``response`` field).
246+
Matching is case-insensitive.
241247
"""
242248
assert context.response is not None, "Request needs to be performed first"
243249
response_json = context.response.json()
@@ -250,9 +256,10 @@ def check_fragments_in_responses_output_text(context: Context) -> None:
250256

251257
for fragment in context.table:
252258
expected = fragment["Fragments in LLM response"]
253-
assert (
254-
expected in output_text
255-
), f"Fragment '{expected}' not found in output_text: '{output_text}'"
259+
assert _response_contains_fragment(output_text, expected), (
260+
f"Fragment {expected!r} not found in output_text (case-insensitive): "
261+
f"{output_text!r}"
262+
)
256263

257264

258265
@then("The response should contain following fragments")
@@ -262,7 +269,7 @@ def check_fragments_in_response(context: Context) -> None:
262269
First checks that the HTTP response exists and contains a
263270
"response" field. For each fragment listed in the scenario's
264271
table under "Fragments in LLM response", asserts that it
265-
appears as a substring in the LLM's response. Raises an
272+
appears as a substring in the LLM's response (case-insensitive). Raises an
266273
assertion error if any fragment is missing or if the fragments
267274
table is not provided.
268275
"""
@@ -288,9 +295,10 @@ def check_fragments_in_response(context: Context) -> None:
288295

289296
for fragment in context.table:
290297
expected = fragment["Fragments in LLM response"]
291-
assert (
292-
expected in response
293-
), f"Fragment '{expected}' not found in LLM response: '{response}'"
298+
assert _response_contains_fragment(response, expected), (
299+
f"Fragment {expected!r} not found in LLM response (case-insensitive): "
300+
f"{response!r}"
301+
)
294302

295303

296304
@then("The streamed response should contain following fragments")
@@ -300,7 +308,7 @@ def check_streamed_fragments_in_response(context: Context) -> None:
300308
First checks that the HTTP response exists and contains a
301309
"response" field. For each fragment listed in the scenario's
302310
table under "Fragments in LLM response", asserts that it
303-
appears as a substring in the LLM's response. Raises an
311+
appears as a substring in the LLM's response (case-insensitive). Raises an
304312
assertion error if any fragment is missing or if the fragments
305313
table is not provided.
306314
"""
@@ -311,9 +319,10 @@ def check_streamed_fragments_in_response(context: Context) -> None:
311319

312320
for fragment in context.table:
313321
expected = fragment["Fragments in LLM response"]
314-
assert (
315-
expected in response
316-
), f"Fragment '{expected}' not found in LLM response: '{response}'"
322+
assert _response_contains_fragment(response, expected), (
323+
f"Fragment {expected!r} not found in streamed LLM response "
324+
f"(case-insensitive): {response!r}"
325+
)
317326

318327

319328
@then("The streamed response contains error message {message}")

tests/e2e/features/streaming_query.feature

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ Feature: streaming_query endpoint API tests
1010
And The service uses the lightspeed-stack-auth-noop-token.yaml configuration
1111
And The service is restarted
1212

13-
1413
Scenario: Check if streaming_query response in tokens matches the full response
1514
And I use "streaming_query" to ask question with authorization header
1615
"""
@@ -20,6 +19,7 @@ Feature: streaming_query endpoint API tests
2019
Then The status code of the response is 200
2120
And The streamed response is equal to the full response
2221

22+
@flaky
2323
Scenario: Check if LLM responds properly to restrictive system prompt to sent question with different system prompt
2424
And I capture the current token metrics
2525
And I use "streaming_query" to ask question with authorization header
@@ -33,6 +33,7 @@ Feature: streaming_query endpoint API tests
3333
| questions |
3434
And The token metrics should have increased
3535

36+
@flaky
3637
Scenario: Check if LLM responds properly to non-restrictive system prompt to sent question with different system prompt
3738
And I capture the current token metrics
3839
And I use "streaming_query" to ask question with authorization header

0 commit comments

Comments
 (0)