LCORE-1472: Add e2e retry capability (#1500)

radofuchs · Radovan Fuchs · web-flow · commit fe354598a162 · 2026-04-14T13:46:08.000+02:00
* add retry functionality to flaky e2e tests

---------

Co-authored-by: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
@@ -12,6 +12,7 @@
 import time
 
 import requests
+from behave.contrib.scenario_autoretry import patch_scenario_with_autoretry
 from behave.model import Feature, Scenario
 from behave.runner import Context
 
@@ -37,6 +38,10 @@
 # Wall-clock start for each feature (on ``Feature``; survives Behave context resets).
 _E2E_FEATURE_PERF_START_ATTR = "_lightspeed_e2e_feature_perf_start"
 
+# Opt-in scenario retries for infrastructure flakiness (tag scenario with ``@flaky``).
+_E2E_FLAKY_TAG = "flaky"
+_E2E_FLAKY_MAX_ATTEMPTS = 5
+
 
 def _fetch_models_from_service() -> dict:
     """Query /v1/models endpoint and return first LLM model.
@@ -355,13 +360,27 @@ def before_feature(context: Context, feature: Feature) -> None:
 
     Records monotonic start time on ``feature`` for duration logging in
     ``after_feature`` (includes scenarios and feature teardown).
+
+    Scenarios tagged ``@flaky`` are patched to retry the full scenario up to
+    ``max_attempts`` times before accepting failure. The cap defaults to
+    ``_E2E_FLAKY_MAX_ATTEMPTS`` and can be overridden with the
+    ``E2E_FLAKY_MAX_ATTEMPTS`` environment variable.
     """
     setattr(feature, _E2E_FEATURE_PERF_START_ATTR, time.perf_counter())
     reset_active_lightspeed_stack_config_basename()
     context.active_lightspeed_stack_config_basename = None
     # One real Llama disruption per feature (module-level flag; survives context resets)
     reset_llama_stack_disrupt_once_tracking()
 
+    try:
+        max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
+    except ValueError:
+        max_flaky = _E2E_FLAKY_MAX_ATTEMPTS
+    if max_flaky > 1:
+        for scenario in feature.walk_scenarios():
+            if _E2E_FLAKY_TAG in scenario.effective_tags:
+                patch_scenario_with_autoretry(scenario, max_attempts=max_flaky)
+
     if "Feedback" in feature.tags:
         context.hostname = os.getenv("E2E_LSC_HOSTNAME", "localhost")
         context.port = os.getenv("E2E_LSC_PORT", "8080")
diff --git a/tests/e2e/features/mcp.feature b/tests/e2e/features/mcp.feature
@@ -19,7 +19,7 @@ Feature: MCP tests
     Then The status code of the response is 200
     And The body of the response contains mcp-file
 
-  @MCPFileAuthConfig
+  @MCPFileAuthConfig @flaky
   Scenario: Check if query endpoint succeeds when MCP file-based auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-file-auth.yaml configuration
@@ -36,7 +36,7 @@ Feature: MCP tests
         | Hello                     |
     And The token metrics should have increased
 
-  @MCPFileAuthConfig
+  @MCPFileAuthConfig @flaky
   Scenario: Check if streaming_query endpoint succeeds when MCP file-based auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-file-auth.yaml configuration
@@ -128,7 +128,7 @@ Feature: MCP tests
     Then The status code of the response is 200
     And The body of the response contains mcp-kubernetes
 
-  @MCPKubernetesAuthConfig
+  @MCPKubernetesAuthConfig @flaky
   Scenario: Check if query endpoint succeeds when MCP kubernetes auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-kubernetes-auth.yaml configuration
@@ -145,7 +145,7 @@ Feature: MCP tests
         | Hello                     |
     And The token metrics should have increased
 
-  @MCPKubernetesAuthConfig
+  @MCPKubernetesAuthConfig @flaky
   Scenario: Check if streaming_query endpoint succeeds when MCP kubernetes auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-kubernetes-auth.yaml configuration
@@ -239,7 +239,7 @@ Feature: MCP tests
     Then The status code of the response is 200
     And The body of the response contains mcp-client
 
-  @MCPClientAuthConfig
+  @MCPClientAuthConfig @flaky
   Scenario: Check if query endpoint succeeds when MCP client-provided auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -259,7 +259,7 @@ Feature: MCP tests
         | Hello                     |
     And The token metrics should have increased
 
-  @MCPClientAuthConfig
+  @MCPClientAuthConfig @flaky
   Scenario: Check if streaming_query endpoint succeeds when MCP client-provided auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -289,7 +289,7 @@ Feature: MCP tests
     Then The status code of the response is 200
     And The body of the response does not contain mcp-client
 
-  @MCPClientAuthConfig
+  @MCPClientAuthConfig @flaky
   Scenario: Check if query endpoint succeeds by skipping when MCP client-provided auth token is omitted
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -306,7 +306,7 @@ Feature: MCP tests
         | Hello                     |
     And The token metrics should have increased
 
-  @MCPClientAuthConfig
+  @MCPClientAuthConfig @flaky
   Scenario: Check if streaming_query endpoint succeeds by skipping when MCP client-provided auth token is omitted
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-client-auth.yaml configuration
@@ -407,7 +407,7 @@ Feature: MCP tests
     Then The status code of the response is 200
     And The body of the response contains mcp-oauth
 
-  @MCPOAuthAuthConfig
+  @MCPOAuthAuthConfig @flaky
   Scenario: Check if query endpoint succeeds when MCP OAuth auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-oauth-auth.yaml configuration
@@ -427,7 +427,7 @@ Feature: MCP tests
         | Hello                     |
     And The token metrics should have increased
 
-  @MCPOAuthAuthConfig
+  @MCPOAuthAuthConfig @flaky
   Scenario: Check if streaming_query endpoint succeeds when MCP OAuth auth token is passed
     Given MCP toolgroups are reset for a new MCP configuration
       And The service uses the lightspeed-stack-mcp-oauth-auth.yaml configuration
diff --git a/tests/e2e/features/query.feature b/tests/e2e/features/query.feature
@@ -10,6 +10,7 @@ Feature: Query endpoint API tests
       And The service uses the lightspeed-stack-auth-noop-token.yaml configuration
       And The service is restarted
 
+  @flaky
   Scenario: Check if LLM responds properly to restrictive system prompt to sent question with different system prompt
     And I capture the current token metrics
     When I use "query" to ask question with authorization header
@@ -22,6 +23,7 @@ Feature: Query endpoint API tests
           | ask                       |
       And The token metrics should have increased
 
+  @flaky
   Scenario: Check if LLM responds properly to non-restrictive system prompt to sent question with different system prompt
     And I capture the current token metrics
     When I use "query" to ask question with authorization header
diff --git a/tests/e2e/features/responses.feature b/tests/e2e/features/responses.feature
@@ -334,6 +334,7 @@ Feature: Responses endpoint API tests
       And The body of the response contains beta
       And The responses conversation id matches the first stored conversation
 
+  @flaky
   Scenario: Responses forks to a new conversation when previous_response_id is not the latest turn
     Given The system is in default state
     When I use "responses" to ask question with authorization header
diff --git a/tests/e2e/features/responses_streaming.feature b/tests/e2e/features/responses_streaming.feature
@@ -325,6 +325,7 @@ Feature: Responses endpoint streaming API tests
         }
         """
 
+  @flaky
   Scenario: Streaming responses continues a thread using previous_response_id from latest turn
     When I use "responses" to ask question with authorization header
     """
@@ -355,6 +356,7 @@ Feature: Responses endpoint streaming API tests
     Then The status code of the response is 200
       And The responses conversation id matches the multi-turn baseline
 
+  @flaky
   Scenario: Streaming responses continues a thread using conversation id
     When I use "responses" to ask question with authorization header
     """
@@ -376,6 +378,7 @@ Feature: Responses endpoint streaming API tests
       And The body of the response contains beta
       And The responses conversation id matches the first stored conversation
 
+  @flaky
   Scenario: Streaming responses forks to a new conversation when previous_response_id is not the latest turn  
     When I use "responses" to ask question with authorization header
     """
diff --git a/tests/e2e/features/steps/llm_query_response.py b/tests/e2e/features/steps/llm_query_response.py
@@ -25,6 +25,11 @@
 )
 
 
+def _response_contains_fragment(text: str, fragment: str) -> bool:
+    """Return whether *fragment* occurs in *text* as a substring (case-insensitive)."""
+    return fragment.lower() in text.lower()
+
+
 def _collect_output_item_types(response_body: dict[str, Any]) -> list[str]:
     """Collect ``type`` from each top-level ``output`` item in a Responses API JSON body."""
     output = cast(list[dict[str, Any]], response_body["output"])
@@ -238,6 +243,7 @@ def check_fragments_in_responses_output_text(context: Context) -> None:
     """Check that fragments from the scenario table appear in JSON ``output_text``.
 
     Used for POST ``/v1/responses`` (query endpoint uses the ``response`` field).
+    Matching is case-insensitive.
     """
     assert context.response is not None, "Request needs to be performed first"
     response_json = context.response.json()
@@ -250,9 +256,10 @@ def check_fragments_in_responses_output_text(context: Context) -> None:
 
     for fragment in context.table:
         expected = fragment["Fragments in LLM response"]
-        assert (
-            expected in output_text
-        ), f"Fragment '{expected}' not found in output_text: '{output_text}'"
+        assert _response_contains_fragment(output_text, expected), (
+            f"Fragment {expected!r} not found in output_text (case-insensitive): "
+            f"{output_text!r}"
+        )
 
 
 @then("The response should contain following fragments")
@@ -262,7 +269,7 @@ def check_fragments_in_response(context: Context) -> None:
     First checks that the HTTP response exists and contains a
     "response" field. For each fragment listed in the scenario's
     table under "Fragments in LLM response", asserts that it
-    appears as a substring in the LLM's response. Raises an
+    appears as a substring in the LLM's response (case-insensitive). Raises an
     assertion error if any fragment is missing or if the fragments
     table is not provided.
     """
@@ -288,9 +295,10 @@ def check_fragments_in_response(context: Context) -> None:
 
     for fragment in context.table:
         expected = fragment["Fragments in LLM response"]
-        assert (
-            expected in response
-        ), f"Fragment '{expected}' not found in LLM response: '{response}'"
+        assert _response_contains_fragment(response, expected), (
+            f"Fragment {expected!r} not found in LLM response (case-insensitive): "
+            f"{response!r}"
+        )
 
 
 @then("The streamed response should contain following fragments")
@@ -300,7 +308,7 @@ def check_streamed_fragments_in_response(context: Context) -> None:
     First checks that the HTTP response exists and contains a
     "response" field. For each fragment listed in the scenario's
     table under "Fragments in LLM response", asserts that it
-    appears as a substring in the LLM's response. Raises an
+    appears as a substring in the LLM's response (case-insensitive). Raises an
     assertion error if any fragment is missing or if the fragments
     table is not provided.
     """
@@ -311,9 +319,10 @@ def check_streamed_fragments_in_response(context: Context) -> None:
 
     for fragment in context.table:
         expected = fragment["Fragments in LLM response"]
-        assert (
-            expected in response
-        ), f"Fragment '{expected}' not found in LLM response: '{response}'"
+        assert _response_contains_fragment(response, expected), (
+            f"Fragment {expected!r} not found in streamed LLM response "
+            f"(case-insensitive): {response!r}"
+        )
 
 
 @then("The streamed response contains error message {message}")
diff --git a/tests/e2e/features/streaming_query.feature b/tests/e2e/features/streaming_query.feature
@@ -10,7 +10,6 @@ Feature: streaming_query endpoint API tests
       And The service uses the lightspeed-stack-auth-noop-token.yaml configuration
       And The service is restarted
 
-
   Scenario: Check if streaming_query response in tokens matches the full response
     And I use "streaming_query" to ask question with authorization header
     """
@@ -20,6 +19,7 @@ Feature: streaming_query endpoint API tests
      Then The status code of the response is 200
       And The streamed response is equal to the full response
 
+  @flaky
   Scenario: Check if LLM responds properly to restrictive system prompt to sent question with different system prompt
       And I capture the current token metrics
       And I use "streaming_query" to ask question with authorization header
@@ -33,6 +33,7 @@ Feature: streaming_query endpoint API tests
           | questions                 |
       And The token metrics should have increased
 
+  @flaky
   Scenario: Check if LLM responds properly to non-restrictive system prompt to sent question with different system prompt
       And I capture the current token metrics
       And I use "streaming_query" to ask question with authorization header