From 8100e83fb8aca91211c1b4f799d9a1f86b6e7bc3 Mon Sep 17 00:00:00 2001 From: Bob Strahan Date: Mon, 22 Jun 2026 16:42:25 +0000 Subject: [PATCH] fix(test-studio): return partial result instead of raising when metrics not cached (#358) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_test_results() in the test_results_resolver Lambda raised an unhandled ValueError ("Test run ... processing completed, evaluating results") when a run reached a terminal state but the evaluation aggregation never wrote testRunResult — i.e. aggregation is still running, timed out, or failed silently (reproduced on a 3463-document run). The exception surfaced as an opaque error and Test Studio spun on "Loading..." indefinitely. Return a structured partial TestRun (true status, file counts, and metadata; metric fields omitted) instead of raising. The GraphQL TestRun type already makes every metric field nullable, so the partial response is schema-valid. This also prevents a single not-yet-aggregated run from failing an entire compareTestRuns request. The deeper question of why aggregation can stall on very large runs is left as a documented follow-up. Add a regression unit test covering the terminal-status / no-cached- metrics path. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 2 + .../tests/unit/test_results_resolver.py | 45 +++++++++++++++++++ .../src/lambda/test_results_resolver/index.py | 30 ++++++++++--- 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e699cfe73..e228bb669 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,8 @@ SPDX-License-Identifier: MIT-0 ### Fixed +- **Test Studio results error for runs stuck in evaluation (#358)** — `getTestRun` (the `test_results_resolver` Lambda) raised an unhandled `ValueError` ("Test run … processing completed, evaluating results") when a run reached a terminal state but the evaluation-aggregation step never cached `testRunResult` — e.g. when aggregation is still running, timed out, or failed silently on a large run (the reporter hit this with 3 463 documents). The exception surfaced as an opaque error and the run spun on "Loading…" forever in Test Studio. The resolver now returns a structured partial `TestRun` (true status plus file counts and metadata, metric fields omitted) instead of raising, so the UI renders the in-progress/terminal state gracefully. This also stops a single not-yet-aggregated run from failing an entire `compareTestRuns` request. (The separate question of *why* aggregation can stall on very large runs is tracked as a follow-up.) + - **Build Info "update available" indicator broke against the public release bucket** — The `getLatestPublishedVersion` resolver discovered the newest published version by calling `ListObjectsV2` on the public artifacts bucket and parsing `idp-main_.yaml` keys. That bucket grants `GetObject` only (no listing), so the check failed on real public deployments. `idp-cli publish` now writes a small pointer object — `/idp-main-latest.json` (`{version, templateUrl}`) — at the version-stripped prefix on every release, and the resolver reads that one known key with a single `GetObject` (unsigned, falling back to signed), with a conventional `idp-main_.yaml` URL fallback if the pointer omits one. No version parsing or `ListObjectsV2`. The check stays disabled when `PUBLIC_ARTIFACTS_BUCKET` is unset. - **Private AppSync unreachable from browser clients (WorkSpaces, VPN, bastion)** — `scripts/vpc-endpoints.yaml` `VpcEndpointSecurityGroup` previously allowed inbound HTTPS (port 443) only from the Lambda security group. Browsers inside the VPC send AppSync GraphQL requests directly to the `appsync-api` VPC Interface Endpoint (not through the ALB), so all queries, mutations, and subscriptions hung indefinitely — the Configuration page showed "Loading configuration..." forever, the Document List never populated, and the Upload Documents page showed "Input bucket not configured". Fixed by adding a `VpcCidr` parameter and a second ingress rule for the VPC CIDR block. `deploy-vpc-endpoints.py` now auto-looks up the VPC primary CIDR via `ec2:DescribeVpcs` and passes it automatically — no CLI changes required. Re-run `deploy-vpc-endpoints.py` against an existing deployment to apply the fix. diff --git a/lib/idp_common_pkg/tests/unit/test_results_resolver.py b/lib/idp_common_pkg/tests/unit/test_results_resolver.py index b1a94c11a..2652e0119 100644 --- a/lib/idp_common_pkg/tests/unit/test_results_resolver.py +++ b/lib/idp_common_pkg/tests/unit/test_results_resolver.py @@ -213,6 +213,51 @@ def test_build_config_comparison(): assert "temperature" in [item["setting"] for item in config_diff] +@pytest.mark.unit +def test_get_test_results_missing_metrics_returns_partial_not_raises(): + """When processing reached a terminal state but the evaluation aggregation + never cached testRunResult (timed out / failed silently on a large run), + get_test_results returns a structured partial TestRun instead of raising an + opaque ValueError that leaves the UI spinning on "Loading..." (issue #358).""" + test_run_id = "TEST-SET-ID" + metadata = { + "PK": f"testrun#{test_run_id}", + "SK": "metadata", + # Already terminal, so the status-refresh branch is skipped and we fall + # straight through to the "no cached metrics" else branch. + "Status": "COMPLETE", + "TestSetId": "set-1", + "TestSetName": "big-classification-set", + "FilesCount": 3463, + "CompletedFiles": 3460, + "FailedFiles": 3, + "CreatedAt": "2025-01-01T00:00:00Z", + "Context": "ctx", + "ConfigVersion": "v7", + # No "testRunResult" key -> aggregation hasn't written metrics yet. + } + + mock_table = Mock() + mock_table.get_item.return_value = {"Item": metadata} + + with ( + patch.dict(os.environ, {"TRACKING_TABLE": "tracking"}), + patch.object(index.dynamodb, "Table", return_value=mock_table), + ): + result = index.get_test_results(test_run_id) + + assert result["testRunId"] == test_run_id + # Reports the true terminal status rather than fabricating one. + assert result["status"] == "COMPLETE" + assert result["filesCount"] == 3463 + assert result["completedFiles"] == 3460 + assert result["failedFiles"] == 3 + assert result["testSetId"] == "set-1" + assert result["configVersion"] == "v7" + # Metric fields are absent (not yet computed) but must not be required. + assert "overallAccuracy" not in result or result["overallAccuracy"] is None + + @pytest.mark.unit def test_handler_field_routing(): """Test GraphQL field routing""" diff --git a/nested/appsync/src/lambda/test_results_resolver/index.py b/nested/appsync/src/lambda/test_results_resolver/index.py index 545acf690..45fb0ffb5 100644 --- a/nested/appsync/src/lambda/test_results_resolver/index.py +++ b/nested/appsync/src/lambda/test_results_resolver/index.py @@ -408,16 +408,36 @@ def get_test_results(test_run_id): "config": _get_test_run_config(test_run_id), } else: - # Provide more specific message for ABORTED status + # No aggregate metrics have been cached yet. This happens when all + # files finished processing but the evaluation aggregation step hasn't + # written testRunResult (still running, or it timed out / failed on a + # large run). Don't raise — that surfaces as an opaque error and the UI + # spins on "Loading..." forever. Return a structured partial TestRun so + # the UI can render the in-progress status instead. if current_status == "ABORTED": - raise ValueError( - f"Test run {test_run_id} aborted, evaluating results for completed documents" + logger.info( + f"Test run {test_run_id} aborted; aggregate metrics not yet available" ) else: - raise ValueError( - f"Test run {test_run_id} processing completed, evaluating results" + logger.info( + f"Test run {test_run_id} processing complete; " + "aggregate metrics not yet available (evaluation in progress)" ) + return { + "testRunId": test_run_id, + "testSetId": metadata.get("TestSetId"), + "testSetName": metadata.get("TestSetName"), + "status": current_status, + "filesCount": metadata.get("FilesCount", 0), + "completedFiles": metadata.get("CompletedFiles", 0), + "failedFiles": metadata.get("FailedFiles", 0), + "createdAt": _format_datetime(metadata.get("CreatedAt")), + "completedAt": _format_datetime(metadata.get("CompletedAt")), + "context": metadata.get("Context"), + "configVersion": metadata.get("ConfigVersion"), + } + def _query_test_runs_from_gsi(table, start_iso, end_iso): """Query test runs from TypeDateIndex GSI instead of scanning the full table.