diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b003bf6e..4fcb3aee0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ SPDX-License-Identifier: MIT-0 ### Fixed +- **Test Studio results error for runs stuck in evaluation (#358)** — `getTestRun` (the `test_results_resolver` Lambda) raised an unhandled `ValueError` ("Test run … processing completed, evaluating results") when a run reached a terminal state but the evaluation-aggregation step never cached `testRunResult` — e.g. when aggregation is still running, timed out, or failed silently on a large run (the reporter hit this with 3 463 documents). The exception surfaced as an opaque error and the run spun on "Loading…" forever in Test Studio. The resolver now returns a structured partial `TestRun` (true status plus file counts and metadata, metric fields omitted) instead of raising, so the UI renders the in-progress/terminal state gracefully. This also stops a single not-yet-aggregated run from failing an entire `compareTestRuns` request. (The separate question of *why* aggregation can stall on very large runs is tracked as a follow-up.) - **Configuration version list silently truncated past the first page (#354)** — `ConfigurationManager.list_config_versions()` performed a single unpaginated `table.scan()` on the ConfigurationTable. Because a DynamoDB scan returns at most 1 MB per call, deployments with many config versions (e.g. 230+) only ever saw the ~58 that fit on the first page — uploaded-via-CLI and autotune-agent configs were invisible in the UI's View/Edit Configuration page and the upload-document config-version dropdown (the configs still worked when referenced by name). The method now paginates through `LastEvaluatedKey` so every version is returned. Fixes all callers (`update_configuration`, the AppSync `configuration_resolver`, `rules_discovery`, and the SDK). - **Build Info "update available" indicator broke against the public release bucket** — The `getLatestPublishedVersion` resolver discovered the newest published version by calling `ListObjectsV2` on the public artifacts bucket and parsing `idp-main_.yaml` keys. That bucket grants `GetObject` only (no listing), so the check failed on real public deployments. `idp-cli publish` now writes a small pointer object — `/idp-main-latest.json` (`{version, templateUrl}`) — at the version-stripped prefix on every release, and the resolver reads that one known key with a single `GetObject` (unsigned, falling back to signed), with a conventional `idp-main_.yaml` URL fallback if the pointer omits one. No version parsing or `ListObjectsV2`. The check stays disabled when `PUBLIC_ARTIFACTS_BUCKET` is unset. diff --git a/lib/idp_common_pkg/tests/unit/test_results_resolver.py b/lib/idp_common_pkg/tests/unit/test_results_resolver.py index b1a94c11a..2652e0119 100644 --- a/lib/idp_common_pkg/tests/unit/test_results_resolver.py +++ b/lib/idp_common_pkg/tests/unit/test_results_resolver.py @@ -213,6 +213,51 @@ def test_build_config_comparison(): assert "temperature" in [item["setting"] for item in config_diff] +@pytest.mark.unit +def test_get_test_results_missing_metrics_returns_partial_not_raises(): + """When processing reached a terminal state but the evaluation aggregation + never cached testRunResult (timed out / failed silently on a large run), + get_test_results returns a structured partial TestRun instead of raising an + opaque ValueError that leaves the UI spinning on "Loading..." (issue #358).""" + test_run_id = "TEST-SET-ID" + metadata = { + "PK": f"testrun#{test_run_id}", + "SK": "metadata", + # Already terminal, so the status-refresh branch is skipped and we fall + # straight through to the "no cached metrics" else branch. + "Status": "COMPLETE", + "TestSetId": "set-1", + "TestSetName": "big-classification-set", + "FilesCount": 3463, + "CompletedFiles": 3460, + "FailedFiles": 3, + "CreatedAt": "2025-01-01T00:00:00Z", + "Context": "ctx", + "ConfigVersion": "v7", + # No "testRunResult" key -> aggregation hasn't written metrics yet. + } + + mock_table = Mock() + mock_table.get_item.return_value = {"Item": metadata} + + with ( + patch.dict(os.environ, {"TRACKING_TABLE": "tracking"}), + patch.object(index.dynamodb, "Table", return_value=mock_table), + ): + result = index.get_test_results(test_run_id) + + assert result["testRunId"] == test_run_id + # Reports the true terminal status rather than fabricating one. + assert result["status"] == "COMPLETE" + assert result["filesCount"] == 3463 + assert result["completedFiles"] == 3460 + assert result["failedFiles"] == 3 + assert result["testSetId"] == "set-1" + assert result["configVersion"] == "v7" + # Metric fields are absent (not yet computed) but must not be required. + assert "overallAccuracy" not in result or result["overallAccuracy"] is None + + @pytest.mark.unit def test_handler_field_routing(): """Test GraphQL field routing""" diff --git a/nested/appsync/src/lambda/test_results_resolver/index.py b/nested/appsync/src/lambda/test_results_resolver/index.py index 545acf690..45fb0ffb5 100644 --- a/nested/appsync/src/lambda/test_results_resolver/index.py +++ b/nested/appsync/src/lambda/test_results_resolver/index.py @@ -408,16 +408,36 @@ def get_test_results(test_run_id): "config": _get_test_run_config(test_run_id), } else: - # Provide more specific message for ABORTED status + # No aggregate metrics have been cached yet. This happens when all + # files finished processing but the evaluation aggregation step hasn't + # written testRunResult (still running, or it timed out / failed on a + # large run). Don't raise — that surfaces as an opaque error and the UI + # spins on "Loading..." forever. Return a structured partial TestRun so + # the UI can render the in-progress status instead. if current_status == "ABORTED": - raise ValueError( - f"Test run {test_run_id} aborted, evaluating results for completed documents" + logger.info( + f"Test run {test_run_id} aborted; aggregate metrics not yet available" ) else: - raise ValueError( - f"Test run {test_run_id} processing completed, evaluating results" + logger.info( + f"Test run {test_run_id} processing complete; " + "aggregate metrics not yet available (evaluation in progress)" ) + return { + "testRunId": test_run_id, + "testSetId": metadata.get("TestSetId"), + "testSetName": metadata.get("TestSetName"), + "status": current_status, + "filesCount": metadata.get("FilesCount", 0), + "completedFiles": metadata.get("CompletedFiles", 0), + "failedFiles": metadata.get("FailedFiles", 0), + "createdAt": _format_datetime(metadata.get("CreatedAt")), + "completedAt": _format_datetime(metadata.get("CompletedAt")), + "context": metadata.get("Context"), + "configVersion": metadata.get("ConfigVersion"), + } + def _query_test_runs_from_gsi(table, start_iso, end_iso): """Query test runs from TypeDateIndex GSI instead of scanning the full table.