Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ SPDX-License-Identifier: MIT-0

### Fixed

- **Test Studio results error for runs stuck in evaluation (#358)** — `getTestRun` (the `test_results_resolver` Lambda) raised an unhandled `ValueError` ("Test run … processing completed, evaluating results") when a run reached a terminal state but the evaluation-aggregation step never cached `testRunResult` — e.g. when aggregation was still running, had timed out, or had failed silently on a large run (the reporter hit this with 3,463 documents). The exception surfaced as an opaque error and the run spun on "Loading…" forever in Test Studio. The resolver now returns a structured partial `TestRun` (true status plus file counts and metadata, metric fields omitted) instead of raising, so the UI renders the in-progress/terminal state gracefully. This also stops a single not-yet-aggregated run from failing an entire `compareTestRuns` request. (The separate question of *why* aggregation can stall on very large runs is tracked as a follow-up.)
- **Configuration version list silently truncated past the first page (#354)** — `ConfigurationManager.list_config_versions()` performed a single unpaginated `table.scan()` on the ConfigurationTable. Because a DynamoDB scan returns at most 1 MB per call, deployments with many config versions (e.g. 230+) only ever saw the ~58 that fit on the first page — uploaded-via-CLI and autotune-agent configs were invisible in the UI's View/Edit Configuration page and the upload-document config-version dropdown (the configs still worked when referenced by name). The method now paginates through `LastEvaluatedKey` so every version is returned. Fixes all callers (`update_configuration`, the AppSync `configuration_resolver`, `rules_discovery`, and the SDK).

- **Build Info "update available" indicator broke against the public release bucket** — The `getLatestPublishedVersion` resolver discovered the newest published version by calling `ListObjectsV2` on the public artifacts bucket and parsing `idp-main_<version>.yaml` keys. That bucket grants `GetObject` only (no listing), so the check failed on real public deployments. `idp-cli publish` now writes a small pointer object — `<prefix>/idp-main-latest.json` (`{version, templateUrl}`) — at the version-stripped prefix on every release, and the resolver reads that one known key with a single `GetObject` (unsigned, falling back to signed), with a conventional `idp-main_<version>.yaml` URL fallback if the pointer omits one. The resolver no longer parses versions out of object keys or calls `ListObjectsV2`. The check stays disabled when `PUBLIC_ARTIFACTS_BUCKET` is unset.
Expand Down
45 changes: 45 additions & 0 deletions lib/idp_common_pkg/tests/unit/test_results_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,51 @@ def test_build_config_comparison():
assert "temperature" in [item["setting"] for item in config_diff]


@pytest.mark.unit
def test_get_test_results_missing_metrics_returns_partial_not_raises():
    """Regression test for issue #358: partial TestRun instead of ValueError.

    The stored run is already in a terminal state but has no cached
    testRunResult (evaluation aggregation timed out / failed silently on a
    large run). get_test_results must hand back a structured partial TestRun
    rather than raising an opaque ValueError that leaves the UI spinning on
    "Loading...".
    """
    run_id = "TEST-SET-ID"

    # The status is already terminal, so the status-refresh branch is skipped;
    # the absence of a "testRunResult" key (aggregation has not written metrics
    # yet) then routes the call into the "no cached metrics" else branch.
    stored_item = {
        "PK": f"testrun#{run_id}",
        "SK": "metadata",
        "Status": "COMPLETE",
        "TestSetId": "set-1",
        "TestSetName": "big-classification-set",
        "FilesCount": 3463,
        "CompletedFiles": 3460,
        "FailedFiles": 3,
        "CreatedAt": "2025-01-01T00:00:00Z",
        "Context": "ctx",
        "ConfigVersion": "v7",
    }

    tracking_table = Mock()
    tracking_table.get_item.return_value = {"Item": stored_item}

    env_patch = patch.dict(os.environ, {"TRACKING_TABLE": "tracking"})
    table_patch = patch.object(
        index.dynamodb, "Table", return_value=tracking_table
    )
    with env_patch, table_patch:
        partial_run = index.get_test_results(run_id)

    # The status must be the true terminal one, not a fabricated value, and
    # the counts/metadata must be carried over from the stored item.
    expected_fields = {
        "testRunId": run_id,
        "status": "COMPLETE",
        "filesCount": 3463,
        "completedFiles": 3460,
        "failedFiles": 3,
        "testSetId": "set-1",
        "configVersion": "v7",
    }
    for field_name, expected_value in expected_fields.items():
        assert partial_run[field_name] == expected_value

    # Metrics have not been computed yet: the field may be missing or None,
    # but it must not be required.
    assert partial_run.get("overallAccuracy") is None


@pytest.mark.unit
def test_handler_field_routing():
"""Test GraphQL field routing"""
Expand Down
30 changes: 25 additions & 5 deletions nested/appsync/src/lambda/test_results_resolver/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,16 +408,36 @@ def get_test_results(test_run_id):
"config": _get_test_run_config(test_run_id),
}
else:
# Provide more specific message for ABORTED status
# No aggregate metrics have been cached yet. This happens when all
# files finished processing but the evaluation aggregation step hasn't
# written testRunResult (still running, or it timed out / failed on a
# large run). Don't raise — that surfaces as an opaque error and the UI
# spins on "Loading..." forever. Return a structured partial TestRun so
# the UI can render the in-progress status instead.
if current_status == "ABORTED":
raise ValueError(
f"Test run {test_run_id} aborted, evaluating results for completed documents"
logger.info(
f"Test run {test_run_id} aborted; aggregate metrics not yet available"
)
else:
raise ValueError(
f"Test run {test_run_id} processing completed, evaluating results"
logger.info(
f"Test run {test_run_id} processing complete; "
"aggregate metrics not yet available (evaluation in progress)"
)

return {
"testRunId": test_run_id,
"testSetId": metadata.get("TestSetId"),
"testSetName": metadata.get("TestSetName"),
"status": current_status,
"filesCount": metadata.get("FilesCount", 0),
"completedFiles": metadata.get("CompletedFiles", 0),
"failedFiles": metadata.get("FailedFiles", 0),
"createdAt": _format_datetime(metadata.get("CreatedAt")),
"completedAt": _format_datetime(metadata.get("CompletedAt")),
"context": metadata.get("Context"),
"configVersion": metadata.get("ConfigVersion"),
}


def _query_test_runs_from_gsi(table, start_iso, end_iso):
"""Query test runs from TypeDateIndex GSI instead of scanning the full table.
Expand Down
Loading