fix(managed_client): wait for result ready on sync query path (#28)

eddietejeda · web-flow · commit 83ca2f2de82d · 2026-06-26T23:45:36.000-07:00
* fix(managed_client): wait for result ready on sync query path

fetch_table fetched the persisted result as Arrow using the result_id
from a synchronous QueryResponse without waiting for it to reach 'ready'.
Against the live async backend the result is often still 'processing',
so Arrow fetches failed on every read-modify-write (merge/append) and
state read. The async path already waited; the sync path now does too.

Adds a regression test driving the sync path with a 'processing' ->
'ready' result, asserting Arrow is fetched only after readiness.

* refactor(managed_client): dedupe run/result polling, group constants

Extract a single typed _poll() helper shared by the query-run and
result-ready waits (removing two near-identical loops and the magic
0.5/0.3 sleep intervals), flatten _query_database_scoped via
_await_query_run, and hoist the class constants to the top. No functional
change beyond a unified 0.4s poll interval and reworded timeout messages;
verified by the unit suite and a full prod e2e re-run.

---------

Co-authored-by: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com>
diff --git a/hotdata_framework/managed_client.py b/hotdata_framework/managed_client.py
@@ -9,7 +9,7 @@
 
 import time
 from collections.abc import Callable
-from typing import Any, TypeVar
+from typing import Any, Protocol, TypeVar
 
 import pyarrow as pa
 from hotdata.api.query_api import QueryApi
@@ -30,6 +30,16 @@
 T = TypeVar("T")
 
 
+class _StatusResponse(Protocol):
+    """Async resources (query runs, results) expose a status and error message."""
+
+    status: str
+    error_message: str | None
+
+
+S = TypeVar("S", bound=_StatusResponse)
+
+
 class ManagedDatabaseClient:
     """Managed-database client with bounded retries over hotdata-framework.
 
@@ -39,6 +49,10 @@ class ManagedDatabaseClient:
     database lifecycle.
     """
 
+    _QUERY_TIMEOUT_SECONDS = 300.0
+    _POLL_INTERVAL_SECONDS = 0.4
+    _MAX_BACKOFF_SECONDS = 30.0
+
     def __init__(
         self,
         *,
@@ -100,49 +114,62 @@ def operation() -> pa.Table | None:
 
         return self._request_with_retry(operation)
 
-    _QUERY_TIMEOUT_SECONDS = 300.0
+    def _poll(
+        self,
+        fetch: Callable[[], S],
+        *,
+        is_ready: Callable[[S], bool],
+        describe: str,
+    ) -> S:
+        """Poll ``fetch`` until ``is_ready`` is satisfied, or raise on failure/timeout.
+
+        ``failed``/``cancelled`` statuses raise ``RuntimeError``; exceeding
+        :attr:`_QUERY_TIMEOUT_SECONDS` raises ``TimeoutError``.
+        """
+        deadline = time.monotonic() + self._QUERY_TIMEOUT_SECONDS
+        while time.monotonic() < deadline:
+            obj = fetch()
+            if obj.status in ("failed", "cancelled"):
+                raise RuntimeError(obj.error_message or f"{describe} {obj.status}")
+            if is_ready(obj):
+                return obj
+            time.sleep(self._POLL_INTERVAL_SECONDS)
+        raise TimeoutError(f"{describe} timed out after {self._QUERY_TIMEOUT_SECONDS}s")
 
     def _query_database_scoped(self, sql: str, *, database_id: str) -> str | None:
         raw = QueryApi(self._runtime.api).query(
             QueryRequest(sql=sql),
             x_database_id=database_id,
         )
         if isinstance(raw, QueryResponse):
-            return raw.result_id
-
+            # A synchronous response still persists its full result out-of-band
+            # under ``result_id``; that result may be ``processing`` when the
+            # inline preview returns, so wait for ``ready`` before the caller
+            # fetches it as Arrow.
+            return self._wait_result_ready(raw.result_id)
         if isinstance(raw, AsyncQueryResponse):
-            runs = QueryRunsApi(self._runtime.api)
-            deadline = time.monotonic() + self._QUERY_TIMEOUT_SECONDS
-            result_id: str | None = None
-            while time.monotonic() < deadline:
-                run = runs.get_query_run(raw.query_run_id)
-                if run.status == "succeeded":
-                    result_id = run.result_id
-                    break
-                if run.status in ("failed", "cancelled"):
-                    raise RuntimeError(run.error_message or f"Query {run.status}")
-                time.sleep(0.5)
-            else:
-                raise TimeoutError(
-                    f"Managed database query timed out after {self._QUERY_TIMEOUT_SECONDS}s"
-                )
-            return self._wait_result_ready(result_id)
-
+            return self._wait_result_ready(self._await_query_run(raw.query_run_id))
         return None
 
+    def _await_query_run(self, query_run_id: str) -> str | None:
+        runs = QueryRunsApi(self._runtime.api)
+        run = self._poll(
+            lambda: runs.get_query_run(query_run_id),
+            is_ready=lambda r: r.status == "succeeded",
+            describe="Query",
+        )
+        return run.result_id
+
     def _wait_result_ready(self, result_id: str | None) -> str | None:
         if result_id is None:
             return None
         results = ResultsApi(self._runtime.api)
-        deadline = time.monotonic() + self._QUERY_TIMEOUT_SECONDS
-        while time.monotonic() < deadline:
-            r = results.get_result(result_id)
-            if r.status == "ready":
-                return result_id
-            if r.status in ("failed", "cancelled"):
-                raise RuntimeError(r.error_message or f"Result {r.status}")
-            time.sleep(0.3)
-        raise TimeoutError(f"Result {result_id} not ready after {self._QUERY_TIMEOUT_SECONDS}s")
+        self._poll(
+            lambda: results.get_result(result_id),
+            is_ready=lambda r: r.status == "ready",
+            describe=f"Result {result_id}",
+        )
+        return result_id
 
     def fetch_table_rows(self, *, database: str, schema: str, table: str) -> list[dict[str, Any]]:
         result = self.fetch_table(database=database, schema=schema, table=table)
@@ -168,8 +195,6 @@ def load_managed_table(
             )
         )
 
-    _MAX_BACKOFF_SECONDS = 30.0
-
     def _request_with_retry(self, operation: Callable[[], T]) -> T:
         for attempt in range(1, self._max_retries + 1):
             try:
diff --git a/tests/test_managed_client.py b/tests/test_managed_client.py
@@ -0,0 +1,93 @@
+"""Regression tests for ManagedDatabaseClient result handling."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any
+
+import pyarrow as pa
+import pytest
+from hotdata.models.query_response import QueryResponse
+
+import hotdata_framework.managed_client as mc
+
+
+def _query_response(result_id: str) -> QueryResponse:
+    return QueryResponse(
+        columns=[],
+        rows=[],
+        row_count=0,
+        preview_row_count=0,
+        truncated=False,
+        nullable=[],
+        result_id=result_id,
+        query_run_id="qr",
+        execution_time_ms=1,
+    )
+
+
+def test_fetch_table_waits_for_ready_before_arrow(monkeypatch: pytest.MonkeyPatch) -> None:
+    """A synchronous ``QueryResponse`` persists its full result out-of-band, and
+    that result can still be ``processing`` when the inline preview returns.
+
+    ``fetch_table`` must poll the result to ``ready`` before fetching it as
+    Arrow. The earlier bug returned the ``result_id`` immediately on the sync
+    path, so Arrow was fetched against a ``processing`` result and failed.
+    """
+    calls: list[str] = []
+
+    class FakeQueryApi:
+        def __init__(self, api: object) -> None:
+            pass
+
+        def query(self, request: object, *, x_database_id: str) -> QueryResponse:
+            calls.append("query")
+            return _query_response("rslt1")
+
+    statuses = iter(["processing", "processing", "ready"])
+
+    class FakeResultsApi:
+        def __init__(self, api: object) -> None:
+            pass
+
+        def get_result(self, result_id: str) -> Any:
+            status = next(statuses)
+            calls.append(f"get_result:{status}")
+            return SimpleNamespace(status=status, result_id=result_id, error_message=None)
+
+    class FakeArrowResultsApi:
+        def __init__(self, api: object) -> None:
+            pass
+
+        def get_result_arrow(self, result_id: str) -> pa.Table:
+            calls.append("arrow")
+            return pa.table({"id": [1, 2]})
+
+    monkeypatch.setattr(mc, "QueryApi", FakeQueryApi)
+    monkeypatch.setattr(mc, "ResultsApi", FakeResultsApi)
+    monkeypatch.setattr(mc, "ArrowResultsApi", FakeArrowResultsApi)
+    monkeypatch.setattr(mc.time, "sleep", lambda _seconds: None)
+
+    client = mc.ManagedDatabaseClient(
+        api_key="k",
+        workspace_id="w",
+        api_base_url="https://example.test",
+        max_retries=1,
+        retry_backoff_seconds=0.0,
+    )
+    client._runtime = SimpleNamespace(  # type: ignore[assignment]
+        api=object(),
+        resolve_managed_database=lambda name: SimpleNamespace(id="db1", default_connection_id="c"),
+        list_managed_tables=lambda database, schema=None: [
+            SimpleNamespace(table="orders", synced=True)
+        ],
+    )
+
+    table = client.fetch_table(database="mydb", schema="public", table="orders")
+
+    assert table is not None
+    assert table.num_rows == 2
+    # The result was polled to readiness, and Arrow was fetched only afterwards.
+    assert "get_result:processing" in calls
+    assert "get_result:ready" in calls
+    assert calls.index("arrow") > calls.index("get_result:ready")