Skip to content

Commit 40d9fef

Browse files
committed
kubernetes: surface log-unavailable reason and observability link in placeholder
When log acquisition fails, replace the empty return value with a human- readable message that includes the pod name, namespace, and — when TANGLE_LOG_SEARCH_URL_TEMPLATE is set — a direct link to the pod's logs in the configured observability platform. The URL template supports two placeholders substituted at runtime: {pod_name} — Kubernetes pod name {start_time} — relative start derived from started_at (e.g. "now-125m", adding 5 min of padding); falls back to "now-1440m" (24 h) if the start time is not available in memory. Both started_at values (LaunchedKubernetesContainer from pod container state, LaunchedKubernetesJob from job status) are in-memory reads — no additional database queries are required to compute the time range. The placeholder is stored in GCS via upload_log and returned verbatim by the log-read API, so it surfaces wherever logs are displayed without any frontend or schema changes.
1 parent 27b57aa commit 40d9fef

1 file changed

Lines changed: 54 additions & 2 deletions

File tree

cloud_pipelines_backend/launchers/kubernetes_launchers.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,54 @@
6969
# Environment variables for multi-node execution.
7070
_MULTI_NODE_NODE_INDEX_ENV_VAR_NAME = "_TANGLE_MULTI_NODE_NODE_INDEX"
7171

72+
# Optional URL template for linking to pod logs in an external observability
73+
# platform when log acquisition fails. Set via TANGLE_LOG_SEARCH_URL_TEMPLATE.
74+
# Three placeholders are substituted at runtime:
75+
# {pod_name} — the Kubernetes pod name
76+
# {start_time} — absolute ISO 8601 UTC timestamp, started_at minus 5 min padding
77+
# (e.g. "2025-06-17T20:24:11.000Z"), or 24 h before now as fallback
78+
# {end_time} — absolute ISO 8601 UTC timestamp, ended_at plus 5 min padding,
79+
# or now when the pod is still running / end time is unavailable
80+
# Example for Observe: see oasis-backend deployment config.
81+
_LOG_SEARCH_URL_TEMPLATE: str | None = os.environ.get("TANGLE_LOG_SEARCH_URL_TEMPLATE")
82+
83+
_OBSERVE_TS_PADDING = datetime.timedelta(minutes=5)
84+
85+
86+
def _to_iso8601_ms(dt: datetime.datetime) -> str:
87+
"""Format a UTC datetime as the ISO 8601 millisecond string Observe expects."""
88+
utc = dt.astimezone(datetime.timezone.utc)
89+
return utc.strftime("%Y-%m-%dT%H:%M:%S.") + f"{utc.microsecond // 1000:03d}Z"
90+
91+
92+
def _format_log_unavailable_message(
93+
pod_name: str,
94+
namespace: str,
95+
started_at: "datetime.datetime | None",
96+
ended_at: "datetime.datetime | None",
97+
) -> str:
98+
"""Return a placeholder log string when the Kubernetes API cannot be read."""
99+
msg = (
100+
f"[Log unavailable: Kubernetes API returned a malformed response. "
101+
f"Pod: {pod_name}, Namespace: {namespace}."
102+
)
103+
if _LOG_SEARCH_URL_TEMPLATE:
104+
now = datetime.datetime.now(tz=datetime.timezone.utc)
105+
from_dt = (
106+
(started_at - _OBSERVE_TS_PADDING)
107+
if started_at
108+
else (now - datetime.timedelta(hours=24))
109+
)
110+
to_dt = (ended_at + _OBSERVE_TS_PADDING) if ended_at else now
111+
url = (
112+
_LOG_SEARCH_URL_TEMPLATE.replace("{pod_name}", pod_name)
113+
.replace("{start_time}", _to_iso8601_ms(from_dt))
114+
.replace("{end_time}", _to_iso8601_ms(to_dt))
115+
)
116+
msg += f" Search: {url}"
117+
msg += "]\n"
118+
return msg
119+
72120

73121
_T = typing.TypeVar("_T")
74122

@@ -944,7 +992,9 @@ def get_log(self) -> str:
944992
self._pod_name,
945993
exc_info=True,
946994
)
947-
return ""
995+
return _format_log_unavailable_message(
996+
self._pod_name, self._namespace, self.started_at, self.ended_at
997+
)
948998

949999
def upload_log(self):
9501000
launcher = self._get_launcher()
@@ -1528,7 +1578,9 @@ def _get_log_by_pod_key(self, pod_name: str) -> str | None:
15281578
pod_name,
15291579
exc_info=True,
15301580
)
1531-
return None
1581+
return _format_log_unavailable_message(
1582+
pod_name, self._namespace, self.started_at, self.ended_at
1583+
)
15321584

15331585
def _get_all_logs(self) -> dict[str, str]:
15341586
logs = {}

0 commit comments

Comments
 (0)