Skip to content

Commit d97adab

Browse files
committed
kubernetes: surface log-unavailable reason and observability link in placeholder
When log acquisition fails, replace the empty return value with a human-readable message that includes the pod name, namespace, and — when TANGLE_LOG_SEARCH_URL_TEMPLATE is set — a direct link to the pod's logs in the configured observability platform. The URL template supports two placeholders that are substituted at runtime: {pod_name} — the Kubernetes pod name; and {start_time} — a relative start derived from started_at (e.g. "now-125m", which includes 5 min of padding), falling back to "now-1440m" (24 h) if the start time is not available in memory. Both started_at values (LaunchedKubernetesContainer from pod container state, LaunchedKubernetesJob from job status) are in-memory reads — no additional database queries are required to compute the time range. The placeholder is stored in GCS via upload_log and returned verbatim by the log-read API, so it surfaces wherever logs are displayed without any frontend or schema changes.
1 parent 27b57aa commit d97adab

1 file changed

Lines changed: 43 additions & 2 deletions

File tree

cloud_pipelines_backend/launchers/kubernetes_launchers.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,43 @@
6969
# Environment variables for multi-node execution.
7070
_MULTI_NODE_NODE_INDEX_ENV_VAR_NAME = "_TANGLE_MULTI_NODE_NODE_INDEX"
7171

72+
# Optional URL template for linking to pod logs in an external observability
73+
# platform when log acquisition fails. Set via TANGLE_LOG_SEARCH_URL_TEMPLATE.
74+
# Two placeholders are substituted at runtime:
75+
# {pod_name} — the Kubernetes pod name
76+
# {start_time} — relative start time, e.g. "now-125m" (elapsed + 5 min pad),
77+
# or "now-1440m" (24 h) when the pod start time is unavailable.
78+
# Example for Observe: see oasis-backend deployment config.
79+
_LOG_SEARCH_URL_TEMPLATE: str | None = os.environ.get("TANGLE_LOG_SEARCH_URL_TEMPLATE")
80+
81+
82+
def _format_log_unavailable_message(
    pod_name: str,
    namespace: str,
    started_at: "datetime.datetime | None",
) -> str:
    """Return a placeholder log string when the Kubernetes API cannot be read.

    The message always names the pod and its namespace. When
    ``_LOG_SEARCH_URL_TEMPLATE`` is set to a non-empty value, a search URL is
    appended, built by substituting ``{pod_name}`` and ``{start_time}`` in
    the template.

    Args:
        pod_name: Name of the pod whose log could not be read.
        namespace: Namespace the pod belongs to.
        started_at: When the pod started, or ``None`` if unknown. A naive
            datetime is interpreted as UTC.

    Returns:
        A bracketed, newline-terminated message. ``{start_time}`` is rendered
        as ``now-<N>m`` where N is the whole minutes elapsed since
        ``started_at`` plus 5 (never less than 1), or ``now-1440m`` when
        ``started_at`` is ``None``.
    """
    msg = (
        f"[Log unavailable: Kubernetes API returned a malformed response. "
        f"Pod: {pod_name}, Namespace: {namespace}."
    )
    if _LOG_SEARCH_URL_TEMPLATE:
        if started_at is not None:
            if started_at.tzinfo is None or started_at.utcoffset() is None:
                # Subtracting a naive datetime from an aware one raises
                # TypeError. This helper runs on a failure path, so it must
                # not raise itself; treat naive timestamps as UTC.
                started_at = started_at.replace(tzinfo=datetime.timezone.utc)
            elapsed_seconds = (
                datetime.datetime.now(tz=datetime.timezone.utc) - started_at
            ).total_seconds()
            # Add 5-minute padding so the window opens before the first log line.
            # max() guards against a start time in the future (clock skew).
            elapsed_minutes = max(1, int(elapsed_seconds / 60) + 5)
            start_time = f"now-{elapsed_minutes}m"
        else:
            start_time = "now-1440m"  # 24 h fallback when start time is unknown
        url = _LOG_SEARCH_URL_TEMPLATE.replace("{pod_name}", pod_name).replace(
            "{start_time}", start_time
        )
        msg += f" Search: {url}"
    msg += "]\n"
    return msg
108+
72109

73110
# Unconstrained type variable for generic annotations in this module.
_T = typing.TypeVar("_T")
74111

@@ -944,7 +981,9 @@ def get_log(self) -> str:
944981
self._pod_name,
945982
exc_info=True,
946983
)
947-
return ""
984+
return _format_log_unavailable_message(
985+
self._pod_name, self._namespace, self.started_at
986+
)
948987

949988
def upload_log(self):
950989
launcher = self._get_launcher()
@@ -1528,7 +1567,9 @@ def _get_log_by_pod_key(self, pod_name: str) -> str | None:
15281567
pod_name,
15291568
exc_info=True,
15301569
)
1531-
return None
1570+
return _format_log_unavailable_message(
1571+
pod_name, self._namespace, self.started_at
1572+
)
15321573

15331574
def _get_all_logs(self) -> dict[str, str]:
15341575
logs = {}

0 commit comments

Comments
 (0)