From ebb35621644c4293000b4fc2225768f06c220ade Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 11:24:41 +0200 Subject: [PATCH 01/11] fix: do not try to send request if url is none --- controller/culling.py | 2 +- controller/server_status.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/controller/culling.py b/controller/culling.py index d577b4ed..9e0a0429 100644 --- a/controller/culling.py +++ b/controller/culling.py @@ -67,7 +67,7 @@ def get_js_server_status(js_body): # } res = res.json() except JSONDecodeError as err: - logging.warning(f"Could not parse js server status for {server_url}, because: {err}") + logging.warning(f"Could not parse js server status {res.text} for {server_url}, because: {err}") return None if isinstance(res, dict): diff --git a/controller/server_status.py b/controller/server_status.py index 1de0582d..b8d9fb95 100644 --- a/controller/server_status.py +++ b/controller/server_status.py @@ -284,6 +284,8 @@ def is_unschedulable(self) -> bool: ) or (self.events.get("statefulset", {}).get("message") == config.QUOTA_EXCEEDED_MESSAGE) def server_url_is_eventually_responsive(self, timeout_seconds: int = 5) -> bool: + if self.server_url is None: + return False start = datetime.now() while True: try: From 6c57b479121964695291053d2bfd96d32241b048 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 13:25:03 +0200 Subject: [PATCH 02/11] fix: get server url more reliably --- controller/culling.py | 5 ++++- controller/server_status.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/controller/culling.py b/controller/culling.py index 9e0a0429..b12fd2f8 100644 --- a/controller/culling.py +++ b/controller/culling.py @@ -4,6 +4,7 @@ import requests from requests.exceptions import RequestException +from controller.k8s_resources import get_urls from controller.utils import get_pod_metrics, parse_pod_metrics @@ -43,7 +44,9 @@ def get_js_server_status(js_body): try: server_url = js_body["status"]["create_fn"]["fullServerURL"] except KeyError: - return None + server_url = get_urls(js_body)[1] + if server_url is None or len(server_url) == 0: + return None token = js_body["spec"]["auth"].get("token") payload = {} if not token else {"token": token} diff --git a/controller/server_status.py b/controller/server_status.py index b8d9fb95..35933d7f 100644 --- a/controller/server_status.py +++ b/controller/server_status.py @@ -30,6 +30,7 @@ import requests from controller import config +from controller.k8s_resources import get_urls from controller.server_status_enum import ServerStatusEnum @@ -238,6 +239,9 @@ def from_server_spec( reverse=True, ) hibernated = server.get("spec", {}).get("jupyterServer", {}).get("hibernated", False) + server_url=server.get("status", {}).get("create_fn", {}).get("fullServerURL") + if server_url is None or len(server_url) == 0: + server_url = get_urls(server)[1] return cls( init_statuses=init_container_statuses, statuses=container_statuses, @@ -245,7 +249,7 @@ def from_server_spec( pod_conditions=pod_conditions, deletion_timestamp=deletion_timestamp, events=server.get("status", {}).get("events", {}), - server_url=server.get("status", {}).get("create_fn", {}).get("fullServerURL"), + server_url=server_url, hibernated=hibernated, ) From 155140be3d5bd5882ef8bb855178d46e48438fe6 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 13:36:09 +0200 Subject: [PATCH 03/11] squashme: minor fix --- controller/server_controller.py | 1 - 1 file changed, 1 deletion(-) diff --git a/controller/server_controller.py b/controller/server_controller.py index 04841d79..23bf8cb2 100644 --- a/controller/server_controller.py +++ b/controller/server_controller.py @@ -138,7 +138,6 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): return {"createdResources": children_uids, "fullServerURL": get_urls(spec)[1]} -@kopf.on.delete(config.api_group, config.api_version, config.custom_resource_name) def delete_fn(labels, body, namespace, name, **_): """ The jupyter server has been deleted. From dfd36a5152164c87b61f169b9dae4409edecdc55 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 13:38:53 +0200 Subject: [PATCH 04/11] squashme: rollback some fixes --- controller/server_status.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/controller/server_status.py b/controller/server_status.py index 35933d7f..b6a2a534 100644 --- a/controller/server_status.py +++ b/controller/server_status.py @@ -288,8 +288,6 @@ def is_unschedulable(self) -> bool: ) or (self.events.get("statefulset", {}).get("message") == config.QUOTA_EXCEEDED_MESSAGE) def server_url_is_eventually_responsive(self, timeout_seconds: int = 5) -> bool: - if self.server_url is None: - return False start = datetime.now() while True: try: From c58d542b34d19c5303eb67163975f1d57dfd13a2 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 13:57:07 +0200 Subject: [PATCH 05/11] squashme: minor fix --- controller/culling.py | 2 +- controller/server_status.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/controller/culling.py b/controller/culling.py index b12fd2f8..1d859f4e 100644 --- a/controller/culling.py +++ b/controller/culling.py @@ -44,7 +44,7 @@ def get_js_server_status(js_body): try: server_url = js_body["status"]["create_fn"]["fullServerURL"] except KeyError: - server_url = get_urls(js_body)[1] + server_url = get_urls(js_body["spec"])[1] if server_url is None or len(server_url) == 0: return None diff --git a/controller/server_status.py b/controller/server_status.py index b6a2a534..62eeaa57 100644 --- a/controller/server_status.py +++ b/controller/server_status.py @@ -241,7 +241,7 @@ def from_server_spec( hibernated = server.get("spec", {}).get("jupyterServer", {}).get("hibernated", False) server_url=server.get("status", {}).get("create_fn", {}).get("fullServerURL") if server_url is None or len(server_url) == 0: - server_url = get_urls(server)[1] + server_url = get_urls(server["spec"])[1] return cls( init_statuses=init_container_statuses, statuses=container_statuses, From 005df3fb98e799760307008ffce1d645ed11d372 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 14:33:29 +0200 Subject: [PATCH 06/11] squashme: register deletion before creation --- controller/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/controller/main.py b/controller/main.py index d7d4499d..afe8e610 100644 --- a/controller/main.py +++ b/controller/main.py @@ -34,6 +34,12 @@ def register_jupyter_server_handlers( ) -> kopf.OperatorRegistry: logging.info("Populating regsitry") kopf.on.startup(registry=registry)(configure) + kopf.on.delete( + config.api_group, + config.api_version, + config.custom_resource_name, + registry=registry, + )(delete_fn) kopf.on.create( config.api_group, config.api_version, @@ -43,12 +49,6 @@ def register_jupyter_server_handlers( backoff=config.KOPF_CREATE_BACKOFF, registry=registry, )(create_fn) - kopf.on.delete( - config.api_group, - config.api_version, - config.custom_resource_name, - registry=registry, - )(delete_fn) kopf.on.event( version=config.api_version, kind=config.custom_resource_name, From bbc72985d297de0af711064c8a9744f0a1027b05 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 15:37:02 +0200 Subject: [PATCH 07/11] squashme: add logging --- controller/server_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/controller/server_controller.py b/controller/server_controller.py index 23bf8cb2..c6afde9e 100644 --- a/controller/server_controller.py +++ b/controller/server_controller.py @@ -90,6 +90,7 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): Watch the creation of jupyter server objects and create all the necessary k8s child resources which make the actual jupyter server. """ + logging.info(f"Starting create_fn for resource {name}") api = get_api(config.api_version, config.custom_resource_name, config.api_group) now = datetime.now(UTC).isoformat(timespec="seconds") try: From 0089e360c7f609059acf92d67c25497e8e411776 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 15:48:37 +0200 Subject: [PATCH 08/11] squashme: more logging --- controller/server_controller.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/controller/server_controller.py b/controller/server_controller.py index c6afde9e..834f95ba 100644 --- a/controller/server_controller.py +++ b/controller/server_controller.py @@ -93,6 +93,7 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): logging.info(f"Starting create_fn for resource {name}") api = get_api(config.api_version, config.custom_resource_name, config.api_group) now = datetime.now(UTC).isoformat(timespec="seconds") + logging.info(f"create_fn: got api for resource {name}") try: api.patch( namespace=namespace, @@ -112,6 +113,7 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): ) except NotFoundError: pass + logging.info(f"create_fn: attempted patch for {name}") children_specs = get_children_specs(name, spec, logger) @@ -122,6 +124,7 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): children_specs["statefulset"]["spec"]["template"], labels=get_labels(name, uid, labels, is_main_pod=True), ) + logging.info(f"create_fn: got child specs for {name}") # Add the labels to all child resources and create them in the cluster children_uids = {} @@ -133,10 +136,12 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): labels=get_labels(name, uid, labels, child_key=child_key), ) kopf.adopt(child_spec) - + logging.info(f"create_fn: created namespaced resource for for {child_key} for {name}") children_uids[child_key] = create_namespaced_resource(namespace=namespace, body=child_spec).metadata.uid - - return {"createdResources": children_uids, "fullServerURL": get_urls(spec)[1]} + + output = {"createdResources": children_uids, "fullServerURL": get_urls(spec)[1]} + logging.info(f"create_fn: completed for {name}") + return output def delete_fn(labels, body, namespace, name, **_): From abf19e283215df3fc003a3a7ccba3e1ab2755ef1 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 16:04:08 +0200 Subject: [PATCH 09/11] more logging --- controller/server_controller.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/controller/server_controller.py b/controller/server_controller.py index 834f95ba..32738fe6 100644 --- a/controller/server_controller.py +++ b/controller/server_controller.py @@ -67,8 +67,10 @@ def create_namespaced_resource(namespace, body): Create a k8s resource given the namespace and the full resource object. """ api = get_api(body["apiVersion"], body["kind"]) - return api.create(namespace=namespace, body=body) - + logging.info(f"Got the api for {body['kind']} for {body['metadata']['name']}") + res = api.create(namespace=namespace, body=body) + logging.info(f"Created resource {body['kind']} for {body['metadata']['name']}") + return res def configure(logger, settings, **_): """ @@ -136,7 +138,7 @@ def create_fn(labels, logger, name, namespace, spec, uid, body, **_): labels=get_labels(name, uid, labels, child_key=child_key), ) kopf.adopt(child_spec) - logging.info(f"create_fn: created namespaced resource for for {child_key} for {name}") + logging.info(f"create_fn: created namespaced resource for {child_key} for {name}") children_uids[child_key] = create_namespaced_resource(namespace=namespace, body=child_spec).metadata.uid output = {"createdResources": children_uids, "fullServerURL": get_urls(spec)[1]} From 5598ad67efcbe2144b5707249890b164320d39f0 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 16:31:04 +0200 Subject: [PATCH 10/11] clarify logging for status checks --- controller/culling.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/controller/culling.py b/controller/culling.py index 1d859f4e..78cdc71f 100644 --- a/controller/culling.py +++ b/controller/culling.py @@ -50,15 +50,16 @@ def get_js_server_status(js_body): token = js_body["spec"]["auth"].get("token") payload = {} if not token else {"token": token} + url = f"{server_url.rstrip('/')}/api/status" try: - res = requests.get(f"{server_url.rstrip('/')}/api/status", params=payload) + res = requests.get(url, params=payload) except RequestException as err: - logging.warning(f"Could not get js server status for {server_url}, because: {err}") + logging.warning(f"Could not get js server status for {url}, because: {err}") return None if res.status_code != 200: - logging.warning(f"Could not get js server status for {server_url}, response status code is {res.status_code}") + logging.warning(f"Could not get js server status for {url}, response status code is {res.status_code}") return None try: @@ -70,7 +71,7 @@ def get_js_server_status(js_body): # } res = res.json() except JSONDecodeError as err: - logging.warning(f"Could not parse js server status {res.text} for {server_url}, because: {err}") + logging.warning(f"Could not parse js server status {res.text} for {url}, because: {err}") return None if isinstance(res, dict): From 782b1a655e555e113226b0414b82cf38c9c46e9a Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Tue, 12 Aug 2025 17:24:22 +0200 Subject: [PATCH 11/11] squashme: print shorter payload --- controller/culling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controller/culling.py b/controller/culling.py index 78cdc71f..bf383f2f 100644 --- a/controller/culling.py +++ b/controller/culling.py @@ -71,7 +71,7 @@ def get_js_server_status(js_body): # } res = res.json() except JSONDecodeError as err: - logging.warning(f"Could not parse js server status {res.text} for {url}, because: {err}") + logging.warning(f"Could not parse js server status {res.text[:10]} for {url}, because: {err}") return None if isinstance(res, dict):