Skip to content

Commit cfc1aa1

Browse files
rgarcia and claude committed
feat: opt-in control-plane fallback for browser_gone 404s
Replace the prior broad-trigger fallback (fall back on any 5xx for any idempotent GET) with a tighter, opt-in design that assumes the paired metro-api change (kernel#2317): a routed request against a deleted/gone browser returns HTTP 404 with body {"code":"browser_gone"}.

The routing layer now keeps a small registry of fallback-ELIGIBLE routed paths (subresource + suffix), default-OFF for everything else. Only the prospective GET /browsers/{id}/telemetry/events endpoint is pre-registered; adding future eligible endpoints is a one-line edit.

Kernel.request / AsyncKernel.request fall back to the control plane IFF the request was actually routed to the VM, the method is GET, the routed path is eligible, and the VM returned a 404 whose JSON body code == "browser_gone". On fallback we evict the cached route and re-issue the ORIGINAL request to the control plane exactly once (CP URL, Authorization restored, jwt param dropped). Transient 5xx, connection errors, other 4xx, success, non-eligible paths, POSTs, and non-routed requests all propagate unchanged.

This PR intentionally does NOT modify the default routing subresource list.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 903fe13 commit cfc1aa1

3 files changed

Lines changed: 379 additions & 2 deletions

File tree

src/kernel/_client.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
is_mapping_t,
2626
get_async_library,
2727
)
28-
from ._compat import cached_property
28+
from ._compat import model_copy, cached_property
2929
from ._models import FinalRequestOptions
3030
from ._version import __version__
3131
from ._streaming import Stream as Stream, AsyncStream as AsyncStream
@@ -39,8 +39,10 @@
3939
BrowserRouteCache,
4040
BrowserRoutingConfig,
4141
strip_direct_vm_auth,
42+
response_is_browser_gone,
4243
rewrite_direct_vm_options,
4344
browser_routing_config_from_env,
45+
fallback_session_id_for_options,
4446
maybe_evict_browser_route_from_response,
4547
maybe_populate_browser_route_cache_from_response,
4648
)
@@ -304,6 +306,35 @@ def default_headers(self) -> dict[str, str | Omit]:
304306
**self._custom_headers,
305307
}
306308

309+
@override
def request(
    self,
    cast_to: Type[ResponseT],
    options: FinalRequestOptions,
    *,
    stream: bool = False,
    stream_cls: type[Stream[Any]] | None = None,
) -> Any:
    """Issue a request, falling back to the control plane on `browser_gone`.

    The request is first sent through the normal path (`super().request`).
    If that raises an `APIStatusError` AND the original options were eligible
    for fallback (see `fallback_session_id_for_options`) AND the error
    response is a `browser_gone` 404 (see `response_is_browser_gone`), the
    cached route for that session is evicted and the original request is
    re-issued exactly once.

    Args:
        cast_to: Type the response should be parsed into.
        options: Request options as supplied by the caller (pre-rewrite).
        stream: Whether to stream the response.
        stream_cls: Stream class to use when `stream` is true.

    Returns:
        Whatever `super().request` returns for the first successful attempt.

    Raises:
        APIStatusError: Re-raised unchanged when the request is not eligible
            for fallback or the error is not a `browser_gone` 404. Errors
            from the fallback request itself propagate as-is.
    """
    # Capture the ORIGINAL (pre-rewrite) options so that, if the routed VM
    # reports the browser is gone, we can re-issue the exact same request to
    # the control plane. `super().request` rewrites these to target the VM.
    original_options = model_copy(options)
    fallback_session_id = fallback_session_id_for_options(
        original_options, cache=self.browser_route_cache, config=self._browser_routing
    )
    try:
        return super().request(cast_to, options, stream=stream, stream_cls=stream_cls)
    except APIStatusError as err:
        if fallback_session_id is None or not response_is_browser_gone(err.response):
            raise

    # Reached only when the handler above swallowed a browser_gone 404. The
    # fallback is issued OUTSIDE the `except` block on purpose: an error from
    # the control plane should surface on its own instead of being implicitly
    # chained ("During handling of the above exception...") to the VM's 404.
    #
    # The browser is authoritatively gone: evict its cached route so the
    # re-issued request is NOT rewritten back to the (dead) VM, then hit
    # the control plane exactly once with the original request. The route
    # is gone, so `_prepare_options` is a no-op, Authorization is restored
    # by the normal auth flow, and the jwt query param is dropped.
    self.browser_route_cache.delete(fallback_session_id)
    return super().request(cast_to, model_copy(original_options), stream=stream, stream_cls=stream_cls)
337+
307338
@override
308339
def _prepare_options(self, options: Any) -> Any:
309340
options = cast(Any, super()._prepare_options(options))
@@ -635,6 +666,37 @@ def default_headers(self) -> dict[str, str | Omit]:
635666
**self._custom_headers,
636667
}
637668

669+
@override
async def request(
    self,
    cast_to: Type[ResponseT],
    options: FinalRequestOptions,
    *,
    stream: bool = False,
    stream_cls: type[AsyncStream[Any]] | None = None,
) -> Any:
    """Issue a request, falling back to the control plane on `browser_gone`.

    Async counterpart of the sync client's `request`. The request is first
    sent through the normal path (`super().request`). If that raises an
    `APIStatusError` AND the original options were eligible for fallback
    (see `fallback_session_id_for_options`) AND the error response is a
    `browser_gone` 404 (see `response_is_browser_gone`), the cached route
    for that session is evicted and the original request is re-issued
    exactly once.

    Args:
        cast_to: Type the response should be parsed into.
        options: Request options as supplied by the caller (pre-rewrite).
        stream: Whether to stream the response.
        stream_cls: Async stream class to use when `stream` is true.

    Returns:
        Whatever `super().request` returns for the first successful attempt.

    Raises:
        APIStatusError: Re-raised unchanged when the request is not eligible
            for fallback or the error is not a `browser_gone` 404. Errors
            from the fallback request itself propagate as-is.
    """
    # Capture the ORIGINAL (pre-rewrite) options so that, if the routed VM
    # reports the browser is gone, we can re-issue the exact same request to
    # the control plane. `super().request` rewrites these to target the VM.
    original_options = model_copy(options)
    fallback_session_id = fallback_session_id_for_options(
        original_options, cache=self.browser_route_cache, config=self._browser_routing
    )
    try:
        return await super().request(cast_to, options, stream=stream, stream_cls=stream_cls)
    except APIStatusError as err:
        if fallback_session_id is None or not response_is_browser_gone(err.response):
            raise

    # Reached only when the handler above swallowed a browser_gone 404. The
    # fallback is issued OUTSIDE the `except` block on purpose: an error from
    # the control plane should surface on its own instead of being implicitly
    # chained ("During handling of the above exception...") to the VM's 404.
    #
    # The browser is authoritatively gone: evict its cached route so the
    # re-issued request is NOT rewritten back to the (dead) VM, then hit
    # the control plane exactly once with the original request. The route
    # is gone, so `_prepare_options` is a no-op, Authorization is restored
    # by the normal auth flow, and the jwt query param is dropped.
    self.browser_route_cache.delete(fallback_session_id)
    return await super().request(
        cast_to, model_copy(original_options), stream=stream, stream_cls=stream_cls
    )
699+
638700
@override
639701
async def _prepare_options(self, options: Any) -> Any:
640702
options = cast(Any, await super()._prepare_options(options))

src/kernel/lib/browser_routing/routing.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,54 @@ class BrowserRoutingConfig:
3737
_BROWSER_POOL_ACQUIRE_PATH = re.compile(r"^/(?:v\d+/)?browser_pools/[^/]+/acquire/?$")
3838
_BROWSER_POOL_RELEASE_PATH = re.compile(r"^/(?:v\d+/)?browser_pools/[^/]+/release/?$")
3939

40+
# Body code returned by the VM proxy (metro-api, kernel#2317) when a routed
# request targets a DELETED/GONE browser. There is intentionally no special
# response header: we key off this body code only. A live VM's own 404s do not
# carry this code, and transient/real upstream failures return 5xx instead.
BROWSER_GONE_CODE = "browser_gone"


# Registry of routed paths that are ELIGIBLE for control-plane fallback when the
# VM reports the browser is gone (404 + code == "browser_gone"). Eligibility is
# expressed against the parsed routed path as (subresource, suffix). Everything
# not listed here is default-OFF: a browser_gone 404 on a non-eligible path
# propagates unchanged. Adding a future eligible endpoint is a one-line edit.
_FALLBACK_ELIGIBLE_ROUTED_PATHS: frozenset[tuple[str, str]] = frozenset(
    {
        # PROSPECTIVE: GET /browsers/{id}/telemetry/events. The pull endpoint /
        # `telemetry.events(...)` method does NOT exist yet; this pre-registers
        # the opt-in so control-plane fallback works the moment that method
        # ships, with no further routing-layer changes required.
        ("telemetry", "/events"),
    }
)
60+
61+
62+
def is_fallback_eligible_routed_path(subresource: str, suffix: str) -> bool:
    """Return True if a routed path opted into control-plane fallback.

    `subresource` and `suffix` are the components produced by
    `match_direct_vm_path` for a `/browsers/{id}/{subresource}{suffix}` URL.

    The check is an exact `(subresource, suffix)` tuple lookup in the
    eligibility registry: no normalization is applied here, so a suffix that
    differs in any character (e.g. a trailing slash) is NOT eligible.
    """
    return (subresource, suffix) in _FALLBACK_ELIGIBLE_ROUTED_PATHS
69+
70+
71+
def response_is_browser_gone(response: httpx.Response) -> bool:
    """Return True iff `response` is a 404 whose JSON body has code == "browser_gone".

    Safe to call with any response: a non-404 status returns False without
    touching the body. For a 404 the body is read defensively; any
    parse/shape problem is treated as "not browser_gone" so the original
    response propagates unchanged.
    """
    if response.status_code != 404:
        return False
    try:
        body = response.json()
    except Exception:
        # Deliberately broad (best-effort probe): whatever goes wrong while
        # decoding the body, the answer is simply "not browser_gone" and the
        # caller re-raises the original error.
        return False
    if not isinstance(body, Mapping):
        # Valid JSON but not an object (list, string, number, null).
        return False
    return cast(Mapping[object, object], body).get("code") == BROWSER_GONE_CODE
87+
4088

4189
def browser_routing_config_from_env() -> BrowserRoutingConfig:
4290
raw = os.environ.get("KERNEL_BROWSER_ROUTING_SUBRESOURCES")
@@ -216,6 +264,42 @@ def rewrite_direct_vm_options(
216264
return rewritten
217265

218266

267+
def fallback_session_id_for_options(
    options: FinalRequestOptions,
    *,
    cache: BrowserRouteCache,
    config: BrowserRoutingConfig,
) -> str | None:
    """Return the session id to fall back for, or None if not eligible.

    Decides — from the ORIGINAL (pre-rewrite) request options — whether a
    control-plane fallback is permitted. All must hold:
      1. the request was actually routed to the VM (allowlisted subresource +
         a cached route exists for the session);
      2. the HTTP method is GET;
      3. the routed path is in the fallback-eligible registry.

    The caller is still responsible for confirming the VM returned a
    browser_gone 404 before acting on the returned session id.
    """
    # Method comparison is case-insensitive; anything but GET is never eligible.
    if options.method.upper() != "GET":
        return None

    match = match_direct_vm_path(options.url)
    if match is None:
        return None

    session_id, subresource, suffix = match
    # Direct membership test: no need to build a throwaway set for one lookup.
    if subresource not in config.subresources:
        return None
    # No cached route means the request would not have been routed to a VM.
    if cache.get(session_id) is None:
        return None
    if not is_fallback_eligible_routed_path(subresource, suffix):
        return None

    return session_id
301+
302+
219303
def strip_direct_vm_auth(request: httpx.Request, *, cache: BrowserRouteCache) -> None:
220304
raw = str(request.url)
221305
for route in cache.values():

0 commit comments

Comments
 (0)