Skip to content

Commit 82f5c9d

Browse files
committed
fix(gateway): split actor response wait errors
1 parent a4663b0 commit 82f5c9d

12 files changed

Lines changed: 163 additions & 22 deletions

File tree

engine/artifacts/errors/guard.actor_stopped_while_waiting.json

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

engine/artifacts/errors/guard.gateway_response_start_timeout.json

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

engine/artifacts/errors/guard.tunnel_message_timeout.json

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

engine/artifacts/errors/guard.tunnel_request_aborted.json

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

engine/artifacts/errors/guard.tunnel_response_closed.json

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

engine/packages/guard-core/src/errors.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,46 @@ pub struct ConnectionError {
100100
#[error("guard", "service_unavailable", "Service unavailable.")]
101101
pub struct ServiceUnavailable;
102102

103+
#[derive(RivetError, Serialize, Deserialize)]
104+
#[error(
105+
"guard",
106+
"actor_stopped_while_waiting",
107+
"Actor stopped while waiting for a response."
108+
)]
109+
pub struct ActorStoppedWhileWaiting;
110+
111+
#[derive(RivetError, Serialize, Deserialize)]
112+
#[error(
113+
"guard",
114+
"tunnel_request_aborted",
115+
"Actor tunnel aborted the request."
116+
)]
117+
pub struct TunnelRequestAborted;
118+
119+
#[derive(RivetError, Serialize, Deserialize)]
120+
#[error(
121+
"guard",
122+
"tunnel_message_timeout",
123+
"Actor tunnel message timed out."
124+
)]
125+
pub struct TunnelMessageTimeout;
126+
127+
#[derive(RivetError, Serialize, Deserialize)]
128+
#[error(
129+
"guard",
130+
"tunnel_response_closed",
131+
"Actor tunnel closed before sending a response."
132+
)]
133+
pub struct TunnelResponseClosed;
134+
135+
#[derive(RivetError, Serialize, Deserialize)]
136+
#[error(
137+
"guard",
138+
"gateway_response_start_timeout",
139+
"Timed out waiting for actor response start."
140+
)]
141+
pub struct GatewayResponseStartTimeout;
142+
103143
#[derive(RivetError, Serialize, Deserialize)]
104144
#[error(
105145
"guard",

engine/packages/guard-core/src/utils.rs

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ pub(crate) fn err_into_response(err: anyhow::Error) -> Result<Response<ResponseB
179179
("guard", "retry_attempts_exceeded") => StatusCode::BAD_GATEWAY,
180180
("actor", "not_found") => StatusCode::NOT_FOUND,
181181
("guard", "service_unavailable") => StatusCode::SERVICE_UNAVAILABLE,
182+
("guard", "actor_stopped_while_waiting") => StatusCode::SERVICE_UNAVAILABLE,
183+
("guard", "tunnel_request_aborted") => StatusCode::SERVICE_UNAVAILABLE,
184+
("guard", "tunnel_message_timeout") => StatusCode::GATEWAY_TIMEOUT,
185+
("guard", "tunnel_response_closed") => StatusCode::SERVICE_UNAVAILABLE,
186+
("guard", "gateway_response_start_timeout") => StatusCode::GATEWAY_TIMEOUT,
182187
("guard", "actor_ready_timeout") => StatusCode::SERVICE_UNAVAILABLE,
183188
("guard", "no_route") => StatusCode::NOT_FOUND,
184189
("guard", "invalid_request_body") => StatusCode::PAYLOAD_TOO_LARGE,
@@ -218,19 +223,35 @@ pub(crate) fn should_retry_request(res: &Result<Response<ResponseBody>>) -> bool
218223
Ok(resp) => should_retry_request_inner(resp.status(), resp.headers()),
219224
Err(err) => {
220225
if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::<RivetError>()) {
221-
rivet_err.group() == "guard" && rivet_err.code() == "service_unavailable"
226+
rivet_err.group() == "guard" && is_retryable_guard_http_error(rivet_err.code())
222227
} else {
223228
false
224229
}
225230
}
226231
}
227232
}
228233

229-
// Determine if a response should trigger a retry. Guard-specific actor startup
230-
// failures, including guard.actor_ready_timeout, are signaled as 503 with
231-
// x-rivet-error and should be retried against a freshly resolved target.
234+
fn is_retryable_guard_http_error(code: &str) -> bool {
235+
matches!(
236+
code,
237+
"service_unavailable"
238+
| "actor_ready_timeout"
239+
| "actor_stopped_while_waiting"
240+
| "tunnel_request_aborted"
241+
| "tunnel_message_timeout"
242+
| "tunnel_response_closed"
243+
| "gateway_response_start_timeout"
244+
)
245+
}
246+
247+
// Determine if a response should trigger a retry: the status must be 503 or 504 and the x-rivet-error header must name a retryable guard error (`guard.<code>`).
232248
pub(crate) fn should_retry_request_inner(status: StatusCode, headers: &hyper::HeaderMap) -> bool {
233-
status == StatusCode::SERVICE_UNAVAILABLE && headers.contains_key(X_RIVET_ERROR)
249+
(status == StatusCode::SERVICE_UNAVAILABLE || status == StatusCode::GATEWAY_TIMEOUT)
250+
&& headers
251+
.get(X_RIVET_ERROR)
252+
.and_then(|value| value.to_str().ok())
253+
.and_then(|value| value.split_once('.'))
254+
.is_some_and(|(group, code)| group == "guard" && is_retryable_guard_http_error(code))
234255
}
235256

236257
// Determine if a websocket error is retryable (e.g., transient UPS/tunnel issues)

engine/packages/pegboard-gateway/src/lib.rs

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@ use rivet_error::*;
99
use rivet_guard_core::{
1010
ResponseBody, WebSocketHandle,
1111
custom_serve::{CustomServeTrait, HibernationResult},
12-
errors::{ServiceUnavailable, WebSocketServiceUnavailable},
12+
errors::{
13+
ActorStoppedWhileWaiting, GatewayResponseStartTimeout, TunnelMessageTimeout,
14+
TunnelRequestAborted, TunnelResponseClosed, WebSocketServiceUnavailable,
15+
},
1316
request_context::RequestContext,
1417
utils::is_ws_hibernate,
1518
websocket_handle::WebSocketReceiver,
@@ -168,7 +171,7 @@ impl PegboardGateway {
168171
}
169172
protocol::mk2::ToServerTunnelMessageKind::ToServerResponseAbort => {
170173
tracing::warn!("request aborted");
171-
return Err(ServiceUnavailable.build());
174+
return Err(TunnelRequestAborted.build());
172175
}
173176
_ => {
174177
tracing::warn!("received non-response message from pubsub");
@@ -179,21 +182,19 @@ impl PegboardGateway {
179182
request_id=%protocol::util::id_to_string(&request_id),
180183
"received no message response during request init",
181184
);
182-
break;
185+
return Err(TunnelResponseClosed.build());
183186
}
184187
}
185188
_ = stopped_sub.next() => {
186189
tracing::debug!("actor stopped while waiting for request response");
187-
return Err(ServiceUnavailable.build());
190+
return Err(ActorStoppedWhileWaiting.build());
188191
}
189192
_ = drop_rx.changed() => {
190193
tracing::warn!(reason=?drop_rx.borrow(), "tunnel message timeout");
191-
return Err(ServiceUnavailable.build());
194+
return Err(TunnelMessageTimeout.build());
192195
}
193196
}
194197
}
195-
196-
Err(ServiceUnavailable.build())
197198
};
198199
let response_start_timeout = Duration::from_millis(
199200
self.ctx
@@ -206,7 +207,7 @@ impl PegboardGateway {
206207
.map_err(|_| {
207208
tracing::warn!("timed out waiting for response start from runner");
208209

209-
ServiceUnavailable.build()
210+
GatewayResponseStartTimeout.build()
210211
})??;
211212
tracing::debug!("response handler task ended");
212213

engine/packages/pegboard-gateway2/src/lib.rs

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@ use rivet_error::*;
99
use rivet_guard_core::{
1010
ResponseBody, WebSocketHandle,
1111
custom_serve::{CustomServeTrait, HibernationResult},
12-
errors::{ServiceUnavailable, WebSocketServiceUnavailable},
12+
errors::{
13+
ActorStoppedWhileWaiting, GatewayResponseStartTimeout, TunnelMessageTimeout,
14+
TunnelRequestAborted, TunnelResponseClosed, WebSocketServiceUnavailable,
15+
},
1316
request_context::RequestContext,
1417
utils::is_ws_hibernate,
1518
};
@@ -171,7 +174,7 @@ impl PegboardGateway2 {
171174
}
172175
protocol::ToRivetTunnelMessageKind::ToRivetResponseAbort => {
173176
tracing::warn!("request aborted");
174-
return Err(ServiceUnavailable.build());
177+
return Err(TunnelRequestAborted.build());
175178
}
176179
_ => {
177180
tracing::warn!("received non-response message from pubsub");
@@ -182,21 +185,19 @@ impl PegboardGateway2 {
182185
request_id=%protocol::util::id_to_string(&request_id),
183186
"received no message response during request init",
184187
);
185-
break;
188+
return Err(TunnelResponseClosed.build());
186189
}
187190
}
188191
_ = stopped_sub.next() => {
189192
tracing::debug!("actor stopped while waiting for request response");
190-
return Err(ServiceUnavailable.build());
193+
return Err(ActorStoppedWhileWaiting.build());
191194
}
192195
_ = drop_rx.changed() => {
193196
tracing::warn!(reason=?drop_rx.borrow(), "tunnel message timeout");
194-
return Err(ServiceUnavailable.build());
197+
return Err(TunnelMessageTimeout.build());
195198
}
196199
}
197200
}
198-
199-
Err(ServiceUnavailable.build())
200201
}
201202
.instrument(tracing::info_span!("wait_for_tunnel_response"));
202203
let response_start_timeout = Duration::from_millis(
@@ -210,7 +211,7 @@ impl PegboardGateway2 {
210211
.map_err(|_| {
211212
tracing::warn!("timed out waiting for response start from envoy");
212213

213-
ServiceUnavailable.build()
214+
GatewayResponseStartTimeout.build()
214215
})??;
215216
tracing::debug!("response handler task ended");
216217

rivetkit-typescript/packages/rivetkit/src/client/lifecycle-errors.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,21 @@ function classifyActorError(
7676
);
7777
}
7878

79+
if (
80+
error.group === "guard" &&
81+
isRetryableGuardGatewayHttpError(error.code)
82+
) {
83+
return buildLifecycleBoundaryInfo(
84+
"request_retry",
85+
"actor_error",
86+
error.message,
87+
{
88+
group: error.group,
89+
code: error.code,
90+
},
91+
);
92+
}
93+
7994
// TODO(RVT-6193): Remove this legacy match after structured restart errors
8095
// are authoritative everywhere.
8196
if (
@@ -144,6 +159,17 @@ function classifyActorError(
144159
return undefined;
145160
}
146161

162+
function isRetryableGuardGatewayHttpError(code: string): boolean {
163+
return (
164+
code === "service_unavailable" ||
165+
code === "actor_stopped_while_waiting" ||
166+
code === "tunnel_request_aborted" ||
167+
code === "tunnel_message_timeout" ||
168+
code === "tunnel_response_closed" ||
169+
code === "gateway_response_start_timeout"
170+
);
171+
}
172+
147173
function classifyTransportError(
148174
error: Error,
149175
): LifecycleBoundaryInfo | undefined {

0 commit comments

Comments
 (0)