@@ -8,7 +8,9 @@ use super::super::common;
88#[ test]
99fn envoy_reconnects_after_server_side_tcp_reset ( ) {
1010 common:: run (
11- common:: TestOpts :: new ( 1 ) . with_timeout ( 90 ) . with_network_faults ( ) ,
11+ common:: TestOpts :: new ( 1 )
12+ . with_timeout ( 90 )
13+ . with_network_faults ( ) ,
1214 |ctx| async move {
1315 let dc = ctx. leader_dc ( ) ;
1416 let ( namespace, _) = common:: setup_test_namespace ( dc) . await ;
@@ -71,8 +73,9 @@ fn envoy_reconnects_after_server_side_tcp_reset() {
7173 // The ping task writes every few seconds in the test config.
7274 disconnect. wait ( ) . await ;
7375
74- let reconnect = envoy
75- . wait_for_next_connection_event ( common:: test_envoy:: EnvoyConnectionEvent :: Connected ) ;
76+ let reconnect = envoy. wait_for_next_connection_event (
77+ common:: test_envoy:: EnvoyConnectionEvent :: Connected ,
78+ ) ;
7679 envoy_proxy
7780 . clear_toxics ( )
7881 . await
@@ -90,6 +93,100 @@ fn envoy_reconnects_after_server_side_tcp_reset() {
9093 ) ;
9194}
9295
/// Simulates a silent network partition between the envoy-client and the engine.
///
/// Expected outcome, as asserted below:
/// - the engine logs a ping task failure once its pongs stop arriving,
/// - the envoy-client never observes a `Disconnected` connection event,
/// - the envoy-client's ping health check nevertheless reports unhealthy.
#[test]
fn engine_closes_envoy_ws_after_ping_timeout_while_envoy_remains_unaware() {
	// Behavior name shared by the envoy registration and the actor creation call.
	const ACTOR_NAME: &str = "network-fault-actor";
	// How long the link stays black-holed before assertions run. Chosen to sit well
	// past the engine's application-level ping timeout (default 15s).
	const PARTITION_WINDOW: std::time::Duration = std::time::Duration::from_secs(20);

	common::run(common::TestOpts::new(1).with_timeout(120), |ctx| async move {
		let dc = ctx.leader_dc();
		let (namespace, _) = common::setup_test_namespace(dc).await;

		// A dedicated forwarder is used instead of Toxiproxy: Toxiproxy can stall
		// traffic, but it always relays one peer's TCP close to the other side, which
		// would tip off the envoy-client that the engine hung up. A true partition
		// must hide that.
		let upstream = std::net::SocketAddr::from((std::net::Ipv4Addr::LOCALHOST, dc.guard_port()));
		let freeze_proxy = common::freeze_proxy::FreezeProxy::start(upstream)
			.await
			.expect("failed to start freeze proxy");

		// Route the envoy through the freeze proxy and register a trivial actor that
		// simply reports itself as running.
		let envoy = common::setup_envoy(dc, &namespace, |builder| {
			builder
				.with_endpoint(freeze_proxy.endpoint())
				.with_actor_behavior(ACTOR_NAME, |_| {
					Box::new(
						common::test_envoy::CustomActorBuilder::new()
							.on_start(|_| {
								Box::pin(async { Ok(common::test_envoy::ActorStartResult::Running) })
							})
							.build(),
					)
				})
		})
		.await;

		let actor_id = common::create_actor(
			dc.guard_port(),
			&namespace,
			ACTOR_NAME,
			envoy.pool_name(),
			rivet_types::actors::CrashPolicy::Sleep,
		)
		.await
		.actor
		.actor_id
		.to_string();
		wait_for_envoy_actor(&envoy, &actor_id).await;
		wait_for_connectable(dc.guard_port(), &namespace, &actor_id).await;

		// Sanity check: the actor answers through guard while the link is healthy.
		let pong = common::ping_actor_via_guard(dc, &actor_id).await;
		assert_eq!(pong["status"], "ok");

		// Register the listener before the fault is injected so that no disconnect
		// event can slip past unobserved.
		let mut disconnect = envoy
			.wait_for_next_connection_event(common::test_envoy::EnvoyConnectionEvent::Disconnected);
		disconnect.assert_no_event();

		// Black-hole both directions: bytes from either peer are read and dropped, and
		// EOFs are swallowed so neither TCP stack ever sees a FIN. The engine keeps
		// pinging every few seconds (default 3s) without getting pongs back, so its
		// ping timeout (default 15s) eventually fires and it closes the WebSocket.
		// The envoy-client has no application-level liveness check of its own, so it
		// keeps trusting the connection for as long as its TCP socket stays open.
		freeze_proxy.freeze();

		// Let the partition outlast the engine's ping timeout, then confirm through
		// the captured logs that the engine tore down its side.
		tokio::time::sleep(PARTITION_WINDOW).await;
		let logs = common::captured_logs_snapshot();
		assert!(
			logs.contains("ping task failed"),
			"expected engine to log a ping timeout, got logs:\n {logs}"
		);

		// The engine's close frame and TCP FIN are held back by the freeze proxy, so
		// from the envoy-client's point of view nothing has happened.
		disconnect.assert_no_event();

		// Although the envoy-client still believes the WebSocket is alive, its ping
		// tracker must flag the connection as unhealthy because no engine ping has
		// arrived during the partition. The rivetkit `/health` endpoint relies on this
		// signal to ask its host to recycle the container.
		let ping_healthy = envoy
			.is_ping_healthy()
			.await
			.expect("envoy handle should exist");
		assert!(
			!ping_healthy,
			"envoy-client should report unhealthy after 20s without an engine ping"
		);
	});
}
189+
93190async fn wait_for_envoy_actor ( envoy : & common:: test_envoy:: TestEnvoy , actor_id : & str ) {
94191 common:: wait_with_poll (
95192 std:: time:: Duration :: from_secs ( 5 ) ,
@@ -138,7 +235,9 @@ async fn ping_actor_via_gateway(guard_port: u16, actor_id: &str) -> serde_json::
138235 . expect ( "failed to build reqwest client" ) ;
139236
140237 let response = client
141- . get ( format ! ( "http://127.0.0.1:{guard_port}/gateway/{actor_id}/ping" ) )
238+ . get ( format ! (
239+ "http://127.0.0.1:{guard_port}/gateway/{actor_id}/ping"
240+ ) )
142241 . send ( )
143242 . await
144243 . expect ( "failed to ping actor through gateway" ) ;
0 commit comments