Skip to content

Commit 7638112

Browse files
codeSamuraii and Copilot committed
ping
Co-authored-by: Copilot <copilot@github.com>
1 parent 958f594 commit 7638112

2 files changed

Lines changed: 43 additions & 9 deletions

File tree

pyfuse/worker/sandbox/docker.py

Lines changed: 35 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -209,21 +209,47 @@ async def _run_container(self) -> None:
209209
raise WorkerError(f"Failed to start Docker container: {stderr.strip()}")
210210

211211
async def _wait_for_agent(self) -> None:
212+
"""Block until the guest agent is actually answering on the wire.
213+
214+
A bare TCP ``open_connection`` is not sufficient on Linux: when
215+
a container port is published, ``docker-proxy`` accepts host
216+
connections *before* the in-container process is listening,
217+
which causes the very first request to race with agent startup
218+
and come back as ``IncompleteReadError``. Performing an actual
219+
ping/pong exchange guarantees end-to-end readiness.
220+
"""
212221
assert self._host_port is not None
213-
for _ in range(int(self.boot_timeout)):
222+
deadline = asyncio.get_running_loop().time() + self.boot_timeout
223+
last_err: Exception | None = None
224+
while asyncio.get_running_loop().time() < deadline:
214225
try:
215-
_r, _w = await asyncio.wait_for(
226+
reader, writer = await asyncio.wait_for(
216227
asyncio.open_connection("127.0.0.1", self._host_port),
217228
timeout=2.0,
218229
)
219-
_w.close()
220-
with contextlib.suppress(ConnectionError, OSError):
221-
await _w.wait_closed()
222-
return
223-
except (OSError, asyncio.TimeoutError):
224-
await asyncio.sleep(1)
230+
try:
231+
await asyncio.wait_for(
232+
async_send(writer, {"op": "ping"}), timeout=2.0,
233+
)
234+
msg = await asyncio.wait_for(async_recv(reader), timeout=2.0)
235+
finally:
236+
writer.close()
237+
with contextlib.suppress(ConnectionError, OSError):
238+
await writer.wait_closed()
239+
if msg.get("status") == "pong":
240+
return
241+
last_err = WorkerError(f"Unexpected handshake reply: {msg!r}")
242+
except (
243+
OSError,
244+
asyncio.TimeoutError,
245+
asyncio.IncompleteReadError,
246+
ConnectionError,
247+
) as exc:
248+
last_err = exc
249+
await asyncio.sleep(0.5)
225250
raise WorkerError(
226-
f"Docker guest agent did not become reachable within {self.boot_timeout}s"
251+
f"Docker guest agent did not become reachable within "
252+
f"{self.boot_timeout}s: {last_err!r}"
227253
)
228254

229255
async def _connect(self) -> None:

pyfuse/worker/sandbox/guest_agent.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -264,6 +264,14 @@ async def _handle_client(
264264
try:
265265
while True:
266266
req = await _recv(reader)
267+
# Cheap liveness handshake used by the host to confirm the
268+
# in-container agent is actually accepting requests (a TCP
269+
# connection alone isn't sufficient: on Linux docker-proxy
270+
# accepts the connection on the host port even before the
271+
# guest agent process has started listening).
272+
if req.get("op") == "ping":
273+
await _send(writer, {"status": "pong"})
274+
continue
267275
resp = await _execute_request(req, writer)
268276
await _send(writer, resp)
269277
except (asyncio.IncompleteReadError, ConnectionError, OSError):

0 commit comments

Comments
 (0)