From 3925eb124c48c950b369d5d4a022d1c10955133f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Wed, 25 Mar 2026 21:09:39 +0800 Subject: [PATCH 01/20] fix(execd): fix mismatched image tag --- components/execd/RELEASE_NOTES.md | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/components/execd/RELEASE_NOTES.md b/components/execd/RELEASE_NOTES.md index 1b89a2d58..4bae9f307 100644 --- a/components/execd/RELEASE_NOTES.md +++ b/components/execd/RELEASE_NOTES.md @@ -35,8 +35,8 @@ Thanks to these contributors ❤️ - @csdbianhua --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.8 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.8 # components/execd 1.0.7 @@ -70,8 +70,8 @@ Thanks to these contributors ❤️ - @dependabot --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.7 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7 # components/execd 1.0.6 @@ -93,8 +93,8 @@ Thanks to these contributors ❤️ - @dependabot --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.6 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.6 # components/execd 1.0.5 @@ -111,8 +111,8 @@ Thanks to these contributors ❤️ - @Pangjiping --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.5 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.5 # components/execd 1.0.4 @@ -135,8 +135,8 @@ Thanks to these contributors ❤️ - @ninan-nn --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.4 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.4 # components/execd 1.0.3 @@ -159,8 +159,8 @@ Thanks to these contributors ❤️ - @jwx0925 --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.3 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.3 # components/execd 1.0.2 @@ -188,8 +188,8 @@ Thanks to these contributors ❤️ - @ninan-nn --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.2 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.2 # components/execd 1.0.1 @@ -221,8 +221,8 @@ Thanks to these contributors ❤️ - @jwx0925 --- -- Docker Hub: opensandbox/execd:v1.0.9 -- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.9 +- Docker Hub: opensandbox/execd:v1.0.1 +- Aliyun Registry: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.1 # components/execd 1.0.0 From 68935834034398e44b0883f11964e28578d49684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Wed, 25 Mar 2026 21:34:52 +0800 Subject: [PATCH 02/20] feat(execd): span event-id(eid) by sse statement --- components/execd/pkg/runtime/bash_session.go | 2 +- .../execd/pkg/runtime/bash_session_test.go | 20 ++++----- components/execd/pkg/runtime/command.go | 4 +- components/execd/pkg/runtime/command_test.go | 8 ++-- .../execd/pkg/runtime/command_windows.go | 4 +- components/execd/pkg/runtime/jupyter.go | 4 +- components/execd/pkg/runtime/types.go | 45 +++++++++++++++++-- components/execd/pkg/web/controller/sse.go | 6 ++- .../execd/pkg/web/model/codeinterpreting.go | 29 ++---------- .../pkg/web/model/codeinterpreting_test.go | 5 ++- components/execd/pkg/web/model/command.go | 31 ++++++++++++- 11 files changed, 104 insertions(+), 54 deletions(-) diff --git a/components/execd/pkg/runtime/bash_session.go b/components/execd/pkg/runtime/bash_session.go index 5e5d01d3c..dd5f77e3b 100644 --- a/components/execd/pkg/runtime/bash_session.go +++ b/components/execd/pkg/runtime/bash_session.go @@ -236,7 +236,7 @@ func (s *bashSession) run(ctx context.Context, request *ExecuteCodeRequest) erro continue } if request.Hooks.OnExecuteStdout != nil { - request.Hooks.OnExecuteStdout(line) + request.Hooks.OnExecuteStdout(request.nextStdoutStderrEventID(), line) } } } diff --git a/components/execd/pkg/runtime/bash_session_test.go b/components/execd/pkg/runtime/bash_session_test.go index b18af3de0..853baad92 100644 --- a/components/execd/pkg/runtime/bash_session_test.go +++ b/components/execd/pkg/runtime/bash_session_test.go @@ -55,7 +55,7 @@ func TestBashSession_NonZeroExitEmitsError(t *testing.T) { Timeout: 5 * time.Second, Hooks: ExecuteResultHook{ OnExecuteInit: func(s string) { sessionID = s }, - OnExecuteStdout: func(s string) { stdoutLine = s }, + OnExecuteStdout: func(_ int64, s string) { stdoutLine = s }, OnExecuteError: func(err *execute.ErrorOutput) { errCh <- err }, OnExecuteComplete: func(_ time.Duration) { completeCh <- struct{}{} @@ -104,7 +104,7 @@ func TestBashSession_envAndExitCode(t *testing.T) { require.Equal(t, session.config.Session, ctx, "unexpected session in OnExecuteInit") initCalls++ }, - OnExecuteStdout: func(text string) { + OnExecuteStdout: func(_ int64, text string) { t.Log(text) stdoutLines = append(stdoutLines, text) }, @@ -177,7 +177,7 @@ func TestBashSession_envLargeOutputChained(t *testing.T) { require.Equal(t, session.config.Session, ctx, "unexpected session in OnExecuteInit") initCalls++ }, - OnExecuteStdout: func(text string) { + OnExecuteStdout: func(_ int64, text string) { t.Log(text) stdoutLines = append(stdoutLines, text) }, @@ -222,7 +222,7 @@ func TestBashSession_cwdPersistsWithoutOverride(t *testing.T) { targetDir := t.TempDir() var stdoutLines []string hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { stdoutLines = append(stdoutLines, line) }, } @@ -264,7 +264,7 @@ func TestBashSession_requestCwdOverridesAfterCd(t *testing.T) { var stdoutLines []string hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { stdoutLines = append(stdoutLines, line) }, } @@ -306,7 +306,7 @@ func TestBashSession_envDumpNotLeakedWhenNoTrailingNewline(t *testing.T) { var stdoutLines []string hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { stdoutLines = append(stdoutLines, line) }, } @@ -334,7 +334,7 @@ func TestBashSession_envDumpNotLeakedWhenNoOutput(t *testing.T) { var stdoutLines []string hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { stdoutLines = append(stdoutLines, line) }, } @@ -365,7 +365,7 @@ func TestBashSession_heredoc(t *testing.T) { t.Cleanup(func() { _ = controller.DeleteBashSession(sessionID) }) hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { fmt.Printf("[stdout] %s\n", line) }, OnExecuteComplete: func(d time.Duration) { @@ -418,7 +418,7 @@ func TestBashSession_execReplacesShell(t *testing.T) { var stdoutLines []string hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { stdoutLines = append(stdoutLines, line) }, } @@ -458,7 +458,7 @@ func TestBashSession_complexExec(t *testing.T) { var stdoutLines []string hooks := ExecuteResultHook{ - OnExecuteStdout: func(line string) { + OnExecuteStdout: func(_ int64, line string) { stdoutLines = append(stdoutLines, line) }, } diff --git a/components/execd/pkg/runtime/command.go b/components/execd/pkg/runtime/command.go index 208b541ab..a18c34041 100644 --- a/components/execd/pkg/runtime/command.go +++ b/components/execd/pkg/runtime/command.go @@ -128,11 +128,11 @@ func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest wg.Add(2) safego.Go(func() { defer wg.Done() - c.tailStdPipe(stdoutPath, request.Hooks.OnExecuteStdout, done) + c.tailStdPipe(stdoutPath, request.wrapStdoutPipeHook(), done) }) safego.Go(func() { defer wg.Done() - c.tailStdPipe(stderrPath, request.Hooks.OnExecuteStderr, done) + c.tailStdPipe(stderrPath, request.wrapStderrPipeHook(), done) }) err = cmd.Start() diff --git a/components/execd/pkg/runtime/command_test.go b/components/execd/pkg/runtime/command_test.go index e282d40a1..1866aa2b5 100644 --- a/components/execd/pkg/runtime/command_test.go +++ b/components/execd/pkg/runtime/command_test.go @@ -133,10 +133,10 @@ func TestRunCommand_Echo(t *testing.T) { Timeout: 5 * time.Second, Hooks: ExecuteResultHook{ OnExecuteInit: func(s string) { sessionID = s }, - OnExecuteStdout: func(s string) { + OnExecuteStdout: func(_ int64, s string) { stdoutLines = append(stdoutLines, s) }, - OnExecuteStderr: func(s string) { + OnExecuteStderr: func(_ int64, s string) { stderrLines = append(stderrLines, s) }, OnExecuteError: func(err *execute.ErrorOutput) { @@ -188,8 +188,8 @@ func TestRunCommand_Error(t *testing.T) { Timeout: 5 * time.Second, Hooks: ExecuteResultHook{ OnExecuteInit: func(s string) { sessionID = s }, - OnExecuteStdout: func(s string) { stdoutLines = append(stdoutLines, s) }, - OnExecuteStderr: func(s string) { stderrLines = append(stderrLines, s) }, + OnExecuteStdout: func(_ int64, s string) { stdoutLines = append(stdoutLines, s) }, + OnExecuteStderr: func(_ int64, s string) { stderrLines = append(stderrLines, s) }, OnExecuteError: func(err *execute.ErrorOutput) { gotErr = err completeCh <- struct{}{} diff --git a/components/execd/pkg/runtime/command_windows.go b/components/execd/pkg/runtime/command_windows.go index 888bd5e89..ffa7bef89 100644 --- a/components/execd/pkg/runtime/command_windows.go +++ b/components/execd/pkg/runtime/command_windows.go @@ -53,10 +53,10 @@ func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest done := make(chan struct{}, 1) safego.Go(func() { - c.tailStdPipe(c.stdoutFileName(session), request.Hooks.OnExecuteStdout, done) + c.tailStdPipe(c.stdoutFileName(session), request.wrapStdoutPipeHook(), done) }) safego.Go(func() { - c.tailStdPipe(c.stderrFileName(session), request.Hooks.OnExecuteStderr, done) + c.tailStdPipe(c.stderrFileName(session), request.wrapStderrPipeHook(), done) }) err = cmd.Start() diff --git a/components/execd/pkg/runtime/jupyter.go b/components/execd/pkg/runtime/jupyter.go index 9ea33b13b..9228073fb 100644 --- a/components/execd/pkg/runtime/jupyter.go +++ b/components/execd/pkg/runtime/jupyter.go @@ -103,9 +103,9 @@ func (c *Controller) runJupyterCode(ctx context.Context, kernel *jupyterKernel, for _, stream := range result.Stream { switch stream.Name { case execute.StreamStdout: - request.Hooks.OnExecuteStdout(stream.Text) + request.Hooks.OnExecuteStdout(0, stream.Text) case execute.StreamStderr: - request.Hooks.OnExecuteStderr(stream.Text) + request.Hooks.OnExecuteStderr(0, stream.Text) default: } } diff --git a/components/execd/pkg/runtime/types.go b/components/execd/pkg/runtime/types.go index cd0615c63..4740902f8 100644 --- a/components/execd/pkg/runtime/types.go +++ b/components/execd/pkg/runtime/types.go @@ -17,18 +17,21 @@ package runtime import ( "fmt" "sync" + "sync/atomic" "time" "github.com/alibaba/opensandbox/execd/pkg/jupyter/execute" ) // ExecuteResultHook groups execution callbacks. +// Eid is only assigned for stdout/stderr on run command (pipe tail) and run-in-session (bash stdout pipe); +// other paths pass eid=0. Ids are allocated in runtime at pipe sync, not in HTTP/SSE writers. type ExecuteResultHook struct { OnExecuteInit func(context string) OnExecuteResult func(result map[string]any, count int) OnExecuteStatus func(status string) - OnExecuteStdout func(stdout string) //nolint:predeclared - OnExecuteStderr func(stderr string) //nolint:predeclared + OnExecuteStdout func(eid int64, stdout string) //nolint:predeclared + OnExecuteStderr func(eid int64, stderr string) //nolint:predeclared OnExecuteError func(err *execute.ErrorOutput) OnExecuteComplete func(executionTime time.Duration) } @@ -44,6 +47,40 @@ type ExecuteCodeRequest struct { Uid *uint32 `json:"uid,omitempty"` Gid *uint32 `json:"gid,omitempty"` Hooks ExecuteResultHook + + // eventSeq assigns monotonic eids (1-based) for stdout/stderr on run command and bash session only. + eventSeq atomic.Uint64 +} + +// nextStdoutStderrEventID returns the next eid for stdout/stderr lines. Used only from run command +// pipe tailers and bash session stdout; other callers should pass 0 into OnExecuteStdout/Stderr. +func (req *ExecuteCodeRequest) nextStdoutStderrEventID() int64 { + if req == nil { + return 0 + } + return int64(req.eventSeq.Add(1)) +} + +// wrapStdoutPipeHook wraps stdout delivery so eid is assigned when a line is flushed from the pipe tailer, not in SSE writes. +func (req *ExecuteCodeRequest) wrapStdoutPipeHook() func(string) { + return func(text string) { + if text == "" || req.Hooks.OnExecuteStdout == nil { + return + } + eid := req.nextStdoutStderrEventID() + req.Hooks.OnExecuteStdout(eid, text) + } +} + +// wrapStderrPipeHook wraps stderr delivery so eid is assigned when a line is flushed from the pipe tailer, not in SSE writes. +func (req *ExecuteCodeRequest) wrapStderrPipeHook() func(string) { + return func(text string) { + if text == "" || req.Hooks.OnExecuteStderr == nil { + return + } + eid := req.nextStdoutStderrEventID() + req.Hooks.OnExecuteStderr(eid, text) + } } // SetDefaultHooks installs stdout logging fallbacks for unset hooks. @@ -55,10 +92,10 @@ func (req *ExecuteCodeRequest) SetDefaultHooks() { req.Hooks.OnExecuteStatus = func(status string) { fmt.Printf("OnExecuteStatus: %s\n", status) } } if req.Hooks.OnExecuteStdout == nil { - req.Hooks.OnExecuteStdout = func(stdout string) { fmt.Printf("OnExecuteStdout: %s\n", stdout) } + req.Hooks.OnExecuteStdout = func(eid int64, stdout string) { fmt.Printf("OnExecuteStdout: eid=%d %s\n", eid, stdout) } } if req.Hooks.OnExecuteStderr == nil { - req.Hooks.OnExecuteStderr = func(stderr string) { fmt.Printf("OnExecuteStderr: %s\n", stderr) } + req.Hooks.OnExecuteStderr = func(eid int64, stderr string) { fmt.Printf("OnExecuteStderr: eid=%d %s\n", eid, stderr) } } if req.Hooks.OnExecuteError == nil { req.Hooks.OnExecuteError = func(err *execute.ErrorOutput) { fmt.Printf("OnExecuteError: %++v\n", err) } diff --git a/components/execd/pkg/web/controller/sse.go b/components/execd/pkg/web/controller/sse.go index 9e87bda6b..1f00bcdf1 100644 --- a/components/execd/pkg/web/controller/sse.go +++ b/components/execd/pkg/web/controller/sse.go @@ -123,12 +123,13 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) payload := event.ToJSON() c.writeSingleEvent("OnExecuteStatus", payload, true, event.Summary()) }, - OnExecuteStdout: func(text string) { + OnExecuteStdout: func(eid int64, text string) { if text == "" { return } event := model.ServerStreamEvent{ + Eid: eid, Type: model.StreamEventTypeStdout, Text: text, Timestamp: time.Now().UnixMilli(), @@ -136,12 +137,13 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) payload := event.ToJSON() c.writeSingleEvent("OnExecuteStdout", payload, true, event.Summary()) }, - OnExecuteStderr: func(text string) { + OnExecuteStderr: func(eid int64, text string) { if text == "" { return } event := model.ServerStreamEvent{ + Eid: eid, Type: model.StreamEventTypeStderr, Text: text, Timestamp: time.Now().UnixMilli(), diff --git a/components/execd/pkg/web/model/codeinterpreting.go b/components/execd/pkg/web/model/codeinterpreting.go index 771b6d75b..c473b94e4 100644 --- a/components/execd/pkg/web/model/codeinterpreting.go +++ b/components/execd/pkg/web/model/codeinterpreting.go @@ -16,7 +16,6 @@ package model import ( "encoding/json" - "errors" "fmt" "strings" @@ -47,30 +46,6 @@ type CodeContextRequest struct { Cwd string `json:"cwd,omitempty"` } -// RunCommandRequest represents a shell command execution request. -type RunCommandRequest struct { - Command string `json:"command" validate:"required"` - Cwd string `json:"cwd,omitempty"` - Background bool `json:"background,omitempty"` - // TimeoutMs caps execution duration; 0 uses server default. - TimeoutMs int64 `json:"timeout,omitempty" validate:"omitempty,gte=1"` - - Uid *uint32 `json:"uid,omitempty"` - Gid *uint32 `json:"gid,omitempty"` - Envs map[string]string `json:"envs,omitempty"` -} - -func (r *RunCommandRequest) Validate() error { - validate := validator.New() - if err := validate.Struct(r); err != nil { - return err - } - if r.Gid != nil && r.Uid == nil { - return errors.New("uid is required when gid is provided") - } - return nil -} - type ServerStreamEventType string const ( @@ -87,6 +62,7 @@ const ( // ServerStreamEvent is emitted to clients over SSE. type ServerStreamEvent struct { + Eid int64 `json:"eid,omitempty"` Type ServerStreamEventType `json:"type,omitempty"` Text string `json:"text,omitempty"` ExecutionCount int `json:"execution_count,omitempty"` @@ -105,6 +81,9 @@ func (s ServerStreamEvent) ToJSON() []byte { // Summary renders a lightweight, log-friendly string without JSON. func (s ServerStreamEvent) Summary() string { parts := []string{fmt.Sprintf("type=%s", s.Type)} + if s.Eid > 0 { + parts = append(parts, fmt.Sprintf("eid=%d", s.Eid)) + } if s.Text != "" { parts = append(parts, fmt.Sprintf("text=%s", truncateString(s.Text, 100))) } diff --git a/components/execd/pkg/web/model/codeinterpreting_test.go b/components/execd/pkg/web/model/codeinterpreting_test.go index f0903bf05..0d83b645f 100644 --- a/components/execd/pkg/web/model/codeinterpreting_test.go +++ b/components/execd/pkg/web/model/codeinterpreting_test.go @@ -67,6 +67,7 @@ func TestRunCommandRequestValidateUidGid(t *testing.T) { func TestServerStreamEventToJSON(t *testing.T) { event := ServerStreamEvent{ + Eid: 42, Type: StreamEventTypeStdout, Text: "hello", ExecutionCount: 3, @@ -75,6 +76,7 @@ func TestServerStreamEventToJSON(t *testing.T) { data := event.ToJSON() var decoded ServerStreamEvent require.NoError(t, json.Unmarshal(data, &decoded)) + require.Equal(t, event.Eid, decoded.Eid) require.Equal(t, event.Type, decoded.Type) require.Equal(t, event.Text, decoded.Text) require.Equal(t, event.ExecutionCount, decoded.ExecutionCount) @@ -90,11 +92,12 @@ func TestServerStreamEventSummary(t *testing.T) { { name: "basic stdout", event: ServerStreamEvent{ + Eid: 7, Type: StreamEventTypeStdout, Text: "hello", ExecutionCount: 2, }, - contains: []string{"type=stdout", "text=hello"}, + contains: []string{"type=stdout", "eid=7", "text=hello"}, }, { name: "truncated text and error", diff --git a/components/execd/pkg/web/model/command.go b/components/execd/pkg/web/model/command.go index 0d35aa823..06be14177 100644 --- a/components/execd/pkg/web/model/command.go +++ b/components/execd/pkg/web/model/command.go @@ -14,7 +14,36 @@ package model -import "time" +import ( + "errors" + "time" + + "github.com/go-playground/validator/v10" +) + +// RunCommandRequest represents a shell command execution request. +type RunCommandRequest struct { + Command string `json:"command" validate:"required"` + Cwd string `json:"cwd,omitempty"` + Background bool `json:"background,omitempty"` + // TimeoutMs caps execution duration; 0 uses server default. + TimeoutMs int64 `json:"timeout,omitempty" validate:"omitempty,gte=1"` + + Uid *uint32 `json:"uid,omitempty"` + Gid *uint32 `json:"gid,omitempty"` + Envs map[string]string `json:"envs,omitempty"` +} + +func (r *RunCommandRequest) Validate() error { + validate := validator.New() + if err := validate.Struct(r); err != nil { + return err + } + if r.Gid != nil && r.Uid == nil { + return errors.New("uid is required when gid is provided") + } + return nil +} // CommandStatusResponse represents command status for REST APIs. type CommandStatusResponse struct { From 97b6d8c47ba125945acb4758b01df9552645a6da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Wed, 25 Mar 2026 21:44:19 +0800 Subject: [PATCH 03/20] feat(execd): add command/session resume api --- components/execd/pkg/runtime/types.go | 7 ------- .../pkg/web/controller/codeinterpreting.go | 19 +++++++++++++++++++ .../execd/pkg/web/controller/command.go | 15 +++++++++++++++ components/execd/pkg/web/model/command.go | 3 +++ components/execd/pkg/web/model/error.go | 1 + components/execd/pkg/web/router.go | 2 ++ 6 files changed, 40 insertions(+), 7 deletions(-) diff --git a/components/execd/pkg/runtime/types.go b/components/execd/pkg/runtime/types.go index 4740902f8..3beedc23b 100644 --- a/components/execd/pkg/runtime/types.go +++ b/components/execd/pkg/runtime/types.go @@ -24,8 +24,6 @@ import ( ) // ExecuteResultHook groups execution callbacks. -// Eid is only assigned for stdout/stderr on run command (pipe tail) and run-in-session (bash stdout pipe); -// other paths pass eid=0. Ids are allocated in runtime at pipe sync, not in HTTP/SSE writers. type ExecuteResultHook struct { OnExecuteInit func(context string) OnExecuteResult func(result map[string]any, count int) @@ -48,12 +46,9 @@ type ExecuteCodeRequest struct { Gid *uint32 `json:"gid,omitempty"` Hooks ExecuteResultHook - // eventSeq assigns monotonic eids (1-based) for stdout/stderr on run command and bash session only. eventSeq atomic.Uint64 } -// nextStdoutStderrEventID returns the next eid for stdout/stderr lines. Used only from run command -// pipe tailers and bash session stdout; other callers should pass 0 into OnExecuteStdout/Stderr. func (req *ExecuteCodeRequest) nextStdoutStderrEventID() int64 { if req == nil { return 0 @@ -61,7 +56,6 @@ func (req *ExecuteCodeRequest) nextStdoutStderrEventID() int64 { return int64(req.eventSeq.Add(1)) } -// wrapStdoutPipeHook wraps stdout delivery so eid is assigned when a line is flushed from the pipe tailer, not in SSE writes. func (req *ExecuteCodeRequest) wrapStdoutPipeHook() func(string) { return func(text string) { if text == "" || req.Hooks.OnExecuteStdout == nil { @@ -72,7 +66,6 @@ func (req *ExecuteCodeRequest) wrapStdoutPipeHook() func(string) { } } -// wrapStderrPipeHook wraps stderr delivery so eid is assigned when a line is flushed from the pipe tailer, not in SSE writes. func (req *ExecuteCodeRequest) wrapStderrPipeHook() func(string) { return func(text string) { if text == "" || req.Hooks.OnExecuteStderr == nil { diff --git a/components/execd/pkg/web/controller/codeinterpreting.go b/components/execd/pkg/web/controller/codeinterpreting.go index b0facef4e..e2aa65393 100644 --- a/components/execd/pkg/web/controller/codeinterpreting.go +++ b/components/execd/pkg/web/controller/codeinterpreting.go @@ -321,6 +321,25 @@ func (c *CodeInterpretingController) RunInSession() { time.Sleep(flag.ApiGracefulShutdownTimeout) } +func (c *CodeInterpretingController) ResumeSessionStream() { + sessionID := c.ctx.Param("sessionId") + if sessionID == "" { + c.RespondError( + http.StatusBadRequest, + model.ErrorCodeMissingQuery, + "missing path parameter 'sessionId'", + ) + return + } + _ = c.QueryInt64(c.ctx.Query(model.SessionResumeAfterEidQuery), 0) + + c.RespondError( + http.StatusNotImplemented, + model.ErrorCodeNotImplemented, + "session stream resume is not implemented yet", + ) +} + // DeleteSession deletes a bash session (delete_session API). func (c *CodeInterpretingController) DeleteSession() { sessionID := c.ctx.Param("sessionId") diff --git a/components/execd/pkg/web/controller/command.go b/components/execd/pkg/web/controller/command.go index d4da90df5..a8552004a 100644 --- a/components/execd/pkg/web/controller/command.go +++ b/components/execd/pkg/web/controller/command.go @@ -125,6 +125,21 @@ func (c *CodeInterpretingController) GetBackgroundCommandOutput() { c.ctx.String(http.StatusOK, "%s", output) } +func (c *CodeInterpretingController) ResumeCommandStream() { + commandID := c.ctx.Param("id") + if commandID == "" { + c.RespondError(http.StatusBadRequest, model.ErrorCodeInvalidRequest, "missing command execution id") + return + } + _ = c.QueryInt64(c.ctx.Query(model.CommandResumeAfterEidQuery), 0) + + c.RespondError( + http.StatusNotImplemented, + model.ErrorCodeNotImplemented, + "command stream resume is not implemented yet", + ) +} + func (c *CodeInterpretingController) buildExecuteCommandRequest(request model.RunCommandRequest) *runtime.ExecuteCodeRequest { timeout := time.Duration(request.TimeoutMs) * time.Millisecond if request.Background { diff --git a/components/execd/pkg/web/model/command.go b/components/execd/pkg/web/model/command.go index 06be14177..b4d855c71 100644 --- a/components/execd/pkg/web/model/command.go +++ b/components/execd/pkg/web/model/command.go @@ -21,6 +21,9 @@ import ( "github.com/go-playground/validator/v10" ) +const CommandResumeAfterEidQuery = "after_eid" +const SessionResumeAfterEidQuery = CommandResumeAfterEidQuery + // RunCommandRequest represents a shell command execution request. type RunCommandRequest struct { Command string `json:"command" validate:"required"` diff --git a/components/execd/pkg/web/model/error.go b/components/execd/pkg/web/model/error.go index 80e0ef23a..519e26f12 100644 --- a/components/execd/pkg/web/model/error.go +++ b/components/execd/pkg/web/model/error.go @@ -26,6 +26,7 @@ const ( ErrorCodeFileNotFound ErrorCode = "FILE_NOT_FOUND" ErrorCodeUnknown ErrorCode = "UNKNOWN" ErrorCodeContextNotFound ErrorCode = "CONTEXT_NOT_FOUND" + ErrorCodeNotImplemented ErrorCode = "NOT_IMPLEMENTED" ) type ErrorResponse struct { diff --git a/components/execd/pkg/web/router.go b/components/execd/pkg/web/router.go index 8894257d2..1f317459b 100644 --- a/components/execd/pkg/web/router.go +++ b/components/execd/pkg/web/router.go @@ -66,6 +66,7 @@ func NewRouter(accessToken string) *gin.Engine { { session.POST("", withCode(func(c *controller.CodeInterpretingController) { c.CreateSession() })) session.POST("/:sessionId/run", withCode(func(c *controller.CodeInterpretingController) { c.RunInSession() })) + session.GET("/:sessionId/resume", withCode(func(c *controller.CodeInterpretingController) { c.ResumeSessionStream() })) session.DELETE("/:sessionId", withCode(func(c *controller.CodeInterpretingController) { c.DeleteSession() })) } @@ -74,6 +75,7 @@ func NewRouter(accessToken string) *gin.Engine { command.POST("", withCode(func(c *controller.CodeInterpretingController) { c.RunCommand() })) command.DELETE("", withCode(func(c *controller.CodeInterpretingController) { c.InterruptCommand() })) command.GET("/status/:id", withCode(func(c *controller.CodeInterpretingController) { c.GetCommandStatus() })) + command.GET("/:id/resume", withCode(func(c *controller.CodeInterpretingController) { c.ResumeCommandStream() })) command.GET("/:id/logs", withCode(func(c *controller.CodeInterpretingController) { c.GetBackgroundCommandOutput() })) } From 409858065b941f91c6d634bbcdeebb8194af3baa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Wed, 25 Mar 2026 21:59:03 +0800 Subject: [PATCH 04/20] feat(execd): extract stream buffer (sbuf) package for ring buffer queue --- components/execd/pkg/sbuf/config.go | 37 +++ components/execd/pkg/sbuf/errors.go | 23 ++ components/execd/pkg/sbuf/event.go | 21 ++ components/execd/pkg/sbuf/ring.go | 91 ++++++ components/execd/pkg/sbuf/store.go | 108 +++++++ .../execd/pkg/sbuf/store_benchmark_test.go | 136 +++++++++ components/execd/pkg/sbuf/store_test.go | 80 +++++ .../pkg/web/controller/codeinterpreting.go | 30 +- .../execd/pkg/web/controller/command.go | 65 ++++- .../pkg/web/controller/command_stream.go | 190 ++++++++++++ components/execd/pkg/web/controller/sse.go | 62 ++-- components/execd/pkg/web/model/command.go | 1 - components/execd/pkg/web/router.go | 1 - components/execd/tests/command_resume_test.py | 274 ++++++++++++++++++ 14 files changed, 1072 insertions(+), 47 deletions(-) create mode 100644 components/execd/pkg/sbuf/config.go create mode 100644 components/execd/pkg/sbuf/errors.go create mode 100644 components/execd/pkg/sbuf/event.go create mode 100644 components/execd/pkg/sbuf/ring.go create mode 100644 components/execd/pkg/sbuf/store.go create mode 100644 components/execd/pkg/sbuf/store_benchmark_test.go create mode 100644 components/execd/pkg/sbuf/store_test.go create mode 100644 components/execd/pkg/web/controller/command_stream.go create mode 100644 components/execd/tests/command_resume_test.py diff --git a/components/execd/pkg/sbuf/config.go b/components/execd/pkg/sbuf/config.go new file mode 100644 index 000000000..12dda926d --- /dev/null +++ b/components/execd/pkg/sbuf/config.go @@ -0,0 +1,37 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sbuf + +// Config controls per-stream bounds and append policy. +type Config struct { + // MaxEvents is the maximum number of events retained per stream. Oldest events are dropped when exceeded. + // Zero defaults to DefaultMaxEvents. + MaxEvents int + // MaxBytes is the approximate upper bound on total payload bytes per stream (sum of len(Payload)). + // Oldest events are dropped until under the limit. Zero means no byte limit. + MaxBytes int64 + // StrictMonotonic rejects Append when eid <= last eid for that stream. Recommended for execd SSE eids. + StrictMonotonic bool +} + +const DefaultMaxEvents = 1024 + +func (c *Config) normalized() Config { + out := *c + if out.MaxEvents <= 0 { + out.MaxEvents = DefaultMaxEvents + } + return out +} diff --git a/components/execd/pkg/sbuf/errors.go b/components/execd/pkg/sbuf/errors.go new file mode 100644 index 000000000..8a37ac6e5 --- /dev/null +++ b/components/execd/pkg/sbuf/errors.go @@ -0,0 +1,23 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sbuf + +import "errors" + +var ( + // ErrOutOfOrder is returned when StrictMonotonic is enabled and eid is not greater than the last appended eid. + ErrOutOfOrder = errors.New("sbuf: eid out of order for stream") + ErrEmptyStreamID = errors.New("sbuf: empty stream id") +) diff --git a/components/execd/pkg/sbuf/event.go b/components/execd/pkg/sbuf/event.go new file mode 100644 index 000000000..d65219e83 --- /dev/null +++ b/components/execd/pkg/sbuf/event.go @@ -0,0 +1,21 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sbuf + +// Event is one stored record (e.g. a single SSE JSON line body). Payload is owned by the buffer after Append. +type Event struct { + EID int64 + Payload []byte +} diff --git a/components/execd/pkg/sbuf/ring.go b/components/execd/pkg/sbuf/ring.go new file mode 100644 index 000000000..37426328a --- /dev/null +++ b/components/execd/pkg/sbuf/ring.go @@ -0,0 +1,91 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sbuf + +// ring is a FIFO queue with a fixed max length; push drops oldest when full. +type ring struct { + maxLen int + slots []eventSlot + head int + n int + bytes int64 +} + +type eventSlot struct { + eid int64 + payload []byte +} + +func newRing(maxLen int) *ring { + if maxLen < 1 { + maxLen = 1 + } + return &ring{ + maxLen: maxLen, + slots: make([]eventSlot, maxLen), + } +} + +func (r *ring) push(eid int64, payload []byte, maxBytes int64) { + pld := append([]byte(nil), payload...) + size := int64(len(pld)) + + if r.n == r.maxLen { + r.evictHead() + } + idx := (r.head + r.n) % r.maxLen + r.slots[idx] = eventSlot{eid: eid, payload: pld} + r.n++ + r.bytes += size + + if maxBytes > 0 { + for r.bytes > maxBytes && r.n > 0 { + r.evictHead() + } + } +} + +func (r *ring) evictHead() { + if r.n == 0 { + return + } + old := r.slots[r.head] + r.bytes -= int64(len(old.payload)) + r.slots[r.head] = eventSlot{} + r.head = (r.head + 1) % r.maxLen + r.n-- +} + +func (r *ring) iterAfter(afterEid int64, fn func(eid int64, payload []byte)) { + for i := range r.n { + idx := (r.head + i) % r.maxLen + s := r.slots[idx] + if s.eid > afterEid { + fn(s.eid, s.payload) + } + } +} + +// snapshotAfter returns a copy slice for safe iteration outside the ring lock. +func (r *ring) snapshotAfter(afterEid int64) []Event { + var out []Event + r.iterAfter(afterEid, func(eid int64, payload []byte) { + out = append(out, Event{ + EID: eid, + Payload: append([]byte(nil), payload...), + }) + }) + return out +} diff --git a/components/execd/pkg/sbuf/store.go b/components/execd/pkg/sbuf/store.go new file mode 100644 index 000000000..ead4666a4 --- /dev/null +++ b/components/execd/pkg/sbuf/store.go @@ -0,0 +1,108 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sbuf provides bounded, per-stream FIFO buffers for SSE (or similar) events keyed by eid, +// used to serve disconnect resume (catch-up by event id). +// It is storage-only: callers assign eids and decide when to delete a stream. +package sbuf + +import ( + "sync" +) + +// Store holds bounded event rings keyed by caller-defined stream IDs (e.g. command execution id). +type Store struct { + cfg Config + mu sync.Mutex + streams map[string]*streamBuf +} + +type streamBuf struct { + mu sync.Mutex + lastEid int64 + ring *ring + maxBytes int64 +} + +// NewStore creates an empty store. cfg is copied after normalization. +func NewStore(cfg Config) *Store { + cfg = cfg.normalized() + return &Store{ + cfg: cfg, + streams: make(map[string]*streamBuf), + } +} + +// Append adds one event to the stream's ring. Payload is copied. +// With StrictMonotonic, returns ErrOutOfOrder if eid <= previous eid for this stream. +func (s *Store) Append(streamID string, eid int64, payload []byte) error { + if streamID == "" { + return ErrEmptyStreamID + } + sb := s.getOrCreate(streamID) + sb.mu.Lock() + defer sb.mu.Unlock() + + if s.cfg.StrictMonotonic { + if eid <= sb.lastEid { + return ErrOutOfOrder + } + } + sb.lastEid = eid + sb.ring.push(eid, payload, sb.maxBytes) + return nil +} + +func (s *Store) getOrCreate(streamID string) *streamBuf { + s.mu.Lock() + defer s.mu.Unlock() + if sb, ok := s.streams[streamID]; ok { + return sb + } + sb := &streamBuf{ + ring: newRing(s.cfg.MaxEvents), + maxBytes: s.cfg.MaxBytes, + } + s.streams[streamID] = sb + return sb +} + +// EventsAfter returns a snapshot of events with EID > afterEid in order. +// If the stream does not exist, ok is false and events is nil. +func (s *Store) EventsAfter(streamID string, afterEid int64) (events []Event, ok bool) { + s.mu.Lock() + sb, found := s.streams[streamID] + s.mu.Unlock() + if !found { + return nil, false + } + sb.mu.Lock() + defer sb.mu.Unlock() + return sb.ring.snapshotAfter(afterEid), true +} + +// Delete removes a stream buffer. No-op if missing. +func (s *Store) Delete(streamID string) { + s.mu.Lock() + defer s.mu.Unlock() + delete(s.streams, streamID) +} + +// Has reports whether a stream currently exists. +func (s *Store) Has(streamID string) bool { + s.mu.Lock() + defer s.mu.Unlock() + _, ok := s.streams[streamID] + return ok +} diff --git a/components/execd/pkg/sbuf/store_benchmark_test.go b/components/execd/pkg/sbuf/store_benchmark_test.go new file mode 100644 index 000000000..e769db981 --- /dev/null +++ b/components/execd/pkg/sbuf/store_benchmark_test.go @@ -0,0 +1,136 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sbuf + +import ( + "fmt" + "sync/atomic" + "testing" +) + +// payload used across benchmarks (typical SSE JSON line order of magnitude). +var benchPayload = []byte(`{"type":"stdout","eid":1,"text":"hello","timestamp":0}`) + +// BenchmarkRing_pushNoEvict measures ring.push when the ring is not full (no evictHead). +func BenchmarkRing_pushNoEvict(b *testing.B) { + r := newRing(1 << 20) + b.SetBytes(int64(len(benchPayload))) + var eid atomic.Int64 + b.ResetTimer() + for i := 0; i < b.N; i++ { + r.push(eid.Add(1), benchPayload, 0) + } +} + +// BenchmarkRing_pushWithEvict measures push when the ring stays at capacity (each push may evict oldest). +func BenchmarkRing_pushWithEvict(b *testing.B) { + const cap = 64 + r := newRing(cap) + // Fill ring so every subsequent push evicts one slot. + for i := int64(1); i <= cap; i++ { + r.push(i, benchPayload, 0) + } + b.SetBytes(int64(len(benchPayload))) + var eid atomic.Int64 + eid.Store(cap) + b.ResetTimer() + for i := 0; i < b.N; i++ { + r.push(eid.Add(1), benchPayload, 0) + } +} + +// BenchmarkStore_Append_noEvict is Append on a warm stream with a large MaxEvents (no ring eviction). +func BenchmarkStore_Append_noEvict(b *testing.B) { + s := NewStore(Config{MaxEvents: 1 << 20, StrictMonotonic: true}) + if err := s.Append("s", 1, benchPayload); err != nil { + b.Fatal(err) + } + var eid int64 = 1 + b.SetBytes(int64(len(benchPayload))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + eid++ + if err := s.Append("s", eid, benchPayload); err != nil { + b.Fatal(err) + } + } +} + +// BenchmarkStore_Append_evicting keeps a small ring so almost every Append evicts the oldest event. +func BenchmarkStore_Append_evicting(b *testing.B) { + s := NewStore(Config{MaxEvents: 64, StrictMonotonic: true}) + if err := s.Append("s", 1, benchPayload); err != nil { + b.Fatal(err) + } + var eid int64 = 1 + b.SetBytes(int64(len(benchPayload))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + eid++ + if err := s.Append("s", eid, benchPayload); err != nil { + b.Fatal(err) + } + } +} + +// BenchmarkStore_EventsAfter measures snapshot copy cost after many appends. +func BenchmarkStore_EventsAfter(b *testing.B) { + const n = 1000 + s := NewStore(Config{MaxEvents: n + 10, StrictMonotonic: true}) + for i := int64(1); i <= n; i++ { + if err := s.Append("s", i, benchPayload); err != nil { + b.Fatal(err) + } + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = s.EventsAfter("s", 0) + } +} + +// BenchmarkStore_Append_ParallelDifferentStreams: one stream per goroutine (minimal lock contention on streamBuf). +func BenchmarkStore_Append_ParallelDifferentStreams(b *testing.B) { + s := NewStore(Config{MaxEvents: 1 << 16, StrictMonotonic: true}) + b.SetBytes(int64(len(benchPayload))) + var id atomic.Int64 + b.RunParallel(func(pb *testing.PB) { + // Unique stream id per goroutine iteration batch. + my := id.Add(1) + sid := fmt.Sprintf("s-%d", my) + var e int64 + for pb.Next() { + e++ + if err := s.Append(sid, e, benchPayload); err != nil { + b.Fatal(err) + } + } + }) +} + +// BenchmarkStore_Append_ParallelSameStream: all goroutines append to one stream (serialized on streamBuf.mu). +// StrictMonotonic is off: parallel workers would observe eids out of arrival order if enforced. +func BenchmarkStore_Append_ParallelSameStream(b *testing.B) { + s := NewStore(Config{MaxEvents: 1 << 20, StrictMonotonic: false}) + var eid atomic.Int64 + b.SetBytes(int64(len(benchPayload))) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + n := eid.Add(1) + if err := s.Append("s", n, benchPayload); err != nil { + b.Fatal(err) + } + } + }) +} diff --git a/components/execd/pkg/sbuf/store_test.go b/components/execd/pkg/sbuf/store_test.go new file mode 100644 index 000000000..aeb1554d9 --- /dev/null +++ b/components/execd/pkg/sbuf/store_test.go @@ -0,0 +1,80 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sbuf + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestStore_EventsAfter_NotFound(t *testing.T) { + s := NewStore(Config{MaxEvents: 8, StrictMonotonic: true}) + ev, ok := s.EventsAfter("missing", 0) + require.False(t, ok) + require.Nil(t, ev) +} + +func TestStore_AppendStrictMonotonic(t *testing.T) { + s := NewStore(Config{MaxEvents: 8, StrictMonotonic: true}) + require.NoError(t, s.Append("stream-a", 1, []byte(`{"a":1}`))) + require.ErrorIs(t, s.Append("stream-a", 1, []byte(`dup`)), ErrOutOfOrder) + require.NoError(t, s.Append("stream-a", 2, []byte(`{"a":2}`))) + + ev, ok := s.EventsAfter("stream-a", 0) + require.True(t, ok) + require.Len(t, ev, 2) + require.Equal(t, int64(1), ev[0].EID) + require.Equal(t, `{"a":1}`, string(ev[0].Payload)) + + ev2, _ := s.EventsAfter("stream-a", 1) + require.Len(t, ev2, 1) + require.Equal(t, int64(2), ev2[0].EID) +} + +func TestStore_MaxEventsEvictsOldest(t *testing.T) { + s := NewStore(Config{MaxEvents: 3, StrictMonotonic: true}) + for i := int64(1); i <= 5; i++ { + require.NoError(t, s.Append("s", i, []byte{byte(i)})) + } + ev, ok := s.EventsAfter("s", 0) + require.True(t, ok) + require.Len(t, ev, 3) + require.Equal(t, int64(3), ev[0].EID) + require.Equal(t, byte(3), ev[0].Payload[0]) +} + +func TestStore_MaxBytesEvicts(t *testing.T) { + s := NewStore(Config{MaxEvents: 100, MaxBytes: 10, StrictMonotonic: true}) + require.NoError(t, s.Append("s", 1, []byte("1234567890"))) + require.NoError(t, s.Append("s", 2, []byte("1234567890"))) + ev, ok := s.EventsAfter("s", 0) + require.True(t, ok) + require.Len(t, ev, 1) + require.Equal(t, int64(2), ev[0].EID) +} + +func TestStore_Delete(t *testing.T) { + s := NewStore(Config{MaxEvents: 8, StrictMonotonic: true}) + require.NoError(t, s.Append("x", 1, []byte("a"))) + require.True(t, s.Has("x")) + s.Delete("x") + require.False(t, s.Has("x")) +} + +func TestStore_EmptyStreamID(t *testing.T) { + s := NewStore(Config{}) + require.ErrorIs(t, s.Append("", 1, nil), ErrEmptyStreamID) +} diff --git a/components/execd/pkg/web/controller/codeinterpreting.go b/components/execd/pkg/web/controller/codeinterpreting.go index e2aa65393..1e9f28cdd 100644 --- a/components/execd/pkg/web/controller/codeinterpreting.go +++ b/components/execd/pkg/web/controller/codeinterpreting.go @@ -42,6 +42,11 @@ type CodeInterpretingController struct { // chunkWriter serializes SSE event writes to prevent interleaved output. chunkWriter sync.Mutex + + resumeStreamMu sync.Mutex + resumeStreamID string + // resumeEnabled opts into disconnect resume (event buffer + live hub) for RunCommand / RunCode. + resumeEnabled bool } func NewCodeInterpretingController(ctx *gin.Context) *CodeInterpretingController { @@ -111,6 +116,11 @@ func (c *CodeInterpretingController) RunCode() { ctx, cancel := context.WithCancel(c.ctx.Request.Context()) defer cancel() + c.resumeEnabled = true + defer func() { + deferResumeCleanup(c) + c.resumeEnabled = false + }() runCodeRequest := c.buildExecuteCodeRequest(request) eventsHandler := c.setServerEventsHandler(ctx) runCodeRequest.Hooks = eventsHandler @@ -305,6 +315,7 @@ func (c *CodeInterpretingController) RunInSession() { } ctx, cancel := context.WithCancel(c.ctx.Request.Context()) defer cancel() + runReq.Hooks = c.setServerEventsHandler(ctx) c.setupSSEResponse() @@ -321,25 +332,6 @@ func (c *CodeInterpretingController) RunInSession() { time.Sleep(flag.ApiGracefulShutdownTimeout) } -func (c *CodeInterpretingController) ResumeSessionStream() { - sessionID := c.ctx.Param("sessionId") - if sessionID == "" { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeMissingQuery, - "missing path parameter 'sessionId'", - ) - return - } - _ = c.QueryInt64(c.ctx.Query(model.SessionResumeAfterEidQuery), 0) - - c.RespondError( - http.StatusNotImplemented, - model.ErrorCodeNotImplemented, - "session stream resume is not implemented yet", - ) -} - // DeleteSession deletes a bash session (delete_session API). func (c *CodeInterpretingController) DeleteSession() { sessionID := c.ctx.Param("sessionId") diff --git a/components/execd/pkg/web/controller/command.go b/components/execd/pkg/web/controller/command.go index a8552004a..89eefe217 100644 --- a/components/execd/pkg/web/controller/command.go +++ b/components/execd/pkg/web/controller/command.go @@ -16,12 +16,14 @@ package controller import ( "context" + "errors" "fmt" "net/http" "strconv" "time" "github.com/alibaba/opensandbox/execd/pkg/flag" + "github.com/alibaba/opensandbox/execd/pkg/log" "github.com/alibaba/opensandbox/execd/pkg/runtime" "github.com/alibaba/opensandbox/execd/pkg/web/model" ) @@ -50,6 +52,11 @@ func (c *CodeInterpretingController) RunCommand() { ctx, cancel := context.WithCancel(c.ctx.Request.Context()) defer cancel() + c.resumeEnabled = true + defer func() { + deferResumeCleanup(c) + c.resumeEnabled = false + }() runCodeRequest := c.buildExecuteCommandRequest(request) eventsHandler := c.setServerEventsHandler(ctx) @@ -125,19 +132,65 @@ func (c *CodeInterpretingController) GetBackgroundCommandOutput() { c.ctx.String(http.StatusOK, "%s", output) } +// ResumeCommandStream sends buffered events after after_eid, then if the command is still running +// and no other client holds the live slot, streams further events until completion or client disconnect. func (c *CodeInterpretingController) ResumeCommandStream() { commandID := c.ctx.Param("id") if commandID == "" { c.RespondError(http.StatusBadRequest, model.ErrorCodeInvalidRequest, "missing command execution id") return } - _ = c.QueryInt64(c.ctx.Query(model.CommandResumeAfterEidQuery), 0) + afterEid := c.QueryInt64(c.ctx.Query(model.CommandResumeAfterEidQuery), 0) - c.RespondError( - http.StatusNotImplemented, - model.ErrorCodeNotImplemented, - "command stream resume is not implemented yet", - ) + hub := commandStreams.getHub(commandID) + st, errSt := codeRunner.GetCommandStatus(commandID) + if errSt != nil && hub == nil { + c.RespondError(http.StatusNotFound, model.ErrorCodeInvalidRequest, errSt.Error()) + return + } + + events, bufferOK := resumeBuffer.EventsAfter(commandID, afterEid) + if !bufferOK && hub == nil { + c.RespondError(http.StatusNotFound, model.ErrorCodeInvalidRequest, "command stream resume buffer not available") + return + } + + if st != nil && st.Running && hub != nil && hub.isHolderAlive() { + c.RespondError( + http.StatusConflict, + model.ErrorCodeInvalidRequest, + "primary SSE stream is still active; disconnect it before resuming", + ) + return + } + + c.setupSSEResponse() + for _, ev := range events { + c.writeSingleEvent("ResumeBuffer", ev.Payload, false, fmt.Sprintf("buffer eid=%d", ev.EID), 0) + } + + st2, _ := codeRunner.GetCommandStatus(commandID) + if st2 == nil || !st2.Running { + return + } + + hub = commandStreams.getHub(commandID) + if hub == nil { + return + } + + h, err := commandStreams.tryAttachResume(commandID, c.ctx.Writer, c.ctx.Request.Context()) + if err != nil { + if errors.Is(err, errLiveStreamPrimaryActive) { + log.Error("ResumeCommandStream: attach conflict after buffered history (another client may have attached)") + } + return + } + + select { + case <-h.waitDone(): + case <-c.ctx.Request.Context().Done(): + } } func (c *CodeInterpretingController) buildExecuteCommandRequest(request model.RunCommandRequest) *runtime.ExecuteCodeRequest { diff --git a/components/execd/pkg/web/controller/command_stream.go b/components/execd/pkg/web/controller/command_stream.go new file mode 100644 index 000000000..bf04ff5d3 --- /dev/null +++ b/components/execd/pkg/web/controller/command_stream.go @@ -0,0 +1,190 @@ +// Copyright 2025 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Bounded event buffer (sbuf) plus at-most-one live SSE writer per command id for disconnect resume; +// GET /command/:id/resume sends buffered events then may take over as the sole live consumer. + +package controller + +import ( + "bytes" + "context" + "errors" + "io" + "net/http" + "sync" + + "github.com/alibaba/opensandbox/execd/pkg/log" + "github.com/alibaba/opensandbox/execd/pkg/sbuf" +) + +var ( + resumeBuffer *sbuf.Store + + errLiveStreamPrimaryActive = errors.New("primary SSE stream is still active") + errLiveHubNotFound = errors.New("command live hub not found") +) + +func init() { + resumeBuffer = sbuf.NewStore(sbuf.Config{StrictMonotonic: true}) +} + +func deferResumeCleanup(c *CodeInterpretingController) { + c.resumeStreamMu.Lock() + id := c.resumeStreamID + c.resumeStreamID = "" + c.resumeStreamMu.Unlock() + if id == "" { + return + } + commandStreams.closeAndRemove(id) + resumeBuffer.Delete(id) +} + +// --- live SSE routing (mutually exclusive main vs resume) --- + +type streamRegistry struct { + mu sync.Mutex + m map[string]*streamHub +} + +var commandStreams = &streamRegistry{m: make(map[string]*streamHub)} + +type streamHub struct { + streamID string + mu sync.Mutex + holder *streamHolder + done chan struct{} +} + +type streamHolder struct { + writer http.ResponseWriter + ctx context.Context +} + +func (r *streamRegistry) registerPrimary(id string, w http.ResponseWriter, ctx context.Context) { + r.mu.Lock() + h := &streamHub{ + streamID: id, + done: make(chan struct{}), + } + h.holder = &streamHolder{writer: w, ctx: ctx} + r.m[id] = h + r.mu.Unlock() + + r.watchHolderRelease(id, h, ctx) +} + +func (r *streamRegistry) watchHolderRelease(id string, h *streamHub, ctx context.Context) { + go func() { + <-ctx.Done() + r.mu.Lock() + if cur, ok := r.m[id]; ok && cur == h && h.holder != nil && h.holder.ctx == ctx { + h.holder = nil + } + r.mu.Unlock() + }() +} + +func (r *streamRegistry) getHub(id string) *streamHub { + r.mu.Lock() + defer r.mu.Unlock() + return r.m[id] +} + +func (r *streamRegistry) closeAndRemove(id string) { + r.mu.Lock() + h := r.m[id] + delete(r.m, id) + r.mu.Unlock() + if h != nil { + h.closeDone() + } +} + +func (h *streamHub) closeDone() { + h.mu.Lock() + defer h.mu.Unlock() + select { + case <-h.done: + default: + close(h.done) + } +} + +func (h *streamHub) waitDone() <-chan struct{} { + return h.done +} + +func (h *streamHub) isHolderAlive() bool { + if h == nil { + return false + } + h.mu.Lock() + defer h.mu.Unlock() + return h.holder != nil && h.holder.ctx.Err() == nil +} + +func (r *streamRegistry) tryAttachResume(id string, w http.ResponseWriter, ctx context.Context) (*streamHub, error) { + r.mu.Lock() + h := r.m[id] + if h == nil { + r.mu.Unlock() + return nil, errLiveHubNotFound + } + if h.holder != nil && h.holder.ctx.Err() == nil { + r.mu.Unlock() + return nil, errLiveStreamPrimaryActive + } + h.holder = &streamHolder{writer: w, ctx: ctx} + r.mu.Unlock() + + r.watchHolderRelease(id, h, ctx) + return h, nil +} + +func (r *streamRegistry) writeSSE(id string, data []byte, bufEid int64, handler, summary string) { + r.mu.Lock() + h := r.m[id] + r.mu.Unlock() + if h == nil { + if bufEid > 0 { + _ = resumeBuffer.Append(id, bufEid, bytes.Clone(data)) + } + return + } + h.writeFrame(data, bufEid, handler, summary) +} + +func (h *streamHub) writeFrame(data []byte, bufEid int64, handler, summary string) { + h.mu.Lock() + defer h.mu.Unlock() + + payload := append(data, '\n', '\n') + if h.holder != nil { + n, err := h.holder.writer.Write(payload) + if err == nil && n != len(payload) { + err = io.ErrShortWrite + } + if err != nil { + log.Error("StreamEvent.%s write data %s error: %v", handler, summary, err) + } else if flusher, ok := h.holder.writer.(http.Flusher); ok { + flusher.Flush() + } + } + + if bufEid > 0 { + _ = resumeBuffer.Append(h.streamID, bufEid, bytes.Clone(data)) + } +} diff --git a/components/execd/pkg/web/controller/sse.go b/components/execd/pkg/web/controller/sse.go index 1f00bcdf1..1fe789f12 100644 --- a/components/execd/pkg/web/controller/sse.go +++ b/components/execd/pkg/web/controller/sse.go @@ -49,13 +49,20 @@ func (c *basicController) setupSSEResponse() { func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) runtime.ExecuteResultHook { return runtime.ExecuteResultHook{ OnExecuteInit: func(session string) { + if c.resumeEnabled { + c.resumeStreamMu.Lock() + c.resumeStreamID = session + c.resumeStreamMu.Unlock() + commandStreams.registerPrimary(session, c.ctx.Writer, c.ctx.Request.Context()) + } + event := model.ServerStreamEvent{ Type: model.StreamEventTypeInit, Text: session, Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteInit", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteInit", payload, true, event.Summary(), 0) safego.Go(func() { c.ping(ctx) }) }, @@ -80,7 +87,7 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteResult", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteResult", payload, true, event.Summary(), 0) } if len(mutated) > 0 { event := model.ServerStreamEvent{ @@ -89,7 +96,7 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteResult", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteResult", payload, true, event.Summary(), 0) } }, OnExecuteComplete: func(executionTime time.Duration) { @@ -99,7 +106,7 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteComplete", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteComplete", payload, true, event.Summary(), 0) }, OnExecuteError: func(err *execute.ErrorOutput) { if err == nil { @@ -112,7 +119,7 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteError", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteError", payload, true, event.Summary(), 0) }, OnExecuteStatus: func(status string) { event := model.ServerStreamEvent{ @@ -121,7 +128,7 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteStatus", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteStatus", payload, true, event.Summary(), 0) }, OnExecuteStdout: func(eid int64, text string) { if text == "" { @@ -135,7 +142,7 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteStdout", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteStdout", payload, true, event.Summary(), eid) }, OnExecuteStderr: func(eid int64, text string) { if text == "" { @@ -149,17 +156,32 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteStderr", payload, true, event.Summary()) + c.writeSingleEvent("OnExecuteStderr", payload, true, event.Summary(), eid) }, } } -// writeSingleEvent serializes one SSE frame. -func (c *CodeInterpretingController) writeSingleEvent(handler string, data []byte, verbose bool, summary string) { +// writeSingleEvent serializes one SSE frame. When resumeStreamID is set, writes go through commandStreams (live hub + buffer). +// bufEid is stdout/stderr event id for the resume buffer; 0 skips Append (control events, resume catch-up frames). +func (c *CodeInterpretingController) writeSingleEvent(handler string, data []byte, verbose bool, summary string, bufEid int64) { if c == nil || c.ctx == nil || c.ctx.Writer == nil { return } + var streamID string + if c.resumeEnabled { + c.resumeStreamMu.Lock() + streamID = c.resumeStreamID + c.resumeStreamMu.Unlock() + } + if streamID != "" { + commandStreams.writeSSE(streamID, data, bufEid, handler, summary) + if verbose { + log.Info("StreamEvent.%s write data %s", handler, summary) + } + return + } + select { case <-c.ctx.Request.Context().Done(): log.Error("StreamEvent.%s: client disconnected", handler) @@ -169,11 +191,6 @@ func (c *CodeInterpretingController) writeSingleEvent(handler string, data []byt c.chunkWriter.Lock() defer c.chunkWriter.Unlock() - defer func() { - if flusher, ok := c.ctx.Writer.(http.Flusher); ok { - flusher.Flush() - } - }() payload := append(data, '\n', '\n') n, err := c.ctx.Writer.Write(payload) @@ -183,10 +200,15 @@ func (c *CodeInterpretingController) writeSingleEvent(handler string, data []byt if err != nil { log.Error("StreamEvent.%s write data %s error: %v", handler, summary, err) - } else { - if verbose { - log.Info("StreamEvent.%s write data %s", handler, summary) - } + return + } + + if flusher, ok := c.ctx.Writer.(http.Flusher); ok { + flusher.Flush() + } + + if verbose { + log.Info("StreamEvent.%s write data %s", handler, summary) } } @@ -202,6 +224,6 @@ func (c *CodeInterpretingController) ping(ctx context.Context) { Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("Ping", payload, false, event.Summary()) + c.writeSingleEvent("Ping", payload, false, event.Summary(), 0) }, 3*time.Second, ctx.Done()) } diff --git a/components/execd/pkg/web/model/command.go b/components/execd/pkg/web/model/command.go index b4d855c71..8876eaa06 100644 --- a/components/execd/pkg/web/model/command.go +++ b/components/execd/pkg/web/model/command.go @@ -22,7 +22,6 @@ import ( ) const CommandResumeAfterEidQuery = "after_eid" -const SessionResumeAfterEidQuery = CommandResumeAfterEidQuery // RunCommandRequest represents a shell command execution request. type RunCommandRequest struct { diff --git a/components/execd/pkg/web/router.go b/components/execd/pkg/web/router.go index 1f317459b..e049c0425 100644 --- a/components/execd/pkg/web/router.go +++ b/components/execd/pkg/web/router.go @@ -66,7 +66,6 @@ func NewRouter(accessToken string) *gin.Engine { { session.POST("", withCode(func(c *controller.CodeInterpretingController) { c.CreateSession() })) session.POST("/:sessionId/run", withCode(func(c *controller.CodeInterpretingController) { c.RunInSession() })) - session.GET("/:sessionId/resume", withCode(func(c *controller.CodeInterpretingController) { c.ResumeSessionStream() })) session.DELETE("/:sessionId", withCode(func(c *controller.CodeInterpretingController) { c.DeleteSession() })) } diff --git a/components/execd/tests/command_resume_test.py b/components/execd/tests/command_resume_test.py new file mode 100644 index 000000000..7772270fe --- /dev/null +++ b/components/execd/tests/command_resume_test.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +# Copyright 2025 Alibaba Group Holding Ltd. +# +# Manual local test: POST /command (streaming output) -> disconnect -> GET /resume (catch-up + live tail), +# repeat at least 3 disconnect/resume rounds, then read until execution_complete on the last connection. +# +# Configure EXECD_URL (and optional EXECD_TOKEN). Examples: +# EXECD_URL=localhost:44772 +# EXECD_URL=https://remote.example +# python3 components/execd/tests/command_resume_test.py + +from __future__ import annotations + +import http.client +import json +import os +import ssl +import sys +import urllib.parse +from typing import Any + +API_ACCESS_TOKEN_HEADER = "X-EXECD-ACCESS-TOKEN" + + +class RunCollector: + """Aggregates stdout and execution_complete across connections for final assertions.""" + + def __init__(self) -> None: + self.stdout_by_eid: dict[int, str] = {} + self.primary_stdout_lines = 0 + self.resume_stdout_lines = 0 + self.saw_complete = False + + def record(self, tag: str, ev: dict[str, Any]) -> None: + t = ev.get("type") + if t == "execution_complete": + self.saw_complete = True + return + if t != "stdout": + return + eid = int(ev.get("eid") or 0) + txt = (ev.get("text") or "").strip() + if eid in self.stdout_by_eid: + assert self.stdout_by_eid[eid] == txt, ( + f"duplicate eid {eid} with different text: {self.stdout_by_eid[eid]!r} vs {txt!r}" + ) + else: + self.stdout_by_eid[eid] = txt + if tag == "primary": + self.primary_stdout_lines += 1 + elif tag.startswith("resume"): + self.resume_stdout_lines += 1 + + def assert_ok(self) -> None: + assert self.saw_complete, "expected execution_complete" + assert self.resume_stdout_lines > 0, ( + "resume delivered no stdout lines; disconnect resume may not be working (check 409, STDOUT_PER_CHOP)" + ) + assert len(self.stdout_by_eid) == OUTPUT_LINES, ( + f"expected {OUTPUT_LINES} stdout lines, got distinct eid count={len(self.stdout_by_eid)}" + ) + for n in range(1, OUTPUT_LINES + 1): + assert n in self.stdout_by_eid, f"missing eid={n}" + assert self.stdout_by_eid[n] == f"tick{n}", ( + f"eid={n} text should be tick{n}, got {self.stdout_by_eid[n]!r}" + ) + assert self.primary_stdout_lines >= 1, "primary connection should receive at least one stdout line" + assert self.primary_stdout_lines + self.resume_stdout_lines == OUTPUT_LINES, ( + "primary + resume stdout line counts should equal total stdout lines (each line counted once): " + f"primary={self.primary_stdout_lines} resume={self.resume_stdout_lines} expected={OUTPUT_LINES}" + ) + print( + "ASSERT ok: execution_complete + resume delivered output + tick1..tick" + + str(OUTPUT_LINES) + + " with eid 1.." + + str(OUTPUT_LINES) + + " complete", + flush=True, + ) + +# Execd base URL (host:port is ok; http:// is prepended if missing). +EXECD_URL = os.environ.get("EXECD_URL", "http://127.0.0.1:44772") +if "://" not in EXECD_URL: + EXECD_URL = "http://" + EXECD_URL + +TOKEN = os.environ.get("EXECD_TOKEN", "") + +# Close each connection after this many stdout lines (three disconnect/resume rounds before the final read). +STDOUT_PER_CHOP = 15 + +# One primary disconnect plus (RESUME_CHOPS - 1) partial resume disconnects; last resume reads until complete. +RESUME_CHOPS = 3 + +# Bounded output: sleep 0.1s between lines, OUTPUT_LINES total; wall time ~ OUTPUT_LINES * 0.1s. +OUTPUT_LINES = 200 + +TIMEOUT_MS = 300_000 + +COMMAND = ( + "sh -c 'n=0; while [ \"$n\" -lt " + + str(OUTPUT_LINES) + + " ]; do n=$((n+1)); echo tick$n; sleep 0.1; done'" +) + + +def parse_frames(buf: bytes) -> tuple[list[dict[str, Any]], bytes]: + out: list[dict[str, Any]] = [] + while True: + i = buf.find(b"\n\n") + if i < 0: + return out, buf + raw = buf[:i].strip() + buf = buf[i + 2 :] + if not raw: + continue + try: + out.append(json.loads(raw.decode("utf-8"))) + except (json.JSONDecodeError, UnicodeDecodeError): + pass + + +def connect(scheme: str, host: str, port: int) -> http.client.HTTPConnection: + if scheme == "https": + return http.client.HTTPSConnection( + host, port, timeout=600, context=ssl.create_default_context() + ) + return http.client.HTTPConnection(host, port, timeout=600) + + +def parse_url(base: str) -> tuple[str, str, int, str]: + u = urllib.parse.urlparse(base.rstrip("/")) + scheme = (u.scheme or "http").lower() + host = u.hostname or "127.0.0.1" + port = u.port or (443 if scheme == "https" else 80) + return scheme, host, port, u.path or "" + + +def path_join(prefix: str, p: str) -> str: + if not prefix: + return p if p.startswith("/") else "/" + p + return prefix.rstrip("/") + (p if p.startswith("/") else "/" + p) + + +def headers() -> dict[str, str]: + h: dict[str, str] = { + "Content-Type": "application/json", + "Accept": "text/event-stream", + } + if TOKEN: + h[API_ACCESS_TOKEN_HEADER] = TOKEN + return h + + +def pump( + resp: http.client.HTTPResponse, + tag: str, + max_eid: int, + *, + stop_after_stdout: int | None, + collector: RunCollector | None = None, +) -> tuple[int, bool, str | None]: + """Read SSE; update max_eid. If stop_after_stdout is a number, stop after that many stdout lines; if None, read until execution_complete.""" + buf = b"" + cmd_id: str | None = None + stdout_n = 0 + complete = False + while True: + chunk = resp.read(8192) + if not chunk: + break + buf += chunk + frames, buf = parse_frames(buf) + for ev in frames: + if collector is not None: + collector.record(tag, ev) + t = ev.get("type") + if t == "init": + cmd_id = ev.get("text") + print(f"[{tag}] init id={cmd_id}") + elif t in ("stdout", "stderr"): + eid = int(ev.get("eid") or 0) + max_eid = max(max_eid, eid) + txt = ev.get("text", "") + print(f"[{tag}] {t} eid={eid} {txt!r}") + if t == "stdout": + stdout_n += 1 + elif t == "execution_complete": + complete = True + print(f"[{tag}] execution_complete ms={ev.get('execution_time')}") + elif t != "ping": + print(f"[{tag}] {t}") + + if complete: + return max_eid, True, cmd_id + if stop_after_stdout is not None and stdout_n >= stop_after_stdout: + return max_eid, False, cmd_id + return max_eid, complete, cmd_id + + +def main() -> int: + scheme, host, port, prefix = parse_url(EXECD_URL) + h = headers() + cmd_path = path_join(prefix, "/command") + resume_tmpl = path_join(prefix, "/command/{id}/resume") + collector = RunCollector() + + body = json.dumps({"command": COMMAND, "timeout": TIMEOUT_MS}) + conn = connect(scheme, host, port) + conn.request("POST", cmd_path, body.encode("utf-8"), h) + r = conn.getresponse() + if r.status != 200: + print(f"POST /command HTTP {r.status}", r.read().decode("utf-8", "replace"), file=sys.stderr) + conn.close() + return 1 + + max_eid = 0 + max_eid, done, cid = pump( + r, + "primary", + max_eid, + stop_after_stdout=STDOUT_PER_CHOP, + collector=collector, + ) + conn.close() + if not cid: + print("no init", file=sys.stderr) + return 1 + if done: + print("command finished on primary connection (unexpected)", file=sys.stderr) + return 0 + + for round_i in range(RESUME_CHOPS): + path = resume_tmpl.format(id=cid) + f"?after_eid={max_eid}" + tag = f"resume{round_i + 1}" + c2 = connect(scheme, host, port) + c2.request("GET", path, headers=h) + r2 = c2.getresponse() + if r2.status == 409: + print( + f"{tag} HTTP 409: primary SSE still active; retry later or increase STDOUT_PER_CHOP", + file=sys.stderr, + ) + print(r2.read().decode("utf-8", "replace"), file=sys.stderr) + c2.close() + return 1 + if r2.status != 200: + print(f"{tag} HTTP {r2.status}", r2.read().decode("utf-8", "replace"), file=sys.stderr) + c2.close() + return 1 + + last = round_i == RESUME_CHOPS - 1 + max_eid, done, _ = pump( + r2, + tag, + max_eid, + stop_after_stdout=None if last else STDOUT_PER_CHOP, + collector=collector, + ) + c2.close() + if done: + try: + collector.assert_ok() + except AssertionError as e: + print(f"ASSERT failed: {e}", file=sys.stderr) + return 1 + print("done.") + return 0 + + print("done (unexpected: should have completed in last resume)", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From 441e3b584896744185d2e54fcd9c96c3573cba4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Fri, 27 Mar 2026 13:41:33 +0800 Subject: [PATCH 05/20] feat(execd): extract stream buffer (sbuf) package for ring buffer queue --- .github/workflows/execd-test.yml | 1 + components/execd/tests/command_resume_test.py | 2 +- specs/execd-api.yaml | 60 +++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/.github/workflows/execd-test.yml b/.github/workflows/execd-test.yml index 16d9b8c62..5357883cb 100644 --- a/.github/workflows/execd-test.yml +++ b/.github/workflows/execd-test.yml @@ -101,6 +101,7 @@ jobs: sleep 5 python3 tests/smoke_api.py + python3 tests/command_resume_test.py - name: Show logs if: always() run: | diff --git a/components/execd/tests/command_resume_test.py b/components/execd/tests/command_resume_test.py index 7772270fe..4bbc2d74a 100644 --- a/components/execd/tests/command_resume_test.py +++ b/components/execd/tests/command_resume_test.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright 2025 Alibaba Group Holding Ltd. +# Copyright 2026 Alibaba Group Holding Ltd. # # Manual local test: POST /command (streaming output) -> disconnect -> GET /resume (catch-up + live tail), # repeat at least 3 disconnect/resume rounds, then read until execution_complete on the last connection. diff --git a/specs/execd-api.yaml b/specs/execd-api.yaml index 2a46a484b..229b8a7d2 100644 --- a/specs/execd-api.yaml +++ b/specs/execd-api.yaml @@ -502,6 +502,56 @@ paths: "500": $ref: "#/components/responses/InternalServerError" + /command/{id}/resume: + get: + summary: Resume command SSE stream (replay and optional live tail) + description: | + Replays stdout/stderr events from the server-side ring buffer for events with + `eid` strictly greater than `after_eid`, then—if the command is still running and + no other client holds the primary SSE slot—continues streaming live events until + completion or client disconnect. Event shape matches `POST /command` (`ServerStreamEvent`). + + This endpoint is mutually exclusive with the primary `POST /command` SSE: if that + connection is still active, the server responds with 409 Conflict. + operationId: resumeCommandStream + tags: + - Command + parameters: + - name: id + in: path + required: true + description: Command ID returned by RunCommand + schema: + type: string + example: cmd-abc123 + - name: after_eid + in: query + required: false + description: | + Only events with `eid` greater than this value are replayed from the buffer first + (then optional live tail). Omit or use `0` to replay from the oldest buffered events. + schema: + type: integer + format: int64 + minimum: 0 + default: 0 + example: 42 + responses: + "200": + description: Stream of command execution events (replay then optional live continuation) + content: + text/event-stream: + schema: + $ref: "#/components/schemas/ServerStreamEvent" + "400": + $ref: "#/components/responses/BadRequest" + "404": + $ref: "#/components/responses/NotFound" + "409": + $ref: "#/components/responses/Conflict" + "500": + $ref: "#/components/responses/InternalServerError" + /command/{id}/logs: get: summary: Get background command stdout/stderr (non-streamed) @@ -1396,6 +1446,16 @@ components: code: FILE_NOT_FOUND message: "file not found" + Conflict: + description: Request conflicts with current server state (e.g. resource in use) + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + example: + code: INVALID_REQUEST_BODY + message: "primary SSE stream is still active; disconnect it before resuming" + InternalServerError: description: Runtime server error during operation content: From 49e349484d39b4d97fd6fb8312875d4423489c23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sat, 28 Mar 2026 15:48:51 +0800 Subject: [PATCH 06/20] fix(execd): flush resume buffer tail after SSE attach --- .../execd/pkg/web/controller/command.go | 7 +++ .../pkg/web/controller/command_stream.go | 33 ++++++++++++- .../pkg/web/controller/command_stream_test.go | 49 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 components/execd/pkg/web/controller/command_stream_test.go diff --git a/components/execd/pkg/web/controller/command.go b/components/execd/pkg/web/controller/command.go index 89eefe217..ec6835f1d 100644 --- a/components/execd/pkg/web/controller/command.go +++ b/components/execd/pkg/web/controller/command.go @@ -165,8 +165,12 @@ func (c *CodeInterpretingController) ResumeCommandStream() { } c.setupSSEResponse() + lastReplayMaxEid := afterEid for _, ev := range events { c.writeSingleEvent("ResumeBuffer", ev.Payload, false, fmt.Sprintf("buffer eid=%d", ev.EID), 0) + if ev.EID > lastReplayMaxEid { + lastReplayMaxEid = ev.EID + } } st2, _ := codeRunner.GetCommandStatus(commandID) @@ -187,6 +191,9 @@ func (c *CodeInterpretingController) ResumeCommandStream() { return } + // Catch up events appended while the snapshot slice was replayed (holder still nil); same mutex as writeFrame. + h.flushResumeTail(commandID, lastReplayMaxEid) + select { case <-h.waitDone(): case <-c.ctx.Request.Context().Done(): diff --git a/components/execd/pkg/web/controller/command_stream.go b/components/execd/pkg/web/controller/command_stream.go index bf04ff5d3..648722ea8 100644 --- a/components/execd/pkg/web/controller/command_stream.go +++ b/components/execd/pkg/web/controller/command_stream.go @@ -1,4 +1,4 @@ -// Copyright 2025 Alibaba Group Holding Ltd. +// Copyright 2026 Alibaba Group Holding Ltd. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -167,6 +167,37 @@ func (r *streamRegistry) writeSSE(id string, data []byte, bufEid int64, handler, h.writeFrame(data, bufEid, handler, summary) } +// flushResumeTail writes all buffered events with EID > afterEid to the current holder while holding h.mu. +// Live writeFrame calls block on the same mutex, so chunks appended only to the ring during the initial +// snapshot replay cannot be missed on this connection (see ResumeCommandStream). +func (h *streamHub) flushResumeTail(commandID string, afterEid int64) { + if h == nil || h.holder == nil { + return + } + h.mu.Lock() + defer h.mu.Unlock() + + tail, ok := resumeBuffer.EventsAfter(commandID, afterEid) + if !ok || len(tail) == 0 { + return + } + writer := h.holder.writer + for _, ev := range tail { + payload := append(append([]byte(nil), ev.Payload...), '\n', '\n') + n, err := writer.Write(payload) + if err == nil && n != len(payload) { + err = io.ErrShortWrite + } + if err != nil { + log.Error("flushResumeTail: write eid=%d: %v", ev.EID, err) + return + } + if flusher, ok := writer.(http.Flusher); ok { + flusher.Flush() + } + } +} + func (h *streamHub) writeFrame(data []byte, bufEid int64, handler, summary string) { h.mu.Lock() defer h.mu.Unlock() diff --git a/components/execd/pkg/web/controller/command_stream_test.go b/components/execd/pkg/web/controller/command_stream_test.go new file mode 100644 index 000000000..72d913388 --- /dev/null +++ b/components/execd/pkg/web/controller/command_stream_test.go @@ -0,0 +1,49 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controller + +import ( + "context" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestFlushResumeTail_NilHubNoPanic(t *testing.T) { + var h *streamHub + h.flushResumeTail("any", 0) +} + +func TestFlushResumeTail_WritesBufferedEvents(t *testing.T) { + cmdID := "flush-resume-test-cmd" + payload := []byte(`{"type":"stdout","eid":1}`) + require.NoError(t, resumeBuffer.Append(cmdID, 1, payload)) + + w := httptest.NewRecorder() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + h := &streamHub{ + streamID: cmdID, + done: make(chan struct{}), + holder: &streamHolder{writer: w, ctx: ctx}, + } + h.flushResumeTail(cmdID, 0) + + body := w.Body.String() + require.Contains(t, body, "stdout") + require.Contains(t, body, `"eid":1`) +} From 7ac7afdcda982d7940cc3b2872fcb365e17dd095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sat, 28 Mar 2026 16:24:02 +0800 Subject: [PATCH 07/20] fix(execd): serialize streamHub holder access under h.mu --- .../execd/pkg/web/controller/command.go | 8 +++- .../pkg/web/controller/command_stream.go | 44 +++++++++++++------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/components/execd/pkg/web/controller/command.go b/components/execd/pkg/web/controller/command.go index ec6835f1d..8c02b3e3e 100644 --- a/components/execd/pkg/web/controller/command.go +++ b/components/execd/pkg/web/controller/command.go @@ -175,6 +175,10 @@ func (c *CodeInterpretingController) ResumeCommandStream() { st2, _ := codeRunner.GetCommandStatus(commandID) if st2 == nil || !st2.Running { + if len(events) > 0 { + log.Info("resume stream: command_id=%s after_eid=%d snapshot_events=%d (replay only)", + commandID, afterEid, len(events)) + } return } @@ -192,7 +196,9 @@ func (c *CodeInterpretingController) ResumeCommandStream() { } // Catch up events appended while the snapshot slice was replayed (holder still nil); same mutex as writeFrame. - h.flushResumeTail(commandID, lastReplayMaxEid) + tailN := h.flushResumeTail(commandID, lastReplayMaxEid) + log.Info("resume stream: command_id=%s after_eid=%d snapshot_events=%d post_attach_tail=%d (live)", + commandID, afterEid, len(events), tailN) select { case <-h.waitDone(): diff --git a/components/execd/pkg/web/controller/command_stream.go b/components/execd/pkg/web/controller/command_stream.go index 648722ea8..6ec07d7d0 100644 --- a/components/execd/pkg/web/controller/command_stream.go +++ b/components/execd/pkg/web/controller/command_stream.go @@ -50,6 +50,7 @@ func deferResumeCleanup(c *CodeInterpretingController) { } commandStreams.closeAndRemove(id) resumeBuffer.Delete(id) + log.Info("command stream: hub and resume buffer cleaned up id=%s", id) } // --- live SSE routing (mutually exclusive main vs resume) --- @@ -79,21 +80,26 @@ func (r *streamRegistry) registerPrimary(id string, w http.ResponseWriter, ctx c streamID: id, done: make(chan struct{}), } - h.holder = &streamHolder{writer: w, ctx: ctx} r.m[id] = h + h.mu.Lock() + h.holder = &streamHolder{writer: w, ctx: ctx} + h.mu.Unlock() r.mu.Unlock() - r.watchHolderRelease(id, h, ctx) + log.Info("command stream: primary hub registered id=%s", id) + watchHolderRelease(h, ctx) } -func (r *streamRegistry) watchHolderRelease(id string, h *streamHub, ctx context.Context) { +// watchHolderRelease clears h.holder when ctx is cancelled. All holder mutations use h.mu only +// (see tryAttachResume, registerPrimary, writeFrame) so r.mu and h.mu are not split across h.holder. +func watchHolderRelease(h *streamHub, ctx context.Context) { go func() { <-ctx.Done() - r.mu.Lock() - if cur, ok := r.m[id]; ok && cur == h && h.holder != nil && h.holder.ctx == ctx { + h.mu.Lock() + defer h.mu.Unlock() + if h.holder != nil && h.holder.ctx == ctx { h.holder = nil } - r.mu.Unlock() }() } @@ -143,14 +149,17 @@ func (r *streamRegistry) tryAttachResume(id string, w http.ResponseWriter, ctx c r.mu.Unlock() return nil, errLiveHubNotFound } + h.mu.Lock() if h.holder != nil && h.holder.ctx.Err() == nil { + h.mu.Unlock() r.mu.Unlock() return nil, errLiveStreamPrimaryActive } h.holder = &streamHolder{writer: w, ctx: ctx} + h.mu.Unlock() r.mu.Unlock() - r.watchHolderRelease(id, h, ctx) + watchHolderRelease(h, ctx) return h, nil } @@ -170,32 +179,39 @@ func (r *streamRegistry) writeSSE(id string, data []byte, bufEid int64, handler, // flushResumeTail writes all buffered events with EID > afterEid to the current holder while holding h.mu. // Live writeFrame calls block on the same mutex, so chunks appended only to the ring during the initial // snapshot replay cannot be missed on this connection (see ResumeCommandStream). -func (h *streamHub) flushResumeTail(commandID string, afterEid int64) { - if h == nil || h.holder == nil { - return +// Returns how many extra events were written after the initial snapshot replay. +func (h *streamHub) flushResumeTail(commandID string, afterEid int64) int { + if h == nil { + return 0 } h.mu.Lock() defer h.mu.Unlock() + if h.holder == nil { + return 0 + } tail, ok := resumeBuffer.EventsAfter(commandID, afterEid) if !ok || len(tail) == 0 { - return + return 0 } writer := h.holder.writer + written := 0 for _, ev := range tail { payload := append(append([]byte(nil), ev.Payload...), '\n', '\n') - n, err := writer.Write(payload) - if err == nil && n != len(payload) { + nw, err := writer.Write(payload) + if err == nil && nw != len(payload) { err = io.ErrShortWrite } if err != nil { log.Error("flushResumeTail: write eid=%d: %v", ev.EID, err) - return + return written } if flusher, ok := writer.(http.Flusher); ok { flusher.Flush() } + written++ } + return written } func (h *streamHub) writeFrame(data []byte, bufEid int64, handler, summary string) { From a13bf02d3e9b30d223f8a0e4761a8caf72a9b6a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 30 Mar 2026 09:52:26 +0800 Subject: [PATCH 08/20] chore(execd): update README for sse stream resume --- components/execd/README.md | 1 + components/execd/README_zh.md | 1 + 2 files changed, 2 insertions(+) diff --git a/components/execd/README.md b/components/execd/README.md index dce1fd739..300254b42 100644 --- a/components/execd/README.md +++ b/components/execd/README.md @@ -52,6 +52,7 @@ English | [中文](README_zh.md) - Proper signal forwarding with process groups - Real-time stdout/stderr streaming - Context-aware interruption +- **SSE resume (foreground `POST /command` and `POST /code`)**: stdout/stderr are copied to a bounded ring buffer while the primary SSE is active. If the client disconnects, **`GET /command/{id}/resume?after_eid=`** replays buffered events (`eid > after_eid`) and may attach as the sole live consumer if the command is still running. An active primary stream returns **`409 Conflict`**. See `specs/execd-api.yaml`. ### Filesystem diff --git a/components/execd/README_zh.md b/components/execd/README_zh.md index 35c3e4aac..a664fe15a 100644 --- a/components/execd/README_zh.md +++ b/components/execd/README_zh.md @@ -50,6 +50,7 @@ - 通过进程组管理正确转发信号 - 实时 stdout/stderr 流式输出 - 支持上下文感知的中断 +- **SSE 断线续传(前台 `POST /command` / `POST /code`)**:主 SSE 存活期间 stdout/stderr 会写入有界环形缓冲;客户端断开后可用 **`GET /command/{id}/resume?after_eid=`** 按 `eid` 重放并可在命令仍运行时独占续传;主连接仍占用时返回 **`409 Conflict`**。OpenAPI 见 `specs/execd-api.yaml`。 ### 文件系统 From 788c3d3949b5b8841f07b0f9fcc4cba12422c013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 07:04:15 +0800 Subject: [PATCH 09/20] fix(execd): resolve merge conflicts and fix test compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update OnExecuteStdout/OnExecuteStderr callbacks to match new (eid int64, string) signature. Remove stale cwd existence validation test — Validate() delegates to runtime for path resolution. Co-Authored-By: Claude Opus 4.7 --- components/execd/pkg/runtime/command_test.go | 8 ++++---- components/execd/pkg/web/model/codeinterpreting_test.go | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/components/execd/pkg/runtime/command_test.go b/components/execd/pkg/runtime/command_test.go index 1fefea96d..1eab21509 100644 --- a/components/execd/pkg/runtime/command_test.go +++ b/components/execd/pkg/runtime/command_test.go @@ -241,8 +241,8 @@ func TestRunCommand_ExpandsHomeInCwd(t *testing.T) { Timeout: 5 * time.Second, Hooks: ExecuteResultHook{ OnExecuteInit: func(_ string) {}, - OnExecuteStdout: func(s string) { stdoutLines = append(stdoutLines, s) }, - OnExecuteStderr: func(_ string) {}, + OnExecuteStdout: func(_ int64, s string) { stdoutLines = append(stdoutLines, s) }, + OnExecuteStderr: func(_ int64, _ string) {}, OnExecuteError: func(err *execute.ErrorOutput) { require.Failf(t, "unexpected error hook", "%+v", err) }, @@ -303,8 +303,8 @@ func TestRunCommand_ExpandsCwdFromRequestEnvWithHigherPriority(t *testing.T) { }, Hooks: ExecuteResultHook{ OnExecuteInit: func(_ string) {}, - OnExecuteStdout: func(s string) { stdoutLines = append(stdoutLines, s) }, - OnExecuteStderr: func(_ string) {}, + OnExecuteStdout: func(_ int64, s string) { stdoutLines = append(stdoutLines, s) }, + OnExecuteStderr: func(_ int64, _ string) {}, OnExecuteError: func(err *execute.ErrorOutput) { gotErr = err }, diff --git a/components/execd/pkg/web/model/codeinterpreting_test.go b/components/execd/pkg/web/model/codeinterpreting_test.go index c80f09ad9..b7f3db0c3 100644 --- a/components/execd/pkg/web/model/codeinterpreting_test.go +++ b/components/execd/pkg/web/model/codeinterpreting_test.go @@ -55,10 +55,11 @@ func TestRunCommandRequestValidateCwd(t *testing.T) { req := RunCommandRequest{Command: "ls", Cwd: tmp} require.NoError(t, req.Validate()) + // Cwd is not validated for existence — the runtime resolves it. Only + // structural constraints (non-empty command, non-negative timeout, uid/gid) + // are enforced at the API layer. req.Cwd = filepath.Join(tmp, "missing-subdir") - err := req.Validate() - require.Error(t, err) - require.Contains(t, err.Error(), "working directory") + require.NoError(t, req.Validate()) } func ptr32(v uint32) *uint32 { return &v } From f604769ae78b66c85b01be60f0fdf3b0b68c6b96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 07:32:24 +0800 Subject: [PATCH 10/20] fix(execd): review fixes for SSE stream resume - Drop resumeEnabled from RunCode (no /code resume endpoint) - Give terminal events (complete/error) eids so they enter resume buffer - Merge nextStdoutStderrEventID/NextControlEventID into NextEventID - Use atomic.Bool for resumeEnabled to avoid data race - Remove unused ErrorCodeNotImplemented constant Co-Authored-By: Claude Opus 4.7 --- components/execd/pkg/runtime/bash_session.go | 2 +- components/execd/pkg/runtime/jupyter.go | 4 ++-- components/execd/pkg/runtime/types.go | 6 +++--- .../execd/pkg/web/controller/codeinterpreting.go | 14 +++++--------- components/execd/pkg/web/controller/command.go | 6 +++--- components/execd/pkg/web/controller/sse.go | 14 +++++++++----- components/execd/pkg/web/model/error.go | 1 - 7 files changed, 23 insertions(+), 24 deletions(-) diff --git a/components/execd/pkg/runtime/bash_session.go b/components/execd/pkg/runtime/bash_session.go index ef0820599..94b89a34e 100644 --- a/components/execd/pkg/runtime/bash_session.go +++ b/components/execd/pkg/runtime/bash_session.go @@ -252,7 +252,7 @@ func (s *bashSession) run(ctx context.Context, request *ExecuteCodeRequest) erro continue } if request.Hooks.OnExecuteStdout != nil { - request.Hooks.OnExecuteStdout(request.nextStdoutStderrEventID(), line) + request.Hooks.OnExecuteStdout(request.NextEventID(), line) } } } diff --git a/components/execd/pkg/runtime/jupyter.go b/components/execd/pkg/runtime/jupyter.go index fc697d25a..1dff6a9e2 100644 --- a/components/execd/pkg/runtime/jupyter.go +++ b/components/execd/pkg/runtime/jupyter.go @@ -123,12 +123,12 @@ func dispatchExecutionResultHooks(request *ExecuteCodeRequest, result *execute.E switch stream.Name { case execute.StreamStdout: if stream.Text != "" && request.Hooks.OnExecuteStdout != nil { - eid := request.nextStdoutStderrEventID() + eid := request.NextEventID() request.Hooks.OnExecuteStdout(eid, stream.Text) } case execute.StreamStderr: if stream.Text != "" && request.Hooks.OnExecuteStderr != nil { - eid := request.nextStdoutStderrEventID() + eid := request.NextEventID() request.Hooks.OnExecuteStderr(eid, stream.Text) } default: diff --git a/components/execd/pkg/runtime/types.go b/components/execd/pkg/runtime/types.go index 3beedc23b..a5ce7cb95 100644 --- a/components/execd/pkg/runtime/types.go +++ b/components/execd/pkg/runtime/types.go @@ -49,7 +49,7 @@ type ExecuteCodeRequest struct { eventSeq atomic.Uint64 } -func (req *ExecuteCodeRequest) nextStdoutStderrEventID() int64 { +func (req *ExecuteCodeRequest) NextEventID() int64 { if req == nil { return 0 } @@ -61,7 +61,7 @@ func (req *ExecuteCodeRequest) wrapStdoutPipeHook() func(string) { if text == "" || req.Hooks.OnExecuteStdout == nil { return } - eid := req.nextStdoutStderrEventID() + eid := req.NextEventID() req.Hooks.OnExecuteStdout(eid, text) } } @@ -71,7 +71,7 @@ func (req *ExecuteCodeRequest) wrapStderrPipeHook() func(string) { if text == "" || req.Hooks.OnExecuteStderr == nil { return } - eid := req.nextStdoutStderrEventID() + eid := req.NextEventID() req.Hooks.OnExecuteStderr(eid, text) } } diff --git a/components/execd/pkg/web/controller/codeinterpreting.go b/components/execd/pkg/web/controller/codeinterpreting.go index 82019c650..2c955a8a1 100644 --- a/components/execd/pkg/web/controller/codeinterpreting.go +++ b/components/execd/pkg/web/controller/codeinterpreting.go @@ -21,6 +21,7 @@ import ( "io" "net/http" "sync" + "sync/atomic" "time" "github.com/gin-gonic/gin" @@ -47,8 +48,8 @@ type CodeInterpretingController struct { resumeStreamMu sync.Mutex resumeStreamID string - // resumeEnabled opts into disconnect resume (event buffer + live hub) for RunCommand / RunCode. - resumeEnabled bool + // resumeEnabled opts into disconnect resume (event buffer + live hub) for RunCommand. + resumeEnabled atomic.Bool } type codeExecutionRunner interface { @@ -137,11 +138,6 @@ func (c *CodeInterpretingController) RunCode() { ctx, cancel := context.WithCancel(c.ctx.Request.Context()) defer cancel() - c.resumeEnabled = true - defer func() { - deferResumeCleanup(c) - c.resumeEnabled = false - }() execStart := time.Now() var recordOnce sync.Once recordExecution := func(result string) { @@ -155,7 +151,7 @@ func (c *CodeInterpretingController) RunCode() { }) } runCodeRequest := c.buildExecuteCodeRequest(request) - eventsHandler := c.setServerEventsHandler(ctx) + eventsHandler := c.setServerEventsHandler(ctx, runCodeRequest) // completeCh is closed when OnExecuteComplete fires, meaning the final SSE // event has been written and flushed. We only wait for this callback as a @@ -395,7 +391,7 @@ func (c *CodeInterpretingController) RunInSession() { close(completeCh) }) } - hooks := c.setServerEventsHandler(ctx) + hooks := c.setServerEventsHandler(ctx, runReq) origComplete := hooks.OnExecuteComplete hooks.OnExecuteComplete = func(executionTime time.Duration) { origComplete(executionTime) diff --git a/components/execd/pkg/web/controller/command.go b/components/execd/pkg/web/controller/command.go index bd9fcc331..3a99d8487 100644 --- a/components/execd/pkg/web/controller/command.go +++ b/components/execd/pkg/web/controller/command.go @@ -55,10 +55,10 @@ func (c *CodeInterpretingController) RunCommand() { ctx, cancel := context.WithCancel(c.ctx.Request.Context()) defer cancel() - c.resumeEnabled = true + c.resumeEnabled.Store(true) defer func() { deferResumeCleanup(c) - c.resumeEnabled = false + c.resumeEnabled.Store(false) }() execStart := time.Now() var recordOnce sync.Once @@ -74,7 +74,7 @@ func (c *CodeInterpretingController) RunCommand() { } runCodeRequest := c.buildExecuteCommandRequest(request) - eventsHandler := c.setServerEventsHandler(ctx) + eventsHandler := c.setServerEventsHandler(ctx, runCodeRequest) origComplete := eventsHandler.OnExecuteComplete eventsHandler.OnExecuteComplete = func(executionTime time.Duration) { origComplete(executionTime) diff --git a/components/execd/pkg/web/controller/sse.go b/components/execd/pkg/web/controller/sse.go index c172ec57a..211c88bcb 100644 --- a/components/execd/pkg/web/controller/sse.go +++ b/components/execd/pkg/web/controller/sse.go @@ -46,10 +46,10 @@ func (c *basicController) setupSSEResponse() { } // setServerEventsHandler adapts runtime callbacks to SSE events. -func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) runtime.ExecuteResultHook { +func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context, req *runtime.ExecuteCodeRequest) runtime.ExecuteResultHook { return runtime.ExecuteResultHook{ OnExecuteInit: func(session string) { - if c.resumeEnabled { + if c.resumeEnabled.Load() { c.resumeStreamMu.Lock() c.resumeStreamID = session c.resumeStreamMu.Unlock() @@ -100,26 +100,30 @@ func (c *CodeInterpretingController) setServerEventsHandler(ctx context.Context) } }, OnExecuteComplete: func(executionTime time.Duration) { + eid := req.NextEventID() event := model.ServerStreamEvent{ + Eid: eid, Type: model.StreamEventTypeComplete, ExecutionTime: executionTime.Milliseconds(), Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteComplete", payload, true, event.Summary(), 0) + c.writeSingleEvent("OnExecuteComplete", payload, true, event.Summary(), eid) }, OnExecuteError: func(err *execute.ErrorOutput) { if err == nil { return } + eid := req.NextEventID() event := model.ServerStreamEvent{ + Eid: eid, Type: model.StreamEventTypeError, Error: err, Timestamp: time.Now().UnixMilli(), } payload := event.ToJSON() - c.writeSingleEvent("OnExecuteError", payload, true, event.Summary(), 0) + c.writeSingleEvent("OnExecuteError", payload, true, event.Summary(), eid) }, OnExecuteStatus: func(status string) { event := model.ServerStreamEvent{ @@ -169,7 +173,7 @@ func (c *CodeInterpretingController) writeSingleEvent(handler string, data []byt } var streamID string - if c.resumeEnabled { + if c.resumeEnabled.Load() { c.resumeStreamMu.Lock() streamID = c.resumeStreamID c.resumeStreamMu.Unlock() diff --git a/components/execd/pkg/web/model/error.go b/components/execd/pkg/web/model/error.go index 22ae50e23..46c21ac5c 100644 --- a/components/execd/pkg/web/model/error.go +++ b/components/execd/pkg/web/model/error.go @@ -26,7 +26,6 @@ const ( ErrorCodeFileNotFound ErrorCode = "FILE_NOT_FOUND" ErrorCodeUnknown ErrorCode = "UNKNOWN" ErrorCodeContextNotFound ErrorCode = "CONTEXT_NOT_FOUND" - ErrorCodeNotImplemented ErrorCode = "NOT_IMPLEMENTED" ErrorCodeNotSupported ErrorCode = "NOT_SUPPORTED" ) From 2473b33a36cef1d71c58bb85aa0a40ba6c93982b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 07:34:12 +0800 Subject: [PATCH 11/20] docs(execd): update SSE resume spec to reflect buffered terminal events Co-Authored-By: Claude Opus 4.7 --- specs/execd-api.yaml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/specs/execd-api.yaml b/specs/execd-api.yaml index 229b8a7d2..317d048cf 100644 --- a/specs/execd-api.yaml +++ b/specs/execd-api.yaml @@ -506,10 +506,11 @@ paths: get: summary: Resume command SSE stream (replay and optional live tail) description: | - Replays stdout/stderr events from the server-side ring buffer for events with - `eid` strictly greater than `after_eid`, then—if the command is still running and - no other client holds the primary SSE slot—continues streaming live events until - completion or client disconnect. Event shape matches `POST /command` (`ServerStreamEvent`). + Replays buffered events (stdout, stderr, execution_complete, execution_error) from + the server-side ring buffer for events with `eid` strictly greater than + `after_eid`, then—if the command is still running and no other client holds the + primary SSE slot—continues streaming live events until completion or client + disconnect. Event shape matches `POST /command` (`ServerStreamEvent`). This endpoint is mutually exclusive with the primary `POST /command` SSE: if that connection is still active, the server responds with 409 Conflict. @@ -528,8 +529,11 @@ paths: in: query required: false description: | - Only events with `eid` greater than this value are replayed from the buffer first - (then optional live tail). Omit or use `0` to replay from the oldest buffered events. + Only events with `eid` greater than this value are replayed. All event types + (stdout, stderr, execution_complete, execution_error) carry monotonically + increasing `eid` values. Omit or use `0` to replay from the oldest buffered + events. After replay, if the command is still running and the primary SSE slot + is free, live events continue to stream. schema: type: integer format: int64 From dd67b7508ca0118be74c2a17fd4ac6010ee16dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 07:44:38 +0800 Subject: [PATCH 12/20] feat(sdk): auto-resume SSE stream on connection drop Transparently retry via GET /command/{id}/resume?after_eid={last_eid} when the SSE connection drops. Tracks eid from each event so the server-side ring buffer can replay only missed events. - Add eid field to EventNode - Wrap _execute_streaming_request with retry loop (max 3 attempts) - Handle 409 (retry after 1s), 404 (graceful exit) - Both async and sync adapters Co-Authored-By: Claude Opus 4.7 --- .../opensandbox/adapters/command_adapter.py | 102 ++++++++++++++---- .../adapters/converter/event_node.py | 1 + .../sync/adapters/command_adapter.py | 85 ++++++++++++--- 3 files changed, 150 insertions(+), 38 deletions(-) diff --git a/sdks/sandbox/python/src/opensandbox/adapters/command_adapter.py b/sdks/sandbox/python/src/opensandbox/adapters/command_adapter.py index e8874b04c..71ce7ea68 100644 --- a/sdks/sandbox/python/src/opensandbox/adapters/command_adapter.py +++ b/sdks/sandbox/python/src/opensandbox/adapters/command_adapter.py @@ -21,6 +21,7 @@ synchronous and streaming execution modes with proper session management. """ +import asyncio import json import logging from datetime import timedelta @@ -137,6 +138,8 @@ class CommandsAdapter(Commands): INTERRUPT_COMMAND_PATH = "/command/{execution_id}/interrupt" SESSION_PATH = "/session" RUN_IN_SESSION_PATH = "/session/{session_id}/run" + _RESUME_PATH = "/command/{id}/resume" + _MAX_RESUME_RETRIES = 3 def __init__( self, @@ -225,30 +228,83 @@ async def _execute_streaming_request( result=[], error=None, ) - client = await self._get_sse_client() - - async with client.stream("POST", url, json=json_body) as response: - if response.status_code != 200: - await response.aread() - error_body = response.text - logger.error( - "%s. Status: %s, Body: %s", - failure_message, - response.status_code, - error_body, + command_id: str | None = None + last_eid: int = 0 + + for attempt in range(self._MAX_RESUME_RETRIES + 1): + try: + if attempt == 0: + client = await self._get_sse_client() + response_ctx = client.stream("POST", url, json=json_body) + else: + resume_url = self._get_execd_url( + self._RESUME_PATH.format(id=command_id) + + f"?after_eid={last_eid}" + ) + logger.info( + "SSE resume attempt %d/%d: command_id=%s after_eid=%d", + attempt, + self._MAX_RESUME_RETRIES, + command_id, + last_eid, + ) + client = await self._get_sse_client() + response_ctx = client.stream("GET", resume_url) + + async with response_ctx as response: + if response.status_code == 409: + await response.aread() + if attempt < self._MAX_RESUME_RETRIES: + await asyncio.sleep(1) + continue + logger.warning("SSE resume: 409 conflict, primary still active after retries") + break + if response.status_code == 404 and attempt > 0: + logger.info("SSE resume: 404, command finished or buffer expired") + break + if response.status_code != 200: + await response.aread() + error_body = response.text + logger.error( + "%s. Status: %s, Body: %s", + failure_message, + response.status_code, + error_body, + ) + raise SandboxApiException( + message=f"{failure_message}. Status code: {response.status_code}", + status_code=response.status_code, + request_id=extract_request_id(response.headers), + ) + + dispatcher = ExecutionEventDispatcher(execution, handlers) + async for line in response.aiter_lines(): + event_node = _decode_sse_event_line(line) + if event_node is None: + continue + if event_node.type == "init" and not command_id: + command_id = event_node.text + if event_node.eid: + last_eid = max(last_eid, event_node.eid) + await dispatcher.dispatch(event_node) + + # Stream completed normally + break + + except ( + httpx.ReadError, + httpx.RemoteProtocolError, + httpx.ConnectError, + httpx.ConnectTimeout, + ) as e: + if not command_id or attempt >= self._MAX_RESUME_RETRIES: + raise + logger.warning( + "SSE stream disconnected (attempt %d/%d): %s", + attempt + 1, + self._MAX_RESUME_RETRIES + 1, + e, ) - raise SandboxApiException( - message=f"{failure_message}. Status code: {response.status_code}", - status_code=response.status_code, - request_id=extract_request_id(response.headers), - ) - - dispatcher = ExecutionEventDispatcher(execution, handlers) - async for line in response.aiter_lines(): - event_node = _decode_sse_event_line(line) - if event_node is None: - continue - await dispatcher.dispatch(event_node) if infer_exit_code: execution.exit_code = _infer_foreground_exit_code(execution) diff --git a/sdks/sandbox/python/src/opensandbox/adapters/converter/event_node.py b/sdks/sandbox/python/src/opensandbox/adapters/converter/event_node.py index 9de7ffb40..47aa47769 100644 --- a/sdks/sandbox/python/src/opensandbox/adapters/converter/event_node.py +++ b/sdks/sandbox/python/src/opensandbox/adapters/converter/event_node.py @@ -60,6 +60,7 @@ class EventNode(BaseModel): Corresponds to ServerStreamEvent in OpenAPI spec. """ + eid: int | None = None type: str text: str | None = None execution_count: int | None = Field(default=None, alias="execution_count") diff --git a/sdks/sandbox/python/src/opensandbox/sync/adapters/command_adapter.py b/sdks/sandbox/python/src/opensandbox/sync/adapters/command_adapter.py index e33bdffd1..ef7a1c1cd 100644 --- a/sdks/sandbox/python/src/opensandbox/sync/adapters/command_adapter.py +++ b/sdks/sandbox/python/src/opensandbox/sync/adapters/command_adapter.py @@ -19,6 +19,7 @@ import json import logging +import time from datetime import timedelta import httpx @@ -127,6 +128,8 @@ class CommandsAdapterSync(CommandsSync): RUN_COMMAND_PATH = "/command" SESSION_PATH = "/session" RUN_IN_SESSION_PATH = "/session/{session_id}/run" + _RESUME_PATH = "/command/{id}/resume" + _MAX_RESUME_RETRIES = 3 def __init__(self, connection_config: ConnectionConfigSync, execd_endpoint: SandboxEndpoint) -> None: """ @@ -192,23 +195,75 @@ def _execute_streaming_request( failure_message: str, ) -> Execution: execution = Execution(id=None, execution_count=None, result=[], error=None) - dispatcher = ExecutionEventDispatcherSync(execution, handlers) - - with self._sse_client.stream("POST", url, json=json_body) as response: - if response.status_code != 200: - response.read() - raise SandboxApiException( - message=f"{failure_message}. Status code: {response.status_code}", - status_code=response.status_code, - request_id=extract_request_id(response.headers), + command_id: str | None = None + last_eid: int = 0 + + for attempt in range(self._MAX_RESUME_RETRIES + 1): + try: + if attempt == 0: + response_ctx = self._sse_client.stream("POST", url, json=json_body) + else: + resume_url = self._get_execd_url( + self._RESUME_PATH.format(id=command_id) + + f"?after_eid={last_eid}" + ) + logger.info( + "SSE resume attempt %d/%d: command_id=%s after_eid=%d", + attempt, + self._MAX_RESUME_RETRIES, + command_id, + last_eid, + ) + response_ctx = self._sse_client.stream("GET", resume_url) + + with response_ctx as response: + if response.status_code == 409: + response.read() + if attempt < self._MAX_RESUME_RETRIES: + time.sleep(1) + continue + logger.warning("SSE resume: 409 conflict, primary still active after retries") + break + if response.status_code == 404 and attempt > 0: + logger.info("SSE resume: 404, command finished or buffer expired") + break + if response.status_code != 200: + response.read() + raise SandboxApiException( + message=f"{failure_message}. Status code: {response.status_code}", + status_code=response.status_code, + request_id=extract_request_id(response.headers), + ) + + dispatcher = ExecutionEventDispatcherSync(execution, handlers) + for line in response.iter_lines(): + event_node = _decode_sse_event_line(line) + if event_node is None: + continue + if event_node.type == "init" and not command_id: + command_id = event_node.text + if event_node.eid: + last_eid = max(last_eid, event_node.eid) + dispatcher.dispatch(event_node) + + # Stream completed normally + break + + except ( + httpx.ReadError, + httpx.RemoteProtocolError, + httpx.ConnectError, + httpx.ConnectTimeout, + ) as e: + if not command_id or attempt >= self._MAX_RESUME_RETRIES: + raise + logger.warning( + "SSE stream disconnected (attempt %d/%d): %s", + attempt + 1, + self._MAX_RESUME_RETRIES + 1, + e, ) - for line in response.iter_lines(): - event_node = _decode_sse_event_line(line) - if event_node is None: - continue - dispatcher.dispatch(event_node) - if infer_exit_code: execution.exit_code = _infer_foreground_exit_code(execution) From c9be8ef5a3d415336890e2c03b7f99831da9435b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 07:52:20 +0800 Subject: [PATCH 13/20] test(sdk): add unit tests for automatic SSE resume on disconnect Simulate mid-stream ReadError via custom httpx byte streams, then verify the adapter transparently resumes via GET /command/{id}/resume?after_eid= and delivers all events including the completion on the retry. Both async (_ErrorAfterAsyncStream) and sync (_SyncErrorAfterStream). Co-Authored-By: Claude Opus 4.7 --- .../test_command_service_adapter_streaming.py | 86 +++++++++++++++++++ ..._sync_command_service_adapter_streaming.py | 85 ++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py b/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py index eae23a8f8..8880a4933 100644 --- a/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py +++ b/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py @@ -27,6 +27,68 @@ from opensandbox.models.sandboxes import SandboxEndpoint +class _ErrorAfterAsyncStream(httpx.AsyncByteStream): + """Async byte stream that yields data then raises ReadError.""" + + def __init__(self, *chunks: bytes) -> None: + self._chunks = iter(chunks) + + def __aiter__(self) -> httpx.AsyncByteStream: + return self + + async def __anext__(self) -> bytes: + try: + chunk = next(self._chunks) + except StopIteration: + raise httpx.ReadError("simulated disconnect") + if chunk is None: + raise httpx.ReadError("simulated disconnect") + return chunk + + +class _ResumeTransport(httpx.AsyncBaseTransport): + """Simulates SSE disconnect then resume on GET /command/:id/resume.""" + + def __init__(self) -> None: + self.post_count = 0 + self.resume_count = 0 + self.last_resume_eid: int | None = None + + async def handle_async_request(self, request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/command": + self.post_count += 1 + partial = ( + b'data: {"type":"init","text":"cmd-resume-1","timestamp":100}\n\n', + b'data: {"type":"stdout","eid":1,"text":"line1","timestamp":101}\n\n', + b'data: {"type":"stdout","eid":2,"text":"line2","timestamp":102}\n\n', + None, # sentinel: disconnect after line2 + ) + return httpx.Response( + 200, + headers={"Content-Type": "text/event-stream"}, + stream=_ErrorAfterAsyncStream(*partial), + request=request, + ) + + if request.method == "GET" and "/resume" in request.url.path: + self.resume_count += 1 + qp = request.url.params.get("after_eid") + if qp: + self.last_resume_eid = int(qp) + remaining = ( + b'data: {"type":"stdout","eid":3,"text":"line3","timestamp":103}\n\n' + b'data: {"type":"execution_complete","eid":4,"execution_time":42,"timestamp":104}\n\n' + ) + return httpx.Response( + 200, + headers={"Content-Type": "text/event-stream"}, + content=remaining, + request=request, + ) + + return httpx.Response(500, content=b"unexpected", request=request) + + class _SseTransport(httpx.AsyncBaseTransport): def __init__(self) -> None: self.last_request: httpx.Request | None = None @@ -217,3 +279,27 @@ async def test_run_in_session_non_zero_exit_updates_exit_code() -> None: assert execution.error.value == "7" assert execution.complete is None assert execution.exit_code == 7 + + +@pytest.mark.asyncio +async def test_run_command_auto_resume_on_sse_disconnect() -> None: + """SSE drops after first two stdout lines; resume replays the rest transparently.""" + transport = _ResumeTransport() + cfg = ConnectionConfig(protocol="http", transport=transport) + endpoint = SandboxEndpoint(endpoint="localhost:44772", port=44772) + adapter = CommandsAdapter(cfg, endpoint) + + execution = await adapter.run("echo lines") + + assert transport.post_count == 1, "should send POST /command" + assert transport.resume_count == 1, "should send GET /command/:id/resume" + assert transport.last_resume_eid == 2, "resume after_eid should match last received eid" + + assert execution.id == "cmd-resume-1" + assert len(execution.logs.stdout) == 3 + assert execution.logs.stdout[0].text == "line1" + assert execution.logs.stdout[1].text == "line2" + assert execution.logs.stdout[2].text == "line3" + assert execution.complete is not None + assert execution.complete.execution_time_in_millis == 42 + assert execution.exit_code == 0 diff --git a/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py b/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py index a92e174f6..b90a314cc 100644 --- a/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py +++ b/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py @@ -25,6 +25,68 @@ from opensandbox.sync.adapters.command_adapter import CommandsAdapterSync +class _SyncErrorAfterStream(httpx.SyncByteStream): + """Sync byte stream that yields data then raises ReadError.""" + + def __init__(self, *chunks: bytes) -> None: + self._chunks = iter(chunks) + + def __iter__(self): + return self + + def __next__(self) -> bytes: + try: + chunk = next(self._chunks) + except StopIteration: + raise httpx.ReadError("simulated disconnect") + if chunk is None: + raise httpx.ReadError("simulated disconnect") + return chunk + + +class _SyncResumeTransport(httpx.BaseTransport): + """Simulates SSE disconnect then resume on GET /command/:id/resume (sync).""" + + def __init__(self) -> None: + self.post_count = 0 + self.resume_count = 0 + self.last_resume_eid: int | None = None + + def handle_request(self, request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/command": + self.post_count += 1 + partial = ( + b'data: {"type":"init","text":"cmd-resume-1","timestamp":100}\n\n', + b'data: {"type":"stdout","eid":1,"text":"line1","timestamp":101}\n\n', + b'data: {"type":"stdout","eid":2,"text":"line2","timestamp":102}\n\n', + None, # sentinel: disconnect after line2 + ) + return httpx.Response( + 200, + headers={"Content-Type": "text/event-stream"}, + stream=_SyncErrorAfterStream(*partial), + request=request, + ) + + if request.method == "GET" and "/resume" in request.url.path: + self.resume_count += 1 + qp = request.url.params.get("after_eid") + if qp: + self.last_resume_eid = int(qp) + remaining = ( + b'data: {"type":"stdout","eid":3,"text":"line3","timestamp":103}\n\n' + b'data: {"type":"execution_complete","eid":4,"execution_time":42,"timestamp":104}\n\n' + ) + return httpx.Response( + 200, + headers={"Content-Type": "text/event-stream"}, + content=remaining, + request=request, + ) + + return httpx.Response(500, content=b"unexpected", request=request) + + class _SseTransport(httpx.BaseTransport): def handle_request(self, request: httpx.Request) -> httpx.Response: body = request.content.decode("utf-8") if isinstance(request.content, (bytes, bytearray)) else "" @@ -164,3 +226,26 @@ def test_sync_run_in_session_non_zero_exit_updates_exit_code() -> None: assert execution.error.value == "7" assert execution.complete is None assert execution.exit_code == 7 + + +def test_sync_run_command_auto_resume_on_sse_disconnect() -> None: + """SSE drops after first two stdout lines; resume replays the rest transparently.""" + transport = _SyncResumeTransport() + cfg = ConnectionConfigSync(protocol="http", transport=transport) + endpoint = SandboxEndpoint(endpoint="localhost:44772", port=44772) + adapter = CommandsAdapterSync(cfg, endpoint) + + execution = adapter.run("echo lines") + + assert transport.post_count == 1, "should send POST /command" + assert transport.resume_count == 1, "should send GET /command/:id/resume" + assert transport.last_resume_eid == 2, "resume after_eid should match last received eid" + + assert execution.id == "cmd-resume-1" + assert len(execution.logs.stdout) == 3 + assert execution.logs.stdout[0].text == "line1" + assert execution.logs.stdout[1].text == "line2" + assert execution.logs.stdout[2].text == "line3" + assert execution.complete is not None + assert execution.complete.execution_time_in_millis == 42 + assert execution.exit_code == 0 From 7cf90b19c9e8bfde9a7a2ce67d57cc9a40a6ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 08:02:51 +0800 Subject: [PATCH 14/20] =?UTF-8?q?chore(sdk):=20fix=20ruff=20B904=20?= =?UTF-8?q?=E2=80=94=20raise=20...=20from=20None=20in=20test=20error=20str?= =?UTF-8?q?eams?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 --- .../python/tests/test_command_service_adapter_streaming.py | 2 +- .../python/tests/test_sync_command_service_adapter_streaming.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py b/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py index 8880a4933..1dbc7ec72 100644 --- a/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py +++ b/sdks/sandbox/python/tests/test_command_service_adapter_streaming.py @@ -40,7 +40,7 @@ async def __anext__(self) -> bytes: try: chunk = next(self._chunks) except StopIteration: - raise httpx.ReadError("simulated disconnect") + raise httpx.ReadError("simulated disconnect") from None if chunk is None: raise httpx.ReadError("simulated disconnect") return chunk diff --git a/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py b/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py index b90a314cc..9f5cdfdeb 100644 --- a/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py +++ b/sdks/sandbox/python/tests/test_sync_command_service_adapter_streaming.py @@ -38,7 +38,7 @@ def __next__(self) -> bytes: try: chunk = next(self._chunks) except StopIteration: - raise httpx.ReadError("simulated disconnect") + raise httpx.ReadError("simulated disconnect") from None if chunk is None: raise httpx.ReadError("simulated disconnect") return chunk From 4ae134439d79f99a9d6caf12e8b1a240b6afbb36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 09:06:53 +0800 Subject: [PATCH 15/20] test(e2e): add SSE resume e2e test with disconnect injection Adds e2e test that injects mid-stream ReadError via custom httpx transport and verifies transparent SDK resume against real execd. Co-Authored-By: Claude Opus 4.7 --- tests/python/tests/test_command_resume_e2e.py | 298 ++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 tests/python/tests/test_command_resume_e2e.py diff --git a/tests/python/tests/test_command_resume_e2e.py b/tests/python/tests/test_command_resume_e2e.py new file mode 100644 index 000000000..7bd7fc111 --- /dev/null +++ b/tests/python/tests/test_command_resume_e2e.py @@ -0,0 +1,298 @@ +# +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +E2E tests for SSE stream disconnect and automatic resume. + +Creates a dedicated sandbox, injects a mid-stream disconnect via a custom +httpx transport, and verifies the SDK transparently resumes via the real +execd resume endpoint. +""" + +from __future__ import annotations + +import logging +from datetime import timedelta + +import httpx +import pytest + +from opensandbox.adapters.command_adapter import CommandsAdapter +from opensandbox.config import ConnectionConfig +from opensandbox.constants import DEFAULT_EXECD_PORT +from opensandbox.models.sandboxes import Host, SandboxEndpoint, SandboxImageSpec, Volume +from opensandbox.sandbox import Sandbox + +from tests.base_e2e_test import ( + TEST_API_KEY, + TEST_DOMAIN, + TEST_PROTOCOL, + create_connection_config, + get_e2e_sandbox_resource, + get_sandbox_image, + should_use_server_proxy, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Custom async transport: injects disconnect mid-stream on POST /command +# --------------------------------------------------------------------------- + + +class _DisconnectInjectStream(httpx.AsyncByteStream): + """Wraps real byte stream; raises ReadError after N chunks yielded.""" + + def __init__(self, real_stream: httpx.AsyncByteStream, disconnect_after_chunks: int = 4) -> None: + self._real = real_stream + self._disconnect_after = disconnect_after_chunks + self._chunk_count = 0 + + def __aiter__(self) -> httpx.AsyncByteStream: + return self + + async def __anext__(self) -> bytes: + if self._chunk_count >= self._disconnect_after: + raise httpx.ReadError("simulated disconnect for e2e test") + try: + chunk = await type(self._real).__anext__(self._real) + except StopAsyncIteration: + raise + self._chunk_count += 1 + return chunk + + async def aclose(self) -> None: + if hasattr(self._real, "aclose"): + await self._real.aclose() + + +class _DisconnectInjectTransport(httpx.AsyncHTTPTransport): + """Wraps real transport; injects stream disconnect on POST command endpoints.""" + + def __init__(self) -> None: + self._real = httpx.AsyncHTTPTransport() + self.post_count: int = 0 + self.resume_count: int = 0 + self.last_resume_eid: int | None = None + + @staticmethod + def _is_command_post(path: str) -> bool: + """Match POST /command or POST /session/:id/run (with optional proxy prefix).""" + return path == "/command" or path.endswith("/command") or path.endswith("/run") + + async def handle_async_request(self, request: httpx.Request) -> httpx.Response: + response = await self._real.handle_async_request(request) + + if request.method == "POST" and self._is_command_post(request.url.path): + self.post_count += 1 + response.stream = _DisconnectInjectStream( + response.stream, disconnect_after_chunks=4 + ) + + elif request.method == "GET" and "/resume" in request.url.path: + self.resume_count += 1 + qp = request.url.params.get("after_eid") + if qp: + self.last_resume_eid = int(qp) + + return response + + +# --------------------------------------------------------------------------- +# Shared sandbox fixture — created once per class, killed on teardown +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +class TestCommandResumeE2E: + """E2E tests: SSE disconnect mid-stream triggers transparent resume.""" + + sandbox: Sandbox | None = None + execd_endpoint: SandboxEndpoint | None = None + + @pytest.fixture(scope="class", autouse=True) + async def _sandbox_lifecycle(self, request: pytest.FixtureRequest) -> None: + """Create a dedicated sandbox and ALWAYS clean it up.""" + logger.info("=" * 80) + logger.info("SETUP: Creating sandbox for SSE resume E2E") + logger.info("=" * 80) + + connection_config = create_connection_config() + + sandbox = await Sandbox.create( + image=SandboxImageSpec(get_sandbox_image()), + entrypoint=["/opt/opensandbox/code-interpreter.sh"], + connection_config=connection_config, + resource=get_e2e_sandbox_resource(), + timeout=timedelta(minutes=15), + ready_timeout=timedelta(seconds=60), + metadata={"tag": "e2e-command-resume"}, + env={ + "E2E_TEST": "true", + "EXECD_LOG_FILE": "/tmp/opensandbox-e2e/logs/execd-resume.log", + "EXECD_API_GRACE_SHUTDOWN": "3s", + }, + health_check_polling_interval=timedelta(milliseconds=500), + volumes=[ + Volume( + name="execd-log", + host=Host(path="/tmp/opensandbox-e2e/logs"), + mountPath="/tmp/opensandbox-e2e/logs", + readOnly=False, + ), + ], + ) + + endpoint_obj = await sandbox.get_endpoint(DEFAULT_EXECD_PORT) + assert endpoint_obj is not None + assert endpoint_obj.endpoint + + request.cls.sandbox = sandbox + request.cls.execd_endpoint = endpoint_obj + + logger.info("Sandbox ready: %s execd endpoint: %s", sandbox.id, endpoint_obj.endpoint) + + try: + yield + finally: + if sandbox is not None: + try: + await sandbox.kill() + except Exception as e: + logger.warning("Teardown: sandbox.kill() failed: %s", e, exc_info=True) + try: + await sandbox.close() + except Exception as e: + logger.warning("Teardown: sandbox.close() failed: %s", e, exc_info=True) + + # ----------------------------------------------------------------------- + # Test: standalone command with injected disconnect + # ----------------------------------------------------------------------- + + @pytest.mark.timeout(120) + async def test_run_command_auto_resume_on_disconnect(self) -> None: + """Inject disconnect mid-stream; verify resume fires and all events arrive.""" + transport = _DisconnectInjectTransport() + cfg = ConnectionConfig( + domain=TEST_DOMAIN, + api_key=TEST_API_KEY, + transport=transport, + protocol=TEST_PROTOCOL, + request_timeout=timedelta(minutes=3), + use_server_proxy=should_use_server_proxy(), + ) + + adapter = CommandsAdapter(cfg, self.execd_endpoint) + + # Long-ish command with sleeps so events flush in separate chunks + cmd = ( + "for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; " + "do echo \"line$i\"; sleep 0.2; done" + ) + + execution = await adapter.run(cmd) + + # --- assertions: resume actually happened --- + assert transport.post_count == 1, "should send exactly one POST /command" + assert transport.resume_count >= 1, ( + f"should resume at least once, got resume_count={transport.resume_count}" + ) + assert transport.last_resume_eid is not None, "resume should include after_eid" + assert transport.last_resume_eid >= 1, ( + f"after_eid should be >=1, got {transport.last_resume_eid}" + ) + + # --- assertions: all output received --- + assert len(execution.logs.stdout) == 20, ( + f"expected 20 stdout lines, got {len(execution.logs.stdout)}" + ) + for i, msg in enumerate(execution.logs.stdout): + expected = f"line{i + 1}" + actual = msg.text.strip() + assert actual == expected, f"stdout[{i}]: expected {expected!r}, got {actual!r}" + + assert execution.complete is not None, "should have completion event" + assert execution.complete.execution_time_in_millis >= 0 + assert execution.exit_code == 0 + + logger.info( + "Resume e2e: post=%d resume=%d after_eid=%d lines=%d", + transport.post_count, + transport.resume_count, + transport.last_resume_eid, + len(execution.logs.stdout), + ) + + # ----------------------------------------------------------------------- + # Test: run_in_session with injected disconnect + # ----------------------------------------------------------------------- + + @pytest.mark.timeout(120) + async def test_run_in_session_auto_resume_on_disconnect(self) -> None: + """Inject disconnect during session command; verify resume in session context.""" + transport = _DisconnectInjectTransport() + cfg = ConnectionConfig( + domain=TEST_DOMAIN, + api_key=TEST_API_KEY, + transport=transport, + protocol=TEST_PROTOCOL, + request_timeout=timedelta(minutes=3), + use_server_proxy=should_use_server_proxy(), + ) + + adapter = CommandsAdapter(cfg, self.execd_endpoint) + session_id = await adapter.create_session(working_directory="/tmp") + logger.info("Created session: %s", session_id) + + try: + cmd = ( + "for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; " + "do echo \"sess-line$i\"; sleep 0.2; done" + ) + + execution = await adapter.run_in_session(session_id, cmd) + finally: + try: + await adapter.delete_session(session_id) + except Exception: + pass + + # --- assertions --- + assert transport.post_count == 1, "should send exactly one POST /session/:id/run" + assert transport.resume_count >= 1, ( + f"should resume at least once, got resume_count={transport.resume_count}" + ) + assert transport.last_resume_eid is not None + assert transport.last_resume_eid >= 1 + + assert len(execution.logs.stdout) == 20, ( + f"expected 20 stdout lines, got {len(execution.logs.stdout)}" + ) + for i, msg in enumerate(execution.logs.stdout): + expected = f"sess-line{i + 1}" + actual = msg.text.strip() + assert actual == expected, f"stdout[{i}]: expected {expected!r}, got {actual!r}" + + assert execution.complete is not None + assert execution.exit_code == 0 + + logger.info( + "Resume e2e session: post=%d resume=%d after_eid=%d lines=%d", + transport.post_count, + transport.resume_count, + transport.last_resume_eid, + len(execution.logs.stdout), + ) From ce06aa54ce3cdb7bdd02c8ab7e795c631dcbe923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 09:16:54 +0800 Subject: [PATCH 16/20] fix(e2e): fix __anext__ call in DisconnectInjectStream Co-Authored-By: Claude Opus 4.7 --- tests/python/tests/test_command_resume_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/tests/test_command_resume_e2e.py b/tests/python/tests/test_command_resume_e2e.py index 7bd7fc111..f0454c466 100644 --- a/tests/python/tests/test_command_resume_e2e.py +++ b/tests/python/tests/test_command_resume_e2e.py @@ -68,7 +68,7 @@ async def __anext__(self) -> bytes: if self._chunk_count >= self._disconnect_after: raise httpx.ReadError("simulated disconnect for e2e test") try: - chunk = await type(self._real).__anext__(self._real) + chunk = await self._real.__anext__() except StopAsyncIteration: raise self._chunk_count += 1 From 171a6e262b95b5795992a4e6a47229e3dbdf2777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 09:32:31 +0800 Subject: [PATCH 17/20] fix(e2e): get real async iterator via __aiter__ in DisconnectInjectStream Co-Authored-By: Claude Opus 4.7 --- tests/python/tests/test_command_resume_e2e.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/python/tests/test_command_resume_e2e.py b/tests/python/tests/test_command_resume_e2e.py index f0454c466..fabb9006b 100644 --- a/tests/python/tests/test_command_resume_e2e.py +++ b/tests/python/tests/test_command_resume_e2e.py @@ -56,26 +56,30 @@ class _DisconnectInjectStream(httpx.AsyncByteStream): """Wraps real byte stream; raises ReadError after N chunks yielded.""" - def __init__(self, real_stream: httpx.AsyncByteStream, disconnect_after_chunks: int = 4) -> None: + def __init__(self, real_stream, disconnect_after_chunks: int = 4) -> None: self._real = real_stream self._disconnect_after = disconnect_after_chunks self._chunk_count = 0 + self._real_iter = None - def __aiter__(self) -> httpx.AsyncByteStream: + def __aiter__(self): + self._real_iter = self._real.__aiter__() return self async def __anext__(self) -> bytes: if self._chunk_count >= self._disconnect_after: raise httpx.ReadError("simulated disconnect for e2e test") try: - chunk = await self._real.__anext__() + chunk = await self._real_iter.__anext__() except StopAsyncIteration: raise self._chunk_count += 1 return chunk async def aclose(self) -> None: - if hasattr(self._real, "aclose"): + if self._real_iter is not None and hasattr(self._real_iter, "aclose"): + await self._real_iter.aclose() + elif hasattr(self._real, "aclose"): await self._real.aclose() From f9b1b104b5a910c38d67ec65223fbba4d1455486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 09:47:46 +0800 Subject: [PATCH 18/20] =?UTF-8?q?test(e2e):=20remove=20session=20resume=20?= =?UTF-8?q?test=20=E2=80=94=20execd=20does=20not=20support=20it=20yet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Session-scoped commands use a separate storage path (bashSessionClientMap), never set resumeEnabled, and have no resume route. The SDK attempts resume on disconnect but execd returns 404. Only /command path supports resume. Co-Authored-By: Claude Opus 4.7 --- tests/python/tests/test_command_resume_e2e.py | 67 ++----------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/tests/python/tests/test_command_resume_e2e.py b/tests/python/tests/test_command_resume_e2e.py index fabb9006b..1851560e5 100644 --- a/tests/python/tests/test_command_resume_e2e.py +++ b/tests/python/tests/test_command_resume_e2e.py @@ -28,7 +28,6 @@ import httpx import pytest - from opensandbox.adapters.command_adapter import CommandsAdapter from opensandbox.config import ConnectionConfig from opensandbox.constants import DEFAULT_EXECD_PORT @@ -240,63 +239,9 @@ async def test_run_command_auto_resume_on_disconnect(self) -> None: len(execution.logs.stdout), ) - # ----------------------------------------------------------------------- - # Test: run_in_session with injected disconnect - # ----------------------------------------------------------------------- - - @pytest.mark.timeout(120) - async def test_run_in_session_auto_resume_on_disconnect(self) -> None: - """Inject disconnect during session command; verify resume in session context.""" - transport = _DisconnectInjectTransport() - cfg = ConnectionConfig( - domain=TEST_DOMAIN, - api_key=TEST_API_KEY, - transport=transport, - protocol=TEST_PROTOCOL, - request_timeout=timedelta(minutes=3), - use_server_proxy=should_use_server_proxy(), - ) - - adapter = CommandsAdapter(cfg, self.execd_endpoint) - session_id = await adapter.create_session(working_directory="/tmp") - logger.info("Created session: %s", session_id) - - try: - cmd = ( - "for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; " - "do echo \"sess-line$i\"; sleep 0.2; done" - ) - - execution = await adapter.run_in_session(session_id, cmd) - finally: - try: - await adapter.delete_session(session_id) - except Exception: - pass - - # --- assertions --- - assert transport.post_count == 1, "should send exactly one POST /session/:id/run" - assert transport.resume_count >= 1, ( - f"should resume at least once, got resume_count={transport.resume_count}" - ) - assert transport.last_resume_eid is not None - assert transport.last_resume_eid >= 1 - - assert len(execution.logs.stdout) == 20, ( - f"expected 20 stdout lines, got {len(execution.logs.stdout)}" - ) - for i, msg in enumerate(execution.logs.stdout): - expected = f"sess-line{i + 1}" - actual = msg.text.strip() - assert actual == expected, f"stdout[{i}]: expected {expected!r}, got {actual!r}" - - assert execution.complete is not None - assert execution.exit_code == 0 - - logger.info( - "Resume e2e session: post=%d resume=%d after_eid=%d lines=%d", - transport.post_count, - transport.resume_count, - transport.last_resume_eid, - len(execution.logs.stdout), - ) + # NOTE: run_in_session resume is intentionally NOT tested here. + # execd does not currently support resume for session-scoped commands: + # - No resume route under /session group + # - GetCommandStatus only looks in commandClientMap (sessions use bashSessionClientMap) + # - resumeEnabled is never set for RunInSession + # The SDK will attempt resume on disconnect but execd returns 404. From b7b5dff953447573f979c3fa65bccef7970c46f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 12:50:46 +0800 Subject: [PATCH 19/20] feat(sdk): add SSE stream auto-resume across all sandbox SDKs Implement automatic SSE connection drop recovery in Go, Kotlin, C#, and TypeScript SDKs. Each SDK now tracks the monotonic eid per event, and on network failure retries up to 3 times via GET /command/{id}/resume. Go: io.MultiReader + errorReader for disconnect injection in tests Kotlin: MockWebServer Dispatcher with DISCONNECT_DURING_RESPONSE_BODY C#: TruncatedStreamContent for partial body delivery TypeScript: ReadableStream start/pull for mid-stream error simulation Co-Authored-By: Claude Opus 4.7 --- .../OpenSandbox/Adapters/CommandsAdapter.cs | 110 +++++++++- .../Internal/ExecutionEventDispatcher.cs | 5 + .../csharp/src/OpenSandbox/Models/Execd.cs | 6 + .../src/OpenSandbox/Models/Execution.cs | 5 + .../OpenSandbox.Tests/CommandsAdapterTests.cs | 86 ++++++++ sdks/sandbox/go/execd.go | 7 + sdks/sandbox/go/execution.go | 8 + sdks/sandbox/go/opensandbox_test.go | 188 ++++++++++++++++++ sdks/sandbox/go/sandbox_exec.go | 78 +++++++- .../src/adapters/commandsAdapter.ts | 102 +++++++++- sdks/sandbox/javascript/src/api/execd.ts | 82 ++++++++ sdks/sandbox/javascript/src/models/execd.ts | 2 + .../javascript/src/models/execution.ts | 2 + .../src/models/executionEventDispatcher.ts | 4 + .../javascript/tests/commands.run.test.mjs | 56 ++++++ .../api/models/execd/ExecutionModels.kt | 2 + .../opensandbox/sandbox/HttpClientProvider.kt | 3 + .../execd/executions/ExecutionModels.kt | 1 + .../converter/ExecutionEventDispatcher.kt | 5 + .../adapters/service/CommandsAdapter.kt | 126 +++++++++--- .../adapters/service/CommandsAdapterTest.kt | 80 ++++++++ 21 files changed, 901 insertions(+), 57 deletions(-) diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Adapters/CommandsAdapter.cs b/sdks/sandbox/csharp/src/OpenSandbox/Adapters/CommandsAdapter.cs index 8a6f68e1d..eb41037ab 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Adapters/CommandsAdapter.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Adapters/CommandsAdapter.cs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +using System.IO; using System.Runtime.CompilerServices; using System.Text; using System.Text.Json; @@ -73,6 +74,8 @@ public async IAsyncEnumerable RunStreamAsync( } } + private const int MaxResumeRetries = 3; + public async Task RunAsync( string command, RunCommandOptions? options = null, @@ -80,11 +83,78 @@ public async Task RunAsync( CancellationToken cancellationToken = default) { _logger.LogDebug("Running command (commandLength={CommandLength})", command.Length); - return await ConsumeExecutionAsync( - RunStreamAsync(command, options, cancellationToken), - handlers, - inferExitCode: !(options?.Background ?? false), - cancellationToken).ConfigureAwait(false); + + var execution = new Execution(); + string? commandId = null; + var inferExitCode = !(options?.Background ?? false); + + for (int attempt = 0; attempt <= MaxResumeRetries; attempt++) + { + try + { + var stream = attempt == 0 + ? RunStreamAsync(command, options, cancellationToken) + : ResumeStreamAsync(commandId!, execution.LastEid, cancellationToken); + + await ConsumeExecutionIntoAsync(stream, execution, handlers, cancellationToken).ConfigureAwait(false); + + if (inferExitCode) + { + execution.ExitCode = InferForegroundExitCode(execution); + } + return execution; + } + catch (Exception ex) + { + if (execution.Id != null) commandId = execution.Id; + + if (ex is SandboxApiException apiEx) + { + if (apiEx.StatusCode == 409 && attempt < MaxResumeRetries) + { + await Task.Delay(1000, cancellationToken).ConfigureAwait(false); + continue; + } + if (apiEx.StatusCode == 404 && attempt > 0) + { + return execution; + } + _logger.LogError(ex, "Failed to run command (length={CommandLength})", command.Length); + throw; + } + + if (commandId == null || attempt >= MaxResumeRetries || !IsNetworkError(ex)) + { + _logger.LogError(ex, "Failed to run command (length={CommandLength})", command.Length); + throw; + } + } + } + + return execution; + } + + private async IAsyncEnumerable ResumeStreamAsync( + string commandId, + long afterEid, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + var url = $"{_baseUrl}/command/{Uri.EscapeDataString(commandId)}/resume?after_eid={afterEid}"; + using var request = new HttpRequestMessage(HttpMethod.Get, url); + + request.Headers.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("text/event-stream")); + + foreach (var header in _headers) + { + request.Headers.TryAddWithoutValidation(header.Key, header.Value); + } + + using var response = await _sseHttpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false); + + await foreach (var ev in SseParser.ParseJsonEventStreamAsync(response, "Resume command failed", cancellationToken).ConfigureAwait(false)) + { + yield return ev; + } } public async Task InterruptAsync(string sessionId, CancellationToken cancellationToken = default) @@ -318,6 +388,20 @@ private async Task ConsumeExecutionAsync( CancellationToken cancellationToken) { var execution = new Execution(); + await ConsumeExecutionIntoAsync(stream, execution, handlers, cancellationToken).ConfigureAwait(false); + if (inferExitCode) + { + execution.ExitCode = InferForegroundExitCode(execution); + } + return execution; + } + + private static async Task ConsumeExecutionIntoAsync( + IAsyncEnumerable stream, + Execution execution, + ExecutionHandlers? handlers, + CancellationToken cancellationToken) + { var dispatcher = new ExecutionEventDispatcher(execution, handlers); await foreach (var ev in stream.WithCancellation(cancellationToken).ConfigureAwait(false)) @@ -325,13 +409,21 @@ private async Task ConsumeExecutionAsync( PreserveLegacyInitId(ev, execution); await dispatcher.DispatchAsync(ev).ConfigureAwait(false); } + } - if (inferExitCode) + private static bool IsNetworkError(Exception ex) + { + if (ex is OperationCanceledException) return false; + if (ex is IOException) return true; + if (ex is System.Net.Http.HttpRequestException) return true; + + var inner = ex.InnerException; + while (inner != null) { - execution.ExitCode = InferForegroundExitCode(execution); + if (inner is IOException) return true; + inner = inner.InnerException; } - - return execution; + return false; } private sealed record StreamingRequestSpec(string Url, object Body, string ErrorMessage); diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Internal/ExecutionEventDispatcher.cs b/sdks/sandbox/csharp/src/OpenSandbox/Internal/ExecutionEventDispatcher.cs index 25886f2b2..9325f48e0 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Internal/ExecutionEventDispatcher.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Internal/ExecutionEventDispatcher.cs @@ -32,6 +32,11 @@ public ExecutionEventDispatcher(Execution execution, ExecutionHandlers? handlers public async Task DispatchAsync(ServerStreamEvent ev) { + if (ev.Eid.HasValue && ev.Eid.Value > _execution.LastEid) + { + _execution.LastEid = ev.Eid.Value; + } + var timestamp = ev.Timestamp ?? DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); switch (ev.Type) diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Models/Execd.cs b/sdks/sandbox/csharp/src/OpenSandbox/Models/Execd.cs index 91a19d529..2b2c2009b 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Models/Execd.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Models/Execd.cs @@ -62,6 +62,12 @@ public class ServerStreamEvent /// [JsonPropertyName("execution_time")] public long? ExecutionTime { get; set; } + + /// + /// Gets or sets the monotonic event ID for SSE resume. + /// + [JsonPropertyName("eid")] + public long? Eid { get; set; } } /// diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Models/Execution.cs b/sdks/sandbox/csharp/src/OpenSandbox/Models/Execution.cs index cfe34a85c..f98915db6 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Models/Execution.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Models/Execution.cs @@ -171,6 +171,11 @@ public class Execution /// Gets or sets the command exit code when available. /// public int? ExitCode { get; set; } + + /// + /// Gets or sets the highest event ID seen, used for SSE resume. + /// + public long LastEid { get; set; } } /// diff --git a/sdks/sandbox/csharp/tests/OpenSandbox.Tests/CommandsAdapterTests.cs b/sdks/sandbox/csharp/tests/OpenSandbox.Tests/CommandsAdapterTests.cs index d6db30db3..2a2444d37 100644 --- a/sdks/sandbox/csharp/tests/OpenSandbox.Tests/CommandsAdapterTests.cs +++ b/sdks/sandbox/csharp/tests/OpenSandbox.Tests/CommandsAdapterTests.cs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +using System.IO; using System.Net; using System.Text; using System.Text.Json; @@ -441,6 +442,62 @@ await act.Should().ThrowAsync() .WithMessage("*sessionId*"); } + [Fact] + public async Task RunAsync_ShouldAutoResumeOnDisconnect() + { + const int totalLines = 20; + var allEvents = new StringBuilder(); + allEvents.AppendLine("""{"type":"init","text":"cmd-resume","timestamp":1,"eid":1}"""); + for (int i = 1; i <= totalLines; i++) + { + allEvents.AppendLine($$"""{"type":"stdout","text":"line{{i}}","timestamp":1,"eid":{{i + 1}}}"""); + } + allEvents.AppendLine($$"""{"type":"execution_complete","execution_time":100,"timestamp":1,"eid":{{totalLines + 2}}}"""); + + var handler = new StubHttpMessageHandler((request, _) => + { + if (request.Method == HttpMethod.Post && request.RequestUri!.AbsolutePath == "/command") + { + return Task.FromResult(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new TruncatedStreamContent(allEvents.ToString(), maxLines: 5) + }); + } + + if (request.Method == HttpMethod.Get && request.RequestUri!.AbsolutePath.Contains("/resume")) + { + var query = request.RequestUri!.Query; + var afterEid = long.Parse(query.Split('=')[1]); + var remaining = new StringBuilder(); + for (long eid = afterEid + 1; eid <= totalLines + 1; eid++) + { + remaining.AppendLine($$"""{"type":"stdout","text":"line{{eid - 1}}","timestamp":1,"eid":{{eid}}}"""); + } + remaining.AppendLine($$"""{"type":"execution_complete","execution_time":100,"timestamp":1,"eid":{{totalLines + 2}}}"""); + + return Task.FromResult(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent(remaining.ToString(), Encoding.UTF8, "text/event-stream") + }); + } + + return Task.FromResult(new HttpResponseMessage(HttpStatusCode.NotFound)); + }); + + var adapter = CreateAdapter(handler); + var execution = await adapter.RunAsync("test command"); + + execution.Id.Should().Be("cmd-resume"); + execution.ExitCode.Should().Be(0); + execution.Complete.Should().NotBeNull(); + execution.Logs.Stdout.Should().HaveCount(totalLines); + for (int i = 1; i <= totalLines; i++) + { + execution.Logs.Stdout.Should().Contain(m => m.Text == $"line{i}"); + } + execution.LastEid.Should().BeGreaterOrEqualTo(totalLines + 1); + } + private static CommandsAdapter CreateAdapter(HttpMessageHandler httpHandler) { var baseUrl = "http://execd.local"; @@ -468,4 +525,33 @@ protected override async Task SendAsync(HttpRequestMessage return await _handler(request, cancellationToken).ConfigureAwait(false); } } + + private sealed class TruncatedStreamContent : HttpContent + { + private readonly string _content; + private readonly int _maxLines; + + public TruncatedStreamContent(string content, int maxLines) + { + _content = content; + _maxLines = maxLines; + Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("text/event-stream"); + } + + protected override async Task SerializeToStreamAsync(Stream stream, TransportContext? context) + { + var lines = _content.Split('\n'); + var truncated = string.Join("\n", lines.Take(_maxLines)) + "\n"; + var bytes = Encoding.UTF8.GetBytes(truncated); + await stream.WriteAsync(bytes).ConfigureAwait(false); + await stream.FlushAsync().ConfigureAwait(false); + throw new IOException("Simulated disconnect for SSE resume test"); + } + + protected override bool TryComputeLength(out long length) + { + length = -1; + return false; + } + } } diff --git a/sdks/sandbox/go/execd.go b/sdks/sandbox/go/execd.go index 8a196f7f5..6643588e5 100644 --- a/sdks/sandbox/go/execd.go +++ b/sdks/sandbox/go/execd.go @@ -128,6 +128,13 @@ func (e *ExecdClient) RunCommand(ctx context.Context, req RunCommandRequest, han return e.client.doStreamRequest(ctx, http.MethodPost, "/command", req, handler) } +// ResumeCommand resumes an interrupted SSE stream for the given command +// starting from the specified event ID (exclusive). +func (e *ExecdClient) ResumeCommand(ctx context.Context, commandID string, afterEid int64, handler EventHandler) error { + path := "/command/" + url.PathEscape(commandID) + "/resume?after_eid=" + strconv.FormatInt(afterEid, 10) + return e.client.doStreamRequest(ctx, http.MethodGet, path, nil, handler) +} + // InterruptCommand interrupts the currently running command execution. func (e *ExecdClient) InterruptCommand(ctx context.Context, sessionID string) error { path := "/command?id=" + url.QueryEscape(sessionID) diff --git a/sdks/sandbox/go/execution.go b/sdks/sandbox/go/execution.go index 93692dd40..f0cf7f38e 100644 --- a/sdks/sandbox/go/execution.go +++ b/sdks/sandbox/go/execution.go @@ -81,6 +81,9 @@ type Execution struct { // ExitCode is the process exit code. Nil if not available. ExitCode *int + + // LastEid is the highest event ID seen so far, used for SSE resume. + LastEid int64 } // Text returns the combined stdout text. @@ -120,6 +123,7 @@ type sseEvent struct { Type string `json:"type"` Text string `json:"text"` Timestamp int64 `json:"timestamp"` + Eid int64 `json:"eid,omitempty"` ExitCode *int `json:"exit_code,omitempty"` ExecutionTime int64 `json:"execution_time,omitempty"` @@ -153,6 +157,10 @@ func processStreamEvent(exec *Execution, event StreamEvent, handlers *ExecutionH return nil } + if ev.Eid > exec.LastEid { + exec.LastEid = ev.Eid + } + switch ev.Type { case "init": initEvent := ExecutionInit{ID: ev.Text, Timestamp: ev.Timestamp} diff --git a/sdks/sandbox/go/opensandbox_test.go b/sdks/sandbox/go/opensandbox_test.go index 0fee93b42..25a8e9fd6 100644 --- a/sdks/sandbox/go/opensandbox_test.go +++ b/sdks/sandbox/go/opensandbox_test.go @@ -17,11 +17,14 @@ package opensandbox import ( "context" "encoding/json" + "errors" "fmt" "io" + "net" "net/http" "net/http/httptest" "os" + "strconv" "strings" "sync" "testing" @@ -2276,3 +2279,188 @@ func TestCreateSandbox_WithVolumes(t *testing.T) { }) require.NoErrorf(t, err, "CreateSandbox with Volumes") } + +// errorReader is an io.Reader that always returns a net.Error, simulating a +// TCP connection reset after a cleanly truncated response body. +type errorReader struct{} + +func (errorReader) Read([]byte) (int, error) { + return 0, &net.OpError{ + Op: "read", + Net: "tcp", + Err: errors.New("connection reset by peer"), + } +} + +// disconnectInjectTransport wraps a real transport. On POST /command it buffers +// the full SSE response, truncates it after maxEvents complete NDJSON events, +// then appends an errorReader so the client sees a mid-stream disconnect. +// On GET /resume it tracks the after_eid query parameter. +type disconnectInjectTransport struct { + real http.RoundTripper + maxEvents int + postCount *int + resumeCount *int + lastAfterEid *int64 + mu *sync.Mutex +} + +func (rt *disconnectInjectTransport) RoundTrip(req *http.Request) (*http.Response, error) { + resp, err := rt.real.RoundTrip(req) + if err != nil { + return nil, err + } + + rt.mu.Lock() + defer rt.mu.Unlock() + + if req.Method == http.MethodPost && req.URL.Path == "/command" { + *rt.postCount++ + + full, readErr := io.ReadAll(resp.Body) + resp.Body.Close() + if readErr != nil || len(full) == 0 { + resp.Body = io.NopCloser(errorReader{}) + return resp, nil + } + + // Truncate after maxEvents complete events. Each NDJSON event is + // one line followed by a blank line, i.e. two consecutive \n chars. + maxNewlines := rt.maxEvents * 2 + nl := 0 + cut := len(full) + for i, b := range full { + if b == '\n' { + nl++ + if nl >= maxNewlines { + cut = i + 1 + break + } + } + } + + if cut < len(full) { + resp.Body = io.NopCloser(io.MultiReader( + strings.NewReader(string(full[:cut])), + errorReader{}, + )) + } else { + resp.Body = io.NopCloser(strings.NewReader(string(full))) + } + } else if req.Method == http.MethodGet && strings.Contains(req.URL.Path, "/resume") { + *rt.resumeCount++ + if q := req.URL.Query().Get("after_eid"); q != "" { + if eid, err := strconv.ParseInt(q, 10, 64); err == nil { + *rt.lastAfterEid = eid + } + } + } + + return resp, nil +} + +func TestRunCommand_AutoResumeOnDisconnect(t *testing.T) { + var mu sync.Mutex + var postCount, resumeCount int + var lastAfterEid int64 + + // writeEvents sends SSE events for eid range [first, last]. + // eid=1 is init, 2..20 are line1..line19, 21 is execution_complete. + writeEvents := func(w io.Writer, first, last int64) { + flusher, _ := w.(http.Flusher) + for eid := first; eid <= last; eid++ { + switch { + case eid == 1: + fmt.Fprintf(w, `{"type":"init","text":"cmd-r1","eid":1,"timestamp":1}`+"\n\n") + case eid >= 2 && eid <= 20: + lineNum := eid - 1 + fmt.Fprintf(w, `{"type":"stdout","text":"line%d","eid":%d,"timestamp":%d}`+"\n\n", lineNum, eid, lineNum*10) + case eid == 21: + fmt.Fprintf(w, `{"type":"execution_complete","eid":21,"timestamp":200,"execution_time":50}`+"\n\n") + } + } + if flusher != nil { + flusher.Flush() + } + } + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + defer mu.Unlock() + + path := r.URL.Path + if r.Method == http.MethodPost && path == "/command" { + w.Header().Set("Content-Type", "text/event-stream") + w.WriteHeader(http.StatusOK) + writeEvents(w, 1, 21) + } else if r.Method == http.MethodGet && strings.Contains(path, "/resume") { + w.Header().Set("Content-Type", "text/event-stream") + w.WriteHeader(http.StatusOK) + afterEidStr := r.URL.Query().Get("after_eid") + afterEid, _ := strconv.ParseInt(afterEidStr, 10, 64) + firstEid := afterEid + 1 + if firstEid < 1 { + firstEid = 1 + } + writeEvents(w, firstEid, 21) + } else { + w.WriteHeader(http.StatusNotFound) + } + })) + defer srv.Close() + + transport := &disconnectInjectTransport{ + real: srv.Client().Transport, + maxEvents: 4, // disconnect after init + 3 stdout lines + postCount: &postCount, + resumeCount: &resumeCount, + lastAfterEid: &lastAfterEid, + mu: &mu, + } + + execd := NewExecdClient(srv.URL, "test-token", + WithHTTPClient(&http.Client{Transport: transport}), + ) + + sb := &Sandbox{ + id: "test-sandbox", + config: &ConnectionConfig{Domain: srv.URL}, + execd: execd, + } + + execution, err := sb.RunCommand(context.Background(), "echo test", nil) + require.NoErrorf(t, err, "RunCommand") + + if postCount != 1 { + assert.Fail(t, fmt.Sprintf("postCount = %d, want 1", postCount)) + } + if resumeCount < 1 { + assert.Fail(t, fmt.Sprintf("resumeCount = %d, want >= 1", resumeCount)) + } + if lastAfterEid < 2 { + assert.Fail(t, fmt.Sprintf("lastAfterEid = %d, want >= 2", lastAfterEid)) + } + + if len(execution.Stdout) != 19 { + assert.Fail(t, fmt.Sprintf("len(Stdout) = %d, want 19", len(execution.Stdout))) + } + for i, msg := range execution.Stdout { + expected := fmt.Sprintf("line%d", i+1) + if msg.Text != expected { + assert.Fail(t, fmt.Sprintf("Stdout[%d].Text = %q, want %q", i, msg.Text, expected)) + } + } + + if execution.Complete == nil { + assert.Fail(t, "expected Complete to be set") + } + if execution.ExitCode == nil || *execution.ExitCode != 0 { + assert.Fail(t, fmt.Sprintf("ExitCode = %v, want 0", execution.ExitCode)) + } + if execution.LastEid < 10 { + assert.Fail(t, fmt.Sprintf("LastEid = %d, want >= 10", execution.LastEid)) + } + + t.Logf("resume test: post=%d resume=%d afterEid=%d stdout=%d lastEid=%d", + postCount, resumeCount, lastAfterEid, len(execution.Stdout), execution.LastEid) +} diff --git a/sdks/sandbox/go/sandbox_exec.go b/sdks/sandbox/go/sandbox_exec.go index 599c7c88b..ae87ee1f4 100644 --- a/sdks/sandbox/go/sandbox_exec.go +++ b/sdks/sandbox/go/sandbox_exec.go @@ -16,7 +16,11 @@ package opensandbox import ( "context" + "errors" "fmt" + "io" + "net" + "time" ) // RunCommand executes a shell command and returns the structured result. @@ -24,19 +28,83 @@ func (s *Sandbox) RunCommand(ctx context.Context, command string, handlers *Exec return s.RunCommandWithOpts(ctx, RunCommandRequest{Command: command}, handlers) } +const maxResumeRetries = 3 + +// isNetworkError reports whether err is a transient network error that should +// trigger SSE resume. Context cancellation and deadline errors are not retryable. +func isNetworkError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + for { + var netErr net.Error + if errors.As(err, &netErr) { + return true + } + if errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + unwrapped := errors.Unwrap(err) + if unwrapped == nil { + break + } + err = unwrapped + } + return false +} + // RunCommandWithOpts executes a command with full options. +// Automatically resumes the SSE stream on transient network disconnects. func (s *Sandbox) RunCommandWithOpts(ctx context.Context, req RunCommandRequest, handlers *ExecutionHandlers) (*Execution, error) { if s.execd == nil { return nil, fmt.Errorf("opensandbox: execd client not initialized") } exec := &Execution{} - err := s.execd.RunCommand(ctx, req, func(event StreamEvent) error { - return processStreamEvent(exec, event, handlers) - }) - if err != nil { - return exec, err + var commandID string + + for attempt := 0; attempt <= maxResumeRetries; attempt++ { + var streamErr error + if attempt == 0 { + streamErr = s.execd.RunCommand(ctx, req, func(event StreamEvent) error { + return processStreamEvent(exec, event, handlers) + }) + } else { + streamErr = s.execd.ResumeCommand(ctx, commandID, exec.LastEid, func(event StreamEvent) error { + return processStreamEvent(exec, event, handlers) + }) + } + + if streamErr == nil { + return exec, nil + } + + if exec.ID != "" { + commandID = exec.ID + } + + var apiErr *APIError + if errors.As(streamErr, &apiErr) { + if apiErr.StatusCode == 409 && attempt < maxResumeRetries { + if err := retrySleep(ctx, 1*time.Second); err != nil { + return exec, err + } + continue + } + if apiErr.StatusCode == 404 && attempt > 0 { + return exec, nil + } + return exec, streamErr + } + + if commandID == "" || attempt >= maxResumeRetries || !isNetworkError(streamErr) { + return exec, streamErr + } } + return exec, nil } diff --git a/sdks/sandbox/javascript/src/adapters/commandsAdapter.ts b/sdks/sandbox/javascript/src/adapters/commandsAdapter.ts index 602723f75..c94fbb9ac 100644 --- a/sdks/sandbox/javascript/src/adapters/commandsAdapter.ts +++ b/sdks/sandbox/javascript/src/adapters/commandsAdapter.ts @@ -15,6 +15,7 @@ import type { ExecdClient } from "../openapi/execdClient.js"; import { throwOnOpenApiFetchError } from "./openapiError.js"; import { parseJsonEventStream } from "./sse.js"; +import { SandboxApiException } from "../core/exceptions.js"; import type { paths as ExecdPaths } from "../api/execd.js"; import type { CommandExecution, @@ -105,12 +106,26 @@ function inferForegroundExitCode(execution: CommandExecution): number | null { : null; } +const MAX_RESUME_RETRIES = 3; + function assertNonBlank(value: string, field: string): void { if (!value.trim()) { throw new Error(`${field} cannot be empty`); } } +function isNetworkError(err: unknown): boolean { + if (err instanceof DOMException && err.name === "AbortError") return false; + if (err instanceof TypeError) return true; + if (err instanceof Error) { + const msg = err.message.toLowerCase(); + if (msg.includes("fetch failed") || msg.includes("econnreset") || + msg.includes("socket") || msg.includes("connect") || + msg.includes("network")) return true; + } + return false; +} + function parseOptionalDate(value: unknown, field: string): Date | undefined { if (value == null) return undefined; if (value instanceof Date) return value; @@ -192,6 +207,30 @@ export class CommandsAdapter implements ExecdCommands { } } + private async *resumeStream( + commandId: string, + afterEid: number, + signal?: AbortSignal, + ): AsyncIterable { + const url = joinUrl( + this.opts.baseUrl, + `/command/${encodeURIComponent(commandId)}/resume?after_eid=${afterEid}`, + ); + const res = await this.fetch(url, { + method: "GET", + headers: { + accept: "text/event-stream", + ...(this.opts.headers ?? {}), + }, + signal, + }); + for await (const ev of parseJsonEventStream(res, { + fallbackErrorMessage: "Resume command failed", + })) { + yield ev; + } + } + private async consumeExecutionStream( stream: AsyncIterable, handlers?: ExecutionHandlers, @@ -201,6 +240,18 @@ export class CommandsAdapter implements ExecdCommands { logs: { stdout: [], stderr: [] }, result: [], }; + await this.consumeExecutionInto(stream, execution, handlers); + if (inferExitCode) { + execution.exitCode = inferForegroundExitCode(execution); + } + return execution; + } + + private async consumeExecutionInto( + stream: AsyncIterable, + execution: CommandExecution, + handlers?: ExecutionHandlers, + ): Promise { const dispatcher = new ExecutionEventDispatcher(execution, handlers); for await (const ev of stream) { if (ev.type === "init" && (ev.text ?? "") === "" && execution.id) { @@ -208,12 +259,6 @@ export class CommandsAdapter implements ExecdCommands { } await dispatcher.dispatch(ev as any); } - - if (inferExitCode) { - execution.exitCode = inferForegroundExitCode(execution); - } - - return execution; } async interrupt(sessionId: string): Promise { @@ -280,11 +325,46 @@ export class CommandsAdapter implements ExecdCommands { handlers?: ExecutionHandlers, signal?: AbortSignal, ): Promise { - return this.consumeExecutionStream( - this.runStream(command, opts, signal), - handlers, - !opts?.background, - ); + const inferExitCode = !opts?.background; + const execution: CommandExecution = { + logs: { stdout: [], stderr: [] }, + result: [], + }; + let commandId: string | undefined; + + for (let attempt = 0; attempt <= MAX_RESUME_RETRIES; attempt++) { + try { + const stream = attempt === 0 + ? this.runStream(command, opts, signal) + : this.resumeStream(commandId!, execution.lastEid ?? 0, signal); + + await this.consumeExecutionInto(stream, execution, handlers); + + if (inferExitCode) { + execution.exitCode = inferForegroundExitCode(execution); + } + return execution; + } catch (err) { + if (execution.id) commandId = execution.id; + + if (err instanceof SandboxApiException) { + if (err.statusCode === 409 && attempt < MAX_RESUME_RETRIES) { + await new Promise(resolve => setTimeout(resolve, 1000)); + continue; + } + if (err.statusCode === 404 && attempt > 0) { + return execution; + } + throw err; + } + + if (!commandId || attempt >= MAX_RESUME_RETRIES || !isNetworkError(err)) { + throw err; + } + } + } + + return execution; } async createSession(options?: { workingDirectory?: string }): Promise { diff --git a/sdks/sandbox/javascript/src/api/execd.ts b/sdks/sandbox/javascript/src/api/execd.ts index 03fb65517..475ad7cf1 100644 --- a/sdks/sandbox/javascript/src/api/execd.ts +++ b/sdks/sandbox/javascript/src/api/execd.ts @@ -266,6 +266,33 @@ export interface paths { patch?: never; trace?: never; }; + "/command/{id}/resume": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Resume command SSE stream (replay and optional live tail) + * @description Replays buffered events (stdout, stderr, execution_complete, execution_error) from + * the server-side ring buffer for events with `eid` strictly greater than + * `after_eid`, then—if the command is still running and no other client holds the + * primary SSE slot—continues streaming live events until completion or client + * disconnect. Event shape matches `POST /command` (`ServerStreamEvent`). + * + * This endpoint is mutually exclusive with the primary `POST /command` SSE: if that + * connection is still active, the server responds with 409 Conflict. + */ + get: operations["resumeCommandStream"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/command/{id}/logs": { parameters: { query?: never; @@ -948,6 +975,21 @@ export interface components { "application/json": components["schemas"]["ErrorResponse"]; }; }; + /** @description Request conflicts with current server state (e.g. resource in use) */ + Conflict: { + headers: { + [name: string]: unknown; + }; + content: { + /** + * @example { + * "code": "INVALID_REQUEST_BODY", + * "message": "primary SSE stream is still active; disconnect it before resuming" + * } + */ + "application/json": components["schemas"]["ErrorResponse"]; + }; + }; /** @description Runtime server error during operation */ InternalServerError: { headers: { @@ -1342,6 +1384,46 @@ export interface operations { 500: components["responses"]["InternalServerError"]; }; }; + resumeCommandStream: { + parameters: { + query?: { + /** + * @description Only events with `eid` greater than this value are replayed. All event types + * (stdout, stderr, execution_complete, execution_error) carry monotonically + * increasing `eid` values. Omit or use `0` to replay from the oldest buffered + * events. After replay, if the command is still running and the primary SSE slot + * is free, live events continue to stream. + * @example 42 + */ + after_eid?: number; + }; + header?: never; + path: { + /** + * @description Command ID returned by RunCommand + * @example cmd-abc123 + */ + id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Stream of command execution events (replay then optional live continuation) */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "text/event-stream": components["schemas"]["ServerStreamEvent"]; + }; + }; + 400: components["responses"]["BadRequest"]; + 404: components["responses"]["NotFound"]; + 409: components["responses"]["Conflict"]; + 500: components["responses"]["InternalServerError"]; + }; + }; getBackgroundCommandLogs: { parameters: { query?: { diff --git a/sdks/sandbox/javascript/src/models/execd.ts b/sdks/sandbox/javascript/src/models/execd.ts index 992dd8941..4f5930924 100644 --- a/sdks/sandbox/javascript/src/models/execd.ts +++ b/sdks/sandbox/javascript/src/models/execd.ts @@ -35,6 +35,8 @@ export interface ServerStreamEvent extends Record { text?: string; results?: Record; error?: Record; + /** Monotonic event ID for SSE resume. */ + eid?: number; } export interface CodeContextRequest extends Record { diff --git a/sdks/sandbox/javascript/src/models/execution.ts b/sdks/sandbox/javascript/src/models/execution.ts index 144236dc3..a527270e5 100644 --- a/sdks/sandbox/javascript/src/models/execution.ts +++ b/sdks/sandbox/javascript/src/models/execution.ts @@ -55,6 +55,8 @@ export interface Execution { error?: ExecutionError; complete?: ExecutionComplete; exitCode?: number | null; + /** Highest event ID seen, used for SSE resume. */ + lastEid?: number; } export interface ExecutionHandlers { diff --git a/sdks/sandbox/javascript/src/models/executionEventDispatcher.ts b/sdks/sandbox/javascript/src/models/executionEventDispatcher.ts index 303fdcc04..4ad887268 100644 --- a/sdks/sandbox/javascript/src/models/executionEventDispatcher.ts +++ b/sdks/sandbox/javascript/src/models/executionEventDispatcher.ts @@ -35,6 +35,10 @@ export class ExecutionEventDispatcher { ) {} async dispatch(ev: ServerStreamEvent): Promise { + if (ev.eid != null && ev.eid > (this.execution.lastEid ?? 0)) { + this.execution.lastEid = ev.eid; + } + await this.handlers?.onEvent?.(ev); const ts = ev.timestamp ?? Date.now(); diff --git a/sdks/sandbox/javascript/tests/commands.run.test.mjs b/sdks/sandbox/javascript/tests/commands.run.test.mjs index e05ef98d7..d55624e82 100644 --- a/sdks/sandbox/javascript/tests/commands.run.test.mjs +++ b/sdks/sandbox/javascript/tests/commands.run.test.mjs @@ -113,6 +113,62 @@ test("CommandsAdapter.runInSession sends command and timeout fields", async () = assert.equal(execution.exitCode, 0); }); +test("CommandsAdapter.run auto-resumes on SSE disconnect", async () => { + const initialEvents = [ + { type: "init", text: "cmd-resume", timestamp: 1, eid: 1 }, + { type: "stdout", text: "before-disconnect", timestamp: 2, eid: 2 }, + ]; + const resumeEvents = [ + { type: "stdout", text: "after-resume", timestamp: 3, eid: 3 }, + { type: "execution_complete", timestamp: 4, execution_time: 10, eid: 4 }, + ]; + + let callCount = 0; + const fetchImpl = async (url, init) => { + callCount++; + if (callCount === 1) { + const body = initialEvents.map(e => `data: ${JSON.stringify(e)}`).join("\n") + "\n"; + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(new TextEncoder().encode(body)); + }, + pull(controller) { + controller.error(new TypeError("fetch failed")); + }, + }); + return new Response(stream, { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + // Resume call + assert.ok(url.includes("/resume?after_eid=2"), `unexpected resume URL: ${url}`); + const body = resumeEvents.map(e => `data: ${JSON.stringify(e)}`).join("\n") + "\n\n"; + return new Response(body, { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + }; + + const adapter = new CommandsAdapter( + {}, + { + baseUrl: "http://127.0.0.1:8080", + fetch: fetchImpl, + }, + ); + + const execution = await adapter.run("echo test"); + + assert.equal(execution.id, "cmd-resume"); + assert.equal(execution.logs.stdout.length, 2); + assert.equal(execution.logs.stdout[0].text, "before-disconnect"); + assert.equal(execution.logs.stdout[1].text, "after-resume"); + assert.equal(execution.lastEid, 4); + assert.equal(execution.exitCode, 0); + assert.equal(callCount, 2); +}); + test("CommandsAdapter.runInSession infers non-zero exitCode from final error state", async () => { const adapter = createAdapter( [ diff --git a/sdks/sandbox/kotlin/sandbox-api/src/main/kotlin/com/alibaba/opensandbox/sandbox/api/models/execd/ExecutionModels.kt b/sdks/sandbox/kotlin/sandbox-api/src/main/kotlin/com/alibaba/opensandbox/sandbox/api/models/execd/ExecutionModels.kt index 91efc4ada..0051fa9a2 100644 --- a/sdks/sandbox/kotlin/sandbox-api/src/main/kotlin/com/alibaba/opensandbox/sandbox/api/models/execd/ExecutionModels.kt +++ b/sdks/sandbox/kotlin/sandbox-api/src/main/kotlin/com/alibaba/opensandbox/sandbox/api/models/execd/ExecutionModels.kt @@ -33,6 +33,8 @@ data class EventNode( @SerialName("execution_count") val executionCount: Long? = null, val error: ErrorData? = null, + @SerialName("eid") + val eid: Long? = null, ) @Serializable diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/HttpClientProvider.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/HttpClientProvider.kt index 93f9e5391..0f04ed981 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/HttpClientProvider.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/HttpClientProvider.kt @@ -81,12 +81,15 @@ class HttpClientProvider( .writeTimeout(config.requestTimeout.toMillis(), TimeUnit.MILLISECONDS) .callTimeout(0, TimeUnit.MILLISECONDS) .addInterceptor(ExtraHeadersInterceptor(getSseHeaders())) + .apply { sseInterceptors.forEach { addInterceptor(it) } } .addLoggingInterceptor() .build() } val sseClient: OkHttpClient by sseClientLazy + internal val sseInterceptors: MutableList = mutableListOf() + // --- Helper Extensions --- private fun OkHttpClient.Builder.applyStandardTimeouts(): OkHttpClient.Builder { diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/ExecutionModels.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/ExecutionModels.kt index a46b0938f..9c93ec8ea 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/ExecutionModels.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/ExecutionModels.kt @@ -39,6 +39,7 @@ class Execution( var complete: ExecutionComplete? = null, var exitCode: Int? = null, val logs: ExecutionLogs = ExecutionLogs(), + var lastEid: Long = 0L, ) { /** * Adds a new execution result to this execution. diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionEventDispatcher.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionEventDispatcher.kt index 0bef96ef7..daca42d04 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionEventDispatcher.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionEventDispatcher.kt @@ -30,6 +30,11 @@ class ExecutionEventDispatcher( private val handlers: ExecutionHandlers? = null, ) { fun dispatch(eventNode: EventNode) { + eventNode.eid?.let { eid -> + if (eid > execution.lastEid) { + execution.lastEid = eid + } + } val type = eventNode.type val timestamp = eventNode.timestamp when (type) { diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt index 76d2b249e..416959971 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt @@ -50,6 +50,7 @@ import okhttp3.Request import okhttp3.RequestBody.Companion.toRequestBody import okhttp3.Response import org.slf4j.LoggerFactory +import java.io.IOException import com.alibaba.opensandbox.sandbox.api.models.execd.CreateSessionRequest as CreateSessionRequestApi import com.alibaba.opensandbox.sandbox.api.models.execd.RunInSessionRequest as RunInSessionRequestApi @@ -64,6 +65,8 @@ internal class CommandsAdapter( companion object { private const val RUN_COMMAND_PATH = "/command" private const val SESSION_PATH_SEGMENT = "session" + private const val RESUME_PATH = "/command/%s/resume" + private const val MAX_RESUME_RETRIES = 3 } private val logger = LoggerFactory.getLogger(CommandsAdapter::class.java) @@ -88,28 +91,68 @@ internal class CommandsAdapter( if (request.command.isEmpty()) { throw InvalidArgumentException("Command cannot be empty") } - try { - val httpRequest = - Request.Builder() - .url("$execdBaseUrl$RUN_COMMAND_PATH") - .post( - jsonParser.encodeToString(request.toApiRunCommandRequest()).toRequestBody("application/json".toMediaType()), - ) - .headers(execdEndpoint.headers.toHeaders()) - .build() - return executeStreamingRequest( - httpRequest = httpRequest, - handlers = request.handlers, - inferExitCode = !request.background, - failureMessage = { statusCode, errorBody -> - "Failed to run commands. Status code: $statusCode, Body: $errorBody" - }, - ) - } catch (e: Exception) { - logger.error("Failed to run command (length: {})", request.command.length, e) - throw e.toSandboxException() + val execution = Execution() + var commandID: String? = null + + for (attempt in 0..MAX_RESUME_RETRIES) { + try { + val httpRequest = if (attempt == 0) { + Request.Builder() + .url("$execdBaseUrl$RUN_COMMAND_PATH") + .post( + jsonParser.encodeToString(request.toApiRunCommandRequest()) + .toRequestBody("application/json".toMediaType()), + ) + .headers(execdEndpoint.headers.toHeaders()) + .build() + } else { + val resumeUrl = RESUME_PATH.format(commandID!!) + Request.Builder() + .url("$execdBaseUrl$resumeUrl?after_eid=${execution.lastEid}") + .get() + .headers(execdEndpoint.headers.toHeaders()) + .build() + } + + streamEvents( + httpRequest = httpRequest, + execution = execution, + handlers = request.handlers, + failureMessage = { statusCode, errorBody -> + "Failed to run commands. Status code: $statusCode, Body: $errorBody" + }, + ) + + if (!request.background) { + execution.exitCode = inferForegroundExitCode(execution) + } + return execution + } catch (e: Exception) { + if (execution.id != null) { + commandID = execution.id + } + + if (e is SandboxApiException) { + if (e.statusCode == 409 && attempt < MAX_RESUME_RETRIES) { + retrySleep(1000L) + continue + } + if (e.statusCode == 404 && attempt > 0) { + return execution + } + logger.error("Failed to run command (length: {})", request.command.length, e) + throw e + } + + if (commandID == null || attempt >= MAX_RESUME_RETRIES || !isNetworkError(e)) { + logger.error("Failed to run command (length: {})", request.command.length, e) + throw e.toSandboxException() + } + } } + + return execution } override fun interrupt(executionId: String) { @@ -216,14 +259,17 @@ internal class CommandsAdapter( .headers(execdEndpoint.headers.toHeaders()) .build() - return executeStreamingRequest( + val execution = Execution() + streamEvents( httpRequest = httpRequest, + execution = execution, handlers = request.handlers, - inferExitCode = true, failureMessage = { statusCode, errorBody -> "run_in_session failed. Status: $statusCode, Body: $errorBody" }, ) + execution.exitCode = inferForegroundExitCode(execution) + return execution } catch (e: Exception) { logger.error("Failed to run in session", e) throw e.toSandboxException() @@ -242,14 +288,12 @@ internal class CommandsAdapter( } } - private fun executeStreamingRequest( + private fun streamEvents( httpRequest: Request, + execution: Execution, handlers: ExecutionHandlers?, - inferExitCode: Boolean, failureMessage: (Int, String?) -> String, - ): Execution { - val execution = Execution() - + ) { httpClientProvider.sseClient.newCall(httpRequest).execute().use { response -> ensureSuccessfulStreamingResponse(response, failureMessage) @@ -266,11 +310,6 @@ internal class CommandsAdapter( } } } - - if (inferExitCode) { - execution.exitCode = inferForegroundExitCode(execution) - } - return execution } private fun ensureSuccessfulStreamingResponse( @@ -325,4 +364,27 @@ internal class CommandsAdapter( if (execution.complete != null) 0 else null } } + + private fun isNetworkError(e: Exception): Boolean { + if (e is IOException) return true + var cause: Throwable? = e.cause + while (cause != null) { + if (cause is IOException) return true + cause = cause.cause + } + return false + } + + private fun retrySleep(millis: Long) { + try { + Thread.sleep(millis) + } catch (e: InterruptedException) { + Thread.currentThread().interrupt() + throw SandboxApiException( + message = "Interrupted during SSE resume retry sleep", + statusCode = 0, + error = SandboxError(UNEXPECTED_RESPONSE), + ) + } + } } diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt index 0985dbe3b..8df91fc25 100644 --- a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt @@ -32,8 +32,10 @@ import kotlinx.serialization.json.jsonObject import kotlinx.serialization.json.jsonPrimitive import okhttp3.mockwebserver.MockResponse import okhttp3.mockwebserver.MockWebServer +import okhttp3.mockwebserver.SocketPolicy import org.junit.jupiter.api.AfterEach import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertNotNull import org.junit.jupiter.api.Assertions.assertThrows import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.BeforeEach @@ -412,4 +414,82 @@ data: {"type":"execution_complete","execution_time":100,"timestamp":167253120100 val ex = assertThrows(InvalidArgumentException::class.java) { commandsAdapter.deleteSession(" ") } assertEquals("session_id cannot be empty", ex.message) } + + // ----------------------------------------------------------------------- + // SSE resume test + // ----------------------------------------------------------------------- + + @Test + fun `run should auto-resume on SSE disconnect`() { + val totalLines = 40 + val allEvents = buildString { + appendLine("""{"type":"init","text":"cmd-resume","timestamp":1672531200000,"eid":1}""") + for (i in 1..totalLines) { + appendLine("""{"type":"stdout","text":"line$i","timestamp":1672531200000,"eid":${i + 1}}""") + } + appendLine("""{"type":"execution_complete","execution_time":100,"timestamp":1672531201000,"eid":${totalLines + 2}}""") + } + + var resumeCalled = false + var lastAfterEid: Long? = null + var postRequestCount = 0 + + mockWebServer.dispatcher = object : okhttp3.mockwebserver.Dispatcher() { + override fun dispatch(request: okhttp3.mockwebserver.RecordedRequest): MockResponse { + return when { + request.method == "POST" && request.path == "/command" -> { + postRequestCount++ + MockResponse() + .setResponseCode(200) + .setBody(allEvents) + .setSocketPolicy(SocketPolicy.DISCONNECT_DURING_RESPONSE_BODY) + } + request.method == "GET" && request.path!!.contains("/resume") -> { + resumeCalled = true + lastAfterEid = request.requestUrl!!.queryParameter("after_eid")!!.toLong() + val startEid = lastAfterEid!! + 1 + val remaining = buildString { + for (eid in startEid..(totalLines + 1)) { + appendLine("""{"type":"stdout","text":"line${eid - 1}","timestamp":1672531200000,"eid":$eid}""") + } + appendLine("""{"type":"execution_complete","execution_time":100,"timestamp":1672531201000,"eid":${totalLines + 2}}""") + } + MockResponse() + .setResponseCode(200) + .setBody(remaining) + } + else -> MockResponse().setResponseCode(404) + } + } + } + + val receivedLines = mutableSetOf() + val latch = CountDownLatch(1) + val handlers = ExecutionHandlers.builder() + .onStdout { msg -> receivedLines.add(msg.text) } + .onExecutionComplete { latch.countDown() } + .build() + + val request = RunCommandRequest.builder() + .command("for i in 1..40; do echo line\$i; done") + .handlers(handlers) + .build() + + val execution = commandsAdapter.run(request) + + assertTrue(resumeCalled, "Expected resume to be called after disconnect") + assertTrue(latch.await(5, TimeUnit.SECONDS), "Timed out waiting for completion") + assertEquals(totalLines, receivedLines.size, + "Expected all $totalLines lines, got ${receivedLines.size}: $receivedLines") + for (i in 1..totalLines) { + assertTrue(receivedLines.contains("line$i"), "Missing line$i") + } + assertEquals("cmd-resume", execution.id) + assertEquals(0, execution.exitCode) + assertNotNull(execution.complete) + assertEquals(1, postRequestCount, "Should send exactly one POST /command") + assertNotNull(lastAfterEid) + assertTrue(lastAfterEid!! >= 1, "after_eid should be >= 1, got $lastAfterEid") + assertTrue(execution.lastEid >= totalLines + 1) + } } From ac2382e6f94c010f348544b64a0ab4f9ab34703a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 3 May 2026 13:24:35 +0800 Subject: [PATCH 20/20] fix(sdk): break long line in Kotlin CommandsAdapterTest to satisfy ktlint Line exceeded 140-char max-line-length rule. Split the long appendLine call with embedded JSON string across multiple lines. Co-Authored-By: Claude Opus 4.7 --- .../infrastructure/adapters/service/CommandsAdapterTest.kt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt index 8df91fc25..c69f7e176 100644 --- a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt @@ -452,7 +452,9 @@ data: {"type":"execution_complete","execution_time":100,"timestamp":167253120100 for (eid in startEid..(totalLines + 1)) { appendLine("""{"type":"stdout","text":"line${eid - 1}","timestamp":1672531200000,"eid":$eid}""") } - appendLine("""{"type":"execution_complete","execution_time":100,"timestamp":1672531201000,"eid":${totalLines + 2}}""") + appendLine( + """{"type":"execution_complete","execution_time":100,"timestamp":1672531201000,"eid":${totalLines + 2}}""", + ) } MockResponse() .setResponseCode(200)