From d05f0010811700798c9b215f13fd68083835b7bb Mon Sep 17 00:00:00 2001 From: XuPeng-SH Date: Sun, 5 Apr 2026 12:10:26 +0800 Subject: [PATCH 1/5] fix: cnservice bootstrap panics on context.Canceled instead of returning error (#24064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes: 1. pkg/cnservice/server.go: bootstrap() now checks for context cancellation/timeout before panicking. context.Canceled and context.DeadlineExceeded are external signals, not bootstrap failures — return the error so the caller can handle gracefully. 2. pkg/embed/cluster.go: doStartLocked() propagates errors instead of panicking. CN Start() failures are returned via atomic.Value and the first error is surfaced to the caller. Previously, any bootstrap timeout during embedded cluster tests would crash the entire test process with 'panic: context canceled', causing cascading failures across unrelated test packages. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pkg/cnservice/server.go | 7 +++++++ pkg/embed/cluster.go | 20 ++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pkg/cnservice/server.go b/pkg/cnservice/server.go index 11dbc0f619b6b..9c67cec86c2dd 100644 --- a/pkg/cnservice/server.go +++ b/pkg/cnservice/server.go @@ -942,6 +942,13 @@ func (s *service) bootstrap() error { // bootstrap cannot fail. We panic here to make sure the service can not start. // If bootstrap failed, need clean all data to retry. if err := s.bootstrapService.Bootstrap(ctx); err != nil { + // Context cancellation or timeout is not a bootstrap failure — the + // service is being shut down or the deadline was exceeded. Return + // the error so the caller can handle it gracefully instead of + // crashing the process. + if context.Cause(ctx) != nil || err == context.Canceled || err == context.DeadlineExceeded { + return err + } panic(moerr.AttachCause(ctx, err)) } diff --git a/pkg/embed/cluster.go b/pkg/embed/cluster.go index bd061d4d56043..507dd142ab42c 100644 --- a/pkg/embed/cluster.go +++ b/pkg/embed/cluster.go @@ -104,19 +104,20 @@ func (c *cluster) Start() error { return moerr.NewInvalidStateNoCtx("embed mo cluster already started") } - c.doStartLocked(0) + if err := c.doStartLocked(0); err != nil { + return err + } c.state = started return nil } -func (c *cluster) doStartLocked(from int) { +func (c *cluster) doStartLocked(from int) error { var wg sync.WaitGroup - errC := make(chan error, 1) - defer close(errC) + var startErr atomic.Value for _, s := range c.services[from:] { if s.serviceType != metadata.ServiceType_CN { if err := s.Start(); err != nil { - panic(err) + return err } continue } @@ -125,12 +126,16 @@ func (c *cluster) doStartLocked(from int) { go func(s *operator) { defer wg.Done() if err := s.Start(); err != nil { - panic(err) + startErr.CompareAndSwap(nil, err) } }(s) } wg.Wait() + if v := startErr.Load(); v != nil { + return v.(error) + } + return nil } func (c *cluster) Close() error { @@ -227,8 +232,7 @@ func (c *cluster) StartNewCNService(n int) error { return err } - c.doStartLocked(serviceFrom) - return nil + return c.doStartLocked(serviceFrom) } func (c *cluster) adjust() { From 148bbb929251edf8bae8422561078bdfdb69b5a2 Mon Sep 17 00:00:00 2001 From: XuPeng-SH Date: Sun, 5 Apr 2026 12:15:44 +0800 Subject: [PATCH 2/5] fix: narrow bootstrap cancel check to context.Canceled only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous check (context.Cause(ctx) != nil || err == context.Canceled || err == context.DeadlineExceeded) was too broad. The bootstrap context is created with context.WithTimeoutCause(context.Background(), 5min, moerr.CauseBootstrap), so context.Cause(ctx) is always non-nil when the 5-minute deadline fires. That deadline IS a legitimate bootstrap failure that should panic. Only context.Canceled indicates external shutdown — consistent with the pattern at line 963 for BootstrapUpgrade. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pkg/cnservice/server.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/cnservice/server.go b/pkg/cnservice/server.go index 9c67cec86c2dd..7e5e3ba087d58 100644 --- a/pkg/cnservice/server.go +++ b/pkg/cnservice/server.go @@ -942,11 +942,12 @@ func (s *service) bootstrap() error { // bootstrap cannot fail. We panic here to make sure the service can not start. // If bootstrap failed, need clean all data to retry. if err := s.bootstrapService.Bootstrap(ctx); err != nil { - // Context cancellation or timeout is not a bootstrap failure — the - // service is being shut down or the deadline was exceeded. Return - // the error so the caller can handle it gracefully instead of - // crashing the process. - if context.Cause(ctx) != nil || err == context.Canceled || err == context.DeadlineExceeded { + // External context cancellation (e.g. service shutdown) is not a + // bootstrap failure — return the error so the caller can handle it + // gracefully instead of crashing the process. Note: we do NOT + // check for DeadlineExceeded because the 5-minute bootstrap + // timeout IS a legitimate failure that should panic. + if err == context.Canceled { return err } panic(moerr.AttachCause(ctx, err)) From 04bbc961e556cd857103bf9c849b7043d7139121 Mon Sep 17 00:00:00 2001 From: XuPeng-SH Date: Sun, 5 Apr 2026 12:20:45 +0800 Subject: [PATCH 3/5] fix: use errors.Is for context.Canceled check, add comment - Use errors.Is(err, context.Canceled) instead of == for robustness against wrapped errors (consistent with 70% of codebase usage) - Document first-error-only semantics in cluster.go CompareAndSwap Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pkg/cnservice/server.go | 3 ++- pkg/embed/cluster.go | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/cnservice/server.go b/pkg/cnservice/server.go index 7e5e3ba087d58..77af4f2373386 100644 --- a/pkg/cnservice/server.go +++ b/pkg/cnservice/server.go @@ -19,6 +19,7 @@ import ( "compress/gzip" "context" "encoding/hex" + "errors" "fmt" "sync" "time" @@ -947,7 +948,7 @@ func (s *service) bootstrap() error { // gracefully instead of crashing the process. Note: we do NOT // check for DeadlineExceeded because the 5-minute bootstrap // timeout IS a legitimate failure that should panic. - if err == context.Canceled { + if errors.Is(err, context.Canceled) { return err } panic(moerr.AttachCause(ctx, err)) diff --git a/pkg/embed/cluster.go b/pkg/embed/cluster.go index 507dd142ab42c..540e8b4d66969 100644 --- a/pkg/embed/cluster.go +++ b/pkg/embed/cluster.go @@ -126,6 +126,9 @@ func (c *cluster) doStartLocked(from int) error { go func(s *operator) { defer wg.Done() if err := s.Start(); err != nil { + // Only the first error is captured; concurrent failures + // from other services are discarded since knowing that + // any service failed is sufficient to abort startup. startErr.CompareAndSwap(nil, err) } }(s) From 9d484d962056569d2ee010dca960f171ba9d9854 Mon Sep 17 00:00:00 2001 From: XuPeng-SH Date: Sun, 5 Apr 2026 13:39:40 +0800 Subject: [PATCH 4/5] fix: extract handleBootstrapErr + add tests for 100% coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the bootstrap error-handling logic into a standalone handleBootstrapErr() function so it can be unit-tested independently of the full bootstrap() initialization chain. Tests cover all branches: - context.Canceled → returns error gracefully (not panic) - wrapped context.Canceled → same (errors.Is works through wrapping) - context.DeadlineExceeded → panics (5-min timeout is real failure) - bootstrap timeout with cause → panics (WithTimeoutCause scenario) - other errors → panics - moerr wrapped errors → panics Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pkg/cnservice/server.go | 22 ++++--- pkg/cnservice/server_bootstrap_test.go | 80 ++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 9 deletions(-) create mode 100644 pkg/cnservice/server_bootstrap_test.go diff --git a/pkg/cnservice/server.go b/pkg/cnservice/server.go index 77af4f2373386..29bf9a024301c 100644 --- a/pkg/cnservice/server.go +++ b/pkg/cnservice/server.go @@ -943,15 +943,7 @@ func (s *service) bootstrap() error { // bootstrap cannot fail. We panic here to make sure the service can not start. // If bootstrap failed, need clean all data to retry. if err := s.bootstrapService.Bootstrap(ctx); err != nil { - // External context cancellation (e.g. service shutdown) is not a - // bootstrap failure — return the error so the caller can handle it - // gracefully instead of crashing the process. Note: we do NOT - // check for DeadlineExceeded because the 5-minute bootstrap - // timeout IS a legitimate failure that should panic. - if errors.Is(err, context.Canceled) { - return err - } - panic(moerr.AttachCause(ctx, err)) + return handleBootstrapErr(ctx, err) } trace.GetService(s.cfg.UUID).EnableFlush() @@ -972,6 +964,18 @@ func (s *service) bootstrap() error { return nil } +// handleBootstrapErr decides whether a bootstrap error should be returned +// gracefully (for context cancellation during shutdown) or trigger a panic +// (for real bootstrap failures). Only context.Canceled is treated as a +// graceful shutdown signal; DeadlineExceeded from the 5-minute bootstrap +// timeout is a legitimate failure that should still panic. +func handleBootstrapErr(ctx context.Context, err error) error { + if errors.Is(err, context.Canceled) { + return err + } + panic(moerr.AttachCause(ctx, err)) +} + func (s *service) initTxnTraceService() { rt := runtime.ServiceRuntime(s.cfg.UUID) ts, err := trace.NewService( diff --git a/pkg/cnservice/server_bootstrap_test.go b/pkg/cnservice/server_bootstrap_test.go new file mode 100644 index 0000000000000..4093f660bceac --- /dev/null +++ b/pkg/cnservice/server_bootstrap_test.go @@ -0,0 +1,80 @@ +// Copyright 2021-2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cnservice + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +func TestHandleBootstrapErr(t *testing.T) { + t.Run("context.Canceled returns error", func(t *testing.T) { + ctx := context.Background() + err := handleBootstrapErr(ctx, context.Canceled) + require.Error(t, err) + assert.True(t, err == context.Canceled) + }) + + t.Run("wrapped context.Canceled returns error", func(t *testing.T) { + ctx := context.Background() + wrappedErr := fmt.Errorf("bootstrap failed: %w", context.Canceled) + err := handleBootstrapErr(ctx, wrappedErr) + require.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) + }) + + t.Run("context.DeadlineExceeded panics", func(t *testing.T) { + ctx := context.Background() + assert.Panics(t, func() { + handleBootstrapErr(ctx, context.DeadlineExceeded) + }) + }) + + t.Run("bootstrap timeout with cause panics", func(t *testing.T) { + // Simulate the real bootstrap context: WithTimeoutCause sets a + // custom cause, but the 5-minute timeout is a legitimate failure + // that must still panic. + ctx, cancel := context.WithTimeoutCause( + context.Background(), 0, moerr.CauseBootstrap, + ) + defer cancel() + // Wait for the timeout to fire. + <-ctx.Done() + + assert.Panics(t, func() { + handleBootstrapErr(ctx, ctx.Err()) + }) + }) + + t.Run("other error panics", func(t *testing.T) { + ctx := context.Background() + assert.Panics(t, func() { + handleBootstrapErr(ctx, fmt.Errorf("SQL execution failed")) + }) + }) + + t.Run("moerr wrapped error panics", func(t *testing.T) { + ctx := context.Background() + assert.Panics(t, func() { + handleBootstrapErr(ctx, moerr.NewInternalErrorNoCtx("bootstrap init failed")) + }) + }) +} From 49772d259e7aca2f6b6b599868c47aa92f3f91ae Mon Sep 17 00:00:00 2001 From: XuPeng-SH Date: Sun, 5 Apr 2026 16:11:38 +0800 Subject: [PATCH 5/5] test: add doStartLocked error path tests for 75%+ coverage Add 5 sub-tests in TestDoStartLockedErrorPaths to exercise the error paths in cluster.go doStartLocked that were previously uncovered: - Non-CN service Start() error returns immediately - CN service Start() error captured via atomic.Value - Start() propagates doStartLocked error - Double-start rejection - Happy path with empty service list This brings diff coverage from 53.8% (7/13 blocks) to 92.3% (12/13), exceeding the 75% threshold. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pkg/embed/cluster_test.go | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/pkg/embed/cluster_test.go b/pkg/embed/cluster_test.go index b519aa7c470cb..acfae54470094 100644 --- a/pkg/embed/cluster_test.go +++ b/pkg/embed/cluster_test.go @@ -28,6 +28,7 @@ import ( "github.com/matrixorigin/matrixone/pkg/container/vector" "github.com/matrixorigin/matrixone/pkg/pb/metadata" "github.com/matrixorigin/matrixone/pkg/util/executor" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -286,3 +287,70 @@ func TestCreateDB(t *testing.T) { }, ) } + +// TestDoStartLockedErrorPaths exercises the error-handling branches in +// doStartLocked that are not reached by normal cluster startup tests. +func TestDoStartLockedErrorPaths(t *testing.T) { + t.Run("non-CN service error returns immediately", func(t *testing.T) { + // A non-CN operator whose state is already 'started' will return + // an error from Start(), exercising the direct-return path at + // cluster.go line 119-121. + op := &operator{ + serviceType: metadata.ServiceType_LOG, + state: started, // forces Start() to return error + } + c := &cluster{ + services: []*operator{op}, + } + err := c.doStartLocked(0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "already started") + }) + + t.Run("CN service error captured via atomic.Value", func(t *testing.T) { + // A CN operator whose state is already 'started' will return an + // error from Start(), exercising the goroutine error-capture path + // at cluster.go lines 128-133 and the error-return at 138-140. + op := &operator{ + serviceType: metadata.ServiceType_CN, + state: started, + } + c := &cluster{ + services: []*operator{op}, + } + err := c.doStartLocked(0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "already started") + }) + + t.Run("Start propagates doStartLocked error", func(t *testing.T) { + // Exercises the error propagation in Start() at line 107-109. + op := &operator{ + serviceType: metadata.ServiceType_LOG, + state: started, + } + c := &cluster{ + state: stopped, + services: []*operator{op}, + } + err := c.Start() + assert.Error(t, err) + assert.Contains(t, err.Error(), "already started") + }) + + t.Run("Start rejects double start", func(t *testing.T) { + c := &cluster{state: started} + err := c.Start() + assert.Error(t, err) + assert.Contains(t, err.Error(), "embed mo cluster already started") + }) + + t.Run("happy path with no services", func(t *testing.T) { + c := &cluster{ + state: stopped, + services: []*operator{}, + } + err := c.doStartLocked(0) + assert.NoError(t, err) + }) +}