Skip to content

Commit 78f7b42

Browse files
authored
fix: cnservice bootstrap panics on context.Canceled instead of returning error (#24065)
### Problem `cnservice.(*service).bootstrap()` panics on ALL bootstrap errors, including `context.Canceled` and `context.DeadlineExceeded`. This crashes the entire process when the 5-minute bootstrap timeout expires, which frequently happens during embedded cluster tests with multiple CNs. Additionally, `embed/cluster.go:doStartLocked()` panics on CN Start() errors instead of propagating them to the caller. ### Root Cause ```go // server.go:944 — panics on context.Canceled! if err := s.bootstrapService.Bootstrap(ctx); err != nil { panic(moerr.AttachCause(ctx, err)) } ``` Context cancellation is NOT a bootstrap failure — it's an external signal that the operation should stop. The same file already handles `context.Canceled` gracefully for `BootstrapUpgrade` (line 955) but not for initial `Bootstrap`. ### Fix **1. `pkg/cnservice/server.go`:** Check for context cancellation/timeout before panicking. Return error instead of panicking for context-related failures. **2. `pkg/embed/cluster.go`:** Change `doStartLocked()` from `panic(err)` to proper error propagation via `atomic.Value`, returning the first error to the caller. ### Impact - Prevents cascading test failures when embedded cluster bootstrap times out - Production benefit: graceful shutdown won't crash a CN process that's still bootstrapping - Consistent with existing `BootstrapUpgrade` context handling pattern ### Risk Assessment **Very low risk.** Both changes are purely additive: - `server.go`: Only changes the error handling for context-related errors; real bootstrap failures still panic as before - `cluster.go`: Replaces `panic(err)` with `return err` — strictly more graceful behavior Approved by: @LeftHandCold
1 parent 3edb16c commit 78f7b42

4 files changed

Lines changed: 177 additions & 9 deletions

File tree

pkg/cnservice/server.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"compress/gzip"
2020
"context"
2121
"encoding/hex"
22+
"errors"
2223
"fmt"
2324
"sync"
2425
"time"
@@ -942,7 +943,7 @@ func (s *service) bootstrap() error {
942943
// bootstrap cannot fail. We panic here to make sure the service can not start.
943944
// If bootstrap failed, need clean all data to retry.
944945
if err := s.bootstrapService.Bootstrap(ctx); err != nil {
945-
panic(moerr.AttachCause(ctx, err))
946+
return handleBootstrapErr(ctx, err)
946947
}
947948

948949
trace.GetService(s.cfg.UUID).EnableFlush()
@@ -963,6 +964,18 @@ func (s *service) bootstrap() error {
963964
return nil
964965
}
965966

967+
// handleBootstrapErr decides whether a bootstrap error should be returned
968+
// gracefully (for context cancellation during shutdown) or trigger a panic
969+
// (for real bootstrap failures). Only context.Canceled is treated as a
970+
// graceful shutdown signal; DeadlineExceeded from the 5-minute bootstrap
971+
// timeout is a legitimate failure that should still panic.
972+
func handleBootstrapErr(ctx context.Context, err error) error {
973+
if errors.Is(err, context.Canceled) {
974+
return err
975+
}
976+
panic(moerr.AttachCause(ctx, err))
977+
}
978+
966979
func (s *service) initTxnTraceService() {
967980
rt := runtime.ServiceRuntime(s.cfg.UUID)
968981
ts, err := trace.NewService(
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Copyright 2021-2024 Matrix Origin
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package cnservice
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"testing"
21+
22+
"github.com/stretchr/testify/assert"
23+
"github.com/stretchr/testify/require"
24+
25+
"github.com/matrixorigin/matrixone/pkg/common/moerr"
26+
)
27+
28+
func TestHandleBootstrapErr(t *testing.T) {
29+
t.Run("context.Canceled returns error", func(t *testing.T) {
30+
ctx := context.Background()
31+
err := handleBootstrapErr(ctx, context.Canceled)
32+
require.Error(t, err)
33+
assert.True(t, err == context.Canceled)
34+
})
35+
36+
t.Run("wrapped context.Canceled returns error", func(t *testing.T) {
37+
ctx := context.Background()
38+
wrappedErr := fmt.Errorf("bootstrap failed: %w", context.Canceled)
39+
err := handleBootstrapErr(ctx, wrappedErr)
40+
require.Error(t, err)
41+
assert.ErrorIs(t, err, context.Canceled)
42+
})
43+
44+
t.Run("context.DeadlineExceeded panics", func(t *testing.T) {
45+
ctx := context.Background()
46+
assert.Panics(t, func() {
47+
handleBootstrapErr(ctx, context.DeadlineExceeded)
48+
})
49+
})
50+
51+
t.Run("bootstrap timeout with cause panics", func(t *testing.T) {
52+
// Simulate the real bootstrap context: WithTimeoutCause sets a
53+
// custom cause, but the 5-minute timeout is a legitimate failure
54+
// that must still panic.
55+
ctx, cancel := context.WithTimeoutCause(
56+
context.Background(), 0, moerr.CauseBootstrap,
57+
)
58+
defer cancel()
59+
// Wait for the timeout to fire.
60+
<-ctx.Done()
61+
62+
assert.Panics(t, func() {
63+
handleBootstrapErr(ctx, ctx.Err())
64+
})
65+
})
66+
67+
t.Run("other error panics", func(t *testing.T) {
68+
ctx := context.Background()
69+
assert.Panics(t, func() {
70+
handleBootstrapErr(ctx, fmt.Errorf("SQL execution failed"))
71+
})
72+
})
73+
74+
t.Run("moerr wrapped error panics", func(t *testing.T) {
75+
ctx := context.Background()
76+
assert.Panics(t, func() {
77+
handleBootstrapErr(ctx, moerr.NewInternalErrorNoCtx("bootstrap init failed"))
78+
})
79+
})
80+
}

pkg/embed/cluster.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -104,19 +104,20 @@ func (c *cluster) Start() error {
104104
return moerr.NewInvalidStateNoCtx("embed mo cluster already started")
105105
}
106106

107-
c.doStartLocked(0)
107+
if err := c.doStartLocked(0); err != nil {
108+
return err
109+
}
108110
c.state = started
109111
return nil
110112
}
111113

112-
func (c *cluster) doStartLocked(from int) {
114+
func (c *cluster) doStartLocked(from int) error {
113115
var wg sync.WaitGroup
114-
errC := make(chan error, 1)
115-
defer close(errC)
116+
var startErr atomic.Value
116117
for _, s := range c.services[from:] {
117118
if s.serviceType != metadata.ServiceType_CN {
118119
if err := s.Start(); err != nil {
119-
panic(err)
120+
return err
120121
}
121122
continue
122123
}
@@ -125,12 +126,19 @@ func (c *cluster) doStartLocked(from int) {
125126
go func(s *operator) {
126127
defer wg.Done()
127128
if err := s.Start(); err != nil {
128-
panic(err)
129+
// Only the first error is captured; concurrent failures
130+
// from other services are discarded since knowing that
131+
// any service failed is sufficient to abort startup.
132+
startErr.CompareAndSwap(nil, err)
129133
}
130134
}(s)
131135
}
132136

133137
wg.Wait()
138+
if v := startErr.Load(); v != nil {
139+
return v.(error)
140+
}
141+
return nil
134142
}
135143

136144
func (c *cluster) Close() error {
@@ -227,8 +235,7 @@ func (c *cluster) StartNewCNService(n int) error {
227235
return err
228236
}
229237

230-
c.doStartLocked(serviceFrom)
231-
return nil
238+
return c.doStartLocked(serviceFrom)
232239
}
233240

234241
func (c *cluster) adjust() {

pkg/embed/cluster_test.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/matrixorigin/matrixone/pkg/container/vector"
2929
"github.com/matrixorigin/matrixone/pkg/pb/metadata"
3030
"github.com/matrixorigin/matrixone/pkg/util/executor"
31+
"github.com/stretchr/testify/assert"
3132
"github.com/stretchr/testify/require"
3233
)
3334

@@ -286,3 +287,70 @@ func TestCreateDB(t *testing.T) {
286287
},
287288
)
288289
}
290+
291+
// TestDoStartLockedErrorPaths exercises the error-handling branches in
292+
// doStartLocked that are not reached by normal cluster startup tests.
293+
func TestDoStartLockedErrorPaths(t *testing.T) {
294+
t.Run("non-CN service error returns immediately", func(t *testing.T) {
295+
// A non-CN operator whose state is already 'started' will return
296+
// an error from Start(), exercising the direct-return path at
297+
// cluster.go line 119-121.
298+
op := &operator{
299+
serviceType: metadata.ServiceType_LOG,
300+
state: started, // forces Start() to return error
301+
}
302+
c := &cluster{
303+
services: []*operator{op},
304+
}
305+
err := c.doStartLocked(0)
306+
assert.Error(t, err)
307+
assert.Contains(t, err.Error(), "already started")
308+
})
309+
310+
t.Run("CN service error captured via atomic.Value", func(t *testing.T) {
311+
// A CN operator whose state is already 'started' will return an
312+
// error from Start(), exercising the goroutine error-capture path
313+
// at cluster.go lines 128-133 and the error-return at 138-140.
314+
op := &operator{
315+
serviceType: metadata.ServiceType_CN,
316+
state: started,
317+
}
318+
c := &cluster{
319+
services: []*operator{op},
320+
}
321+
err := c.doStartLocked(0)
322+
assert.Error(t, err)
323+
assert.Contains(t, err.Error(), "already started")
324+
})
325+
326+
t.Run("Start propagates doStartLocked error", func(t *testing.T) {
327+
// Exercises the error propagation in Start() at line 107-109.
328+
op := &operator{
329+
serviceType: metadata.ServiceType_LOG,
330+
state: started,
331+
}
332+
c := &cluster{
333+
state: stopped,
334+
services: []*operator{op},
335+
}
336+
err := c.Start()
337+
assert.Error(t, err)
338+
assert.Contains(t, err.Error(), "already started")
339+
})
340+
341+
t.Run("Start rejects double start", func(t *testing.T) {
342+
c := &cluster{state: started}
343+
err := c.Start()
344+
assert.Error(t, err)
345+
assert.Contains(t, err.Error(), "embed mo cluster already started")
346+
})
347+
348+
t.Run("happy path with no services", func(t *testing.T) {
349+
c := &cluster{
350+
state: stopped,
351+
services: []*operator{},
352+
}
353+
err := c.doStartLocked(0)
354+
assert.NoError(t, err)
355+
})
356+
}

0 commit comments

Comments
 (0)