Skip to content

Commit aaf6120

Browse files
leonardocearmrulitaocdl
authored
fix: classify WAL restore errors with precise gRPC statuses (#927)
Map each restorer sentinel to the gRPC status that best reflects whether the operator should retry: - ErrWALNotFound -> NotFound (terminal) - ErrInvalidWalName -> InvalidArgument (terminal) - ErrConnectivity -> Unavailable (retry) - ErrGeneric -> Unavailable (retry) - anything else -> Internal (terminal) Previously every non-NotFound failure was returned verbatim, leaving the operator unable to tell transient blips apart from terminal conditions like a malformed WAL name. Signed-off-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com> Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Signed-off-by: Tao Li <tao.li@enterprisedb.com> Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Co-authored-by: Tao Li <tao.li@enterprisedb.com>
1 parent 8e2a917 commit aaf6120

8 files changed

Lines changed: 230 additions & 12 deletions

File tree

config/crd/bases/barmancloud.cnpg.io_objectstores.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,25 @@ spec:
176176
format: int32
177177
minimum: 1
178178
type: integer
179+
restoreAdditionalCommandArgs:
180+
description: |-
181+
Additional arguments that can be appended to the 'barman-cloud-restore'
182+
command-line invocation. These arguments provide flexibility to customize
183+
the data restore process further, according to specific requirements or
184+
configurations.
185+
186+
Example:
187+
In a scenario where specialized restore options are required, such as setting
188+
a specific read timeout or defining custom behavior, users can use this field
189+
to specify additional command arguments.
190+
191+
Note:
192+
It's essential to ensure that the provided arguments are valid and supported
193+
by the 'barman-cloud-restore' command, to avoid potential errors or unintended
194+
behavior during execution.
195+
items:
196+
type: string
197+
type: array
179198
type: object
180199
destinationPath:
181200
description: |-

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ go 1.26.3
55
require (
66
github.com/cert-manager/cert-manager v1.20.2
77
github.com/cloudnative-pg/api v1.29.1
8-
github.com/cloudnative-pg/barman-cloud v0.5.1
8+
github.com/cloudnative-pg/barman-cloud v0.5.2-0.20260529020047-bb683bf8d82a
99
github.com/cloudnative-pg/cloudnative-pg v1.29.1
1010
github.com/cloudnative-pg/cnpg-i v0.5.0
1111
github.com/cloudnative-pg/cnpg-i-machinery v0.4.2

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
2020
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
2121
github.com/cloudnative-pg/api v1.29.1 h1:FWr8S7EQeOfdhYXyr2cof8wUXLARZiiQt5Qa6ltED7w=
2222
github.com/cloudnative-pg/api v1.29.1/go.mod h1:QtWF3yzSvIfORMHaSkPAk/o3bhCJwEHJgN3riyRiz3o=
23-
github.com/cloudnative-pg/barman-cloud v0.5.1 h1:vjkXrrxo2DQXHT9u9usqhtaHiPZ/lTfDVs/pIWYTepQ=
24-
github.com/cloudnative-pg/barman-cloud v0.5.1/go.mod h1:XPc5IUFP1y4cZX1sg+Pd8j9V4tmUEVnv3BGCpfQOOg8=
23+
github.com/cloudnative-pg/barman-cloud v0.5.2-0.20260529020047-bb683bf8d82a h1:u2Ykj9oCJlJw/xxiCXjG+svTmpaHMcaW6kGNdHdertk=
24+
github.com/cloudnative-pg/barman-cloud v0.5.2-0.20260529020047-bb683bf8d82a/go.mod h1:DACs5rdlMIQ7mUOpJVBRTrbf+d7OGyMOLsFUEMJ2JV8=
2525
github.com/cloudnative-pg/cloudnative-pg v1.29.1 h1:ZNEt1TMlnQKXI1kho2UqQuqdfvIvjGln4kN7C1lsmGA=
2626
github.com/cloudnative-pg/cloudnative-pg v1.29.1/go.mod h1:Sbgx9jVmkle4/gR2U5JHrzDd74sRPOBHDtPkvncg5v8=
2727
github.com/cloudnative-pg/cnpg-i v0.5.0 h1:/TOzpNT6cwNgrpftTtrnLKdoHgMwd+88vZgXjlVgXeE=

internal/cnpgi/common/errors.go

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,38 @@ SPDX-License-Identifier: Apache-2.0
2020
package common
2121

2222
import (
23+
"errors"
24+
25+
barmanRestorer "github.com/cloudnative-pg/barman-cloud/pkg/restorer"
2326
"google.golang.org/grpc/codes"
2427
"google.golang.org/grpc/status"
2528
)
2629

30+
// classifyWALRestoreError maps an error returned by the WAL
31+
// restorer to a gRPC-coded error so the caller can tell terminal
32+
// failures apart from transient ones via the status code.
33+
func classifyWALRestoreError(walName string, walErr error) error {
34+
switch {
35+
case errors.Is(walErr, barmanRestorer.ErrWALNotFound):
36+
return newWALNotFoundError(walName)
37+
case errors.Is(walErr, barmanRestorer.ErrInvalidWALName):
38+
// A malformed WAL name will never become valid on retry.
39+
return newInvalidWALNameError(walName, walErr)
40+
case errors.Is(walErr, barmanRestorer.ErrConnectivity),
41+
errors.Is(walErr, barmanRestorer.ErrGeneric):
42+
// barman-cloud exit codes 2 (connectivity) and 4
43+
// (generic) both surface conditions that are retryable
44+
// in practice — barman uses the "generic" bucket for
45+
// some connection-class failures too, not just exit 2.
46+
return newUnavailableError(walName, walErr)
47+
default:
48+
// Unrecognized exit codes and unexpected failures (e.g.
49+
// the barman-cloud command could not be executed). No
50+
// positive signal that retry would help.
51+
return newInternalWALRestoreError(walName, walErr)
52+
}
53+
}
54+
2755
// ErrEndOfWALStreamReached is returned when end of WAL is detected in the cloud archive.
2856
var ErrEndOfWALStreamReached = status.Errorf(codes.OutOfRange, "end of WAL reached")
2957

@@ -35,10 +63,48 @@ var ErrEndOfWALStreamReached = status.Errorf(codes.OutOfRange, "end of WAL reach
3563
var ErrMissingPermissions = status.Errorf(codes.FailedPrecondition,
3664
"no permission to download the backup credentials, retrying")
3765

38-
// newWALNotFoundError returns a error that states that a
39-
// certain WAL file has not been found. This error is
40-
// compatible with GRPC status codes, resulting in a 404
41-
// being used as a response code.
66+
// newWALNotFoundError reports that the requested WAL file is not
67+
// present in the object store. Emits codes.NotFound: this is a
68+
// terminal condition (the file won't appear on retry).
4269
func newWALNotFoundError(walName string) error {
4370
return status.Errorf(codes.NotFound, "wal %q not found", walName)
4471
}
72+
73+
// newUnavailableError reports that downloading the WAL file failed
74+
// for a reason expected to be transient (a barman-cloud connectivity
75+
// blip, or a generic exit code that in practice covers retryable
76+
// conditions too). Emits codes.Unavailable: per gRPC conventions,
77+
// the canonical signal for "retry may succeed".
78+
func newUnavailableError(walName string, err error) error {
79+
return status.Errorf(
80+
codes.Unavailable,
81+
"transient error while downloading %q: %s",
82+
walName,
83+
err.Error(),
84+
)
85+
}
86+
87+
// newInvalidWALNameError reports that the requested WAL name is
88+
// not a valid name. Emits codes.InvalidArgument: this is a
89+
// terminal condition (the same name won't become valid on retry).
90+
func newInvalidWALNameError(walName string, err error) error {
91+
return status.Errorf(
92+
codes.InvalidArgument,
93+
"invalid WAL name %q: %s",
94+
walName,
95+
err.Error(),
96+
)
97+
}
98+
99+
// newInternalWALRestoreError reports that downloading the WAL
100+
// file failed for an unclassified reason. Emits codes.Internal:
101+
// we have no positive signal that retry would help, so by gRPC
102+
// convention this is treated as terminal.
103+
func newInternalWALRestoreError(walName string, err error) error {
104+
return status.Errorf(
105+
codes.Internal,
106+
"internal error while downloading %q: %s",
107+
walName,
108+
err.Error(),
109+
)
110+
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
Copyright © contributors to CloudNativePG, established as
3+
CloudNativePG a Series of LF Projects, LLC.
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
17+
SPDX-License-Identifier: Apache-2.0
18+
*/
19+
20+
package common
21+
22+
import (
23+
"errors"
24+
"fmt"
25+
26+
barmanRestorer "github.com/cloudnative-pg/barman-cloud/pkg/restorer"
27+
. "github.com/onsi/ginkgo/v2"
28+
. "github.com/onsi/gomega"
29+
"google.golang.org/grpc/codes"
30+
"google.golang.org/grpc/status"
31+
)
32+
33+
var _ = Describe("classifyWALRestoreError", func() {
34+
const walName = "000000010000000000000001"
35+
36+
DescribeTable(
37+
"maps barman restorer sentinels to gRPC status codes",
38+
func(walErr error, expectedCode codes.Code) {
39+
got := classifyWALRestoreError(walName, walErr)
40+
Expect(got).To(HaveOccurred())
41+
42+
st, ok := status.FromError(got)
43+
Expect(ok).To(BeTrue(), "returned error must carry a gRPC status")
44+
Expect(st.Code()).To(Equal(expectedCode))
45+
Expect(st.Message()).To(ContainSubstring(walName))
46+
},
47+
Entry("ErrWALNotFound -> NotFound",
48+
fmt.Errorf("object storage or file not found: %w", barmanRestorer.ErrWALNotFound),
49+
codes.NotFound),
50+
Entry("ErrInvalidWALName -> InvalidArgument",
51+
fmt.Errorf("invalid name for a WAL file: %w", barmanRestorer.ErrInvalidWALName),
52+
codes.InvalidArgument),
53+
Entry("ErrConnectivity -> Unavailable",
54+
fmt.Errorf("connectivity failure, retrying: %w", barmanRestorer.ErrConnectivity),
55+
codes.Unavailable),
56+
Entry("ErrGeneric -> Unavailable (barman uses exit 4 for some retryable cases too)",
57+
fmt.Errorf("generic error: %w", barmanRestorer.ErrGeneric),
58+
codes.Unavailable),
59+
Entry("unknown error -> Internal",
60+
errors.New("something we did not classify"),
61+
codes.Internal),
62+
)
63+
64+
It("matches the sentinel even through several wrapping layers", func() {
65+
// The plugin wraps barman errors via fmt.Errorf("...: %w", ...);
66+
// classification must keep working if more wraps appear above.
67+
inner := fmt.Errorf("connectivity failure, retrying: %w", barmanRestorer.ErrConnectivity)
68+
wrapped := fmt.Errorf("while restoring WAL %s: %w", walName, inner)
69+
70+
got := classifyWALRestoreError(walName, wrapped)
71+
st, ok := status.FromError(got)
72+
Expect(ok).To(BeTrue())
73+
Expect(st.Code()).To(Equal(codes.Unavailable))
74+
})
75+
76+
It("treats ErrWALNotFound as terminal even when the error chain mentions other sentinels in its message", func() {
77+
// Defensive: if the underlying error stringifies to something
78+
// resembling another sentinel's message, the switch must still
79+
// match by identity (errors.Is), not by substring.
80+
walErr := fmt.Errorf("not found, looks like a connectivity failure: %w", barmanRestorer.ErrWALNotFound)
81+
82+
got := classifyWALRestoreError(walName, walErr)
83+
st, _ := status.FromError(got)
84+
Expect(st.Code()).To(Equal(codes.NotFound))
85+
})
86+
})
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
Copyright © contributors to CloudNativePG, established as
3+
CloudNativePG a Series of LF Projects, LLC.
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
17+
SPDX-License-Identifier: Apache-2.0
18+
*/
19+
20+
package common
21+
22+
import (
23+
"testing"
24+
25+
. "github.com/onsi/ginkgo/v2"
26+
. "github.com/onsi/gomega"
27+
)
28+
29+
func TestCommon(t *testing.T) {
30+
RegisterFailHandler(Fail)
31+
RunSpecs(t, "Common Suite")
32+
}

internal/cnpgi/common/wal.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -365,11 +365,7 @@ func (w WALServiceImplementation) restoreFromBarmanObjectStore(
365365
// is the one that PostgreSQL has requested to restore.
366366
// The failure has already been logged in walRestorer.RestoreList method
367367
if walStatus[0].Err != nil {
368-
if errors.Is(walStatus[0].Err, barmanRestorer.ErrWALNotFound) {
369-
return newWALNotFoundError(walStatus[0].WalName)
370-
}
371-
372-
return walStatus[0].Err
368+
return classifyWALRestoreError(walStatus[0].WalName, walStatus[0].Err)
373369
}
374370

375371
// We skip this step if streaming connection is not available

manifest.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,25 @@ spec:
175175
format: int32
176176
minimum: 1
177177
type: integer
178+
restoreAdditionalCommandArgs:
179+
description: |-
180+
Additional arguments that can be appended to the 'barman-cloud-restore'
181+
command-line invocation. These arguments provide flexibility to customize
182+
the data restore process further, according to specific requirements or
183+
configurations.
184+
185+
Example:
186+
In a scenario where specialized restore options are required, such as setting
187+
a specific read timeout or defining custom behavior, users can use this field
188+
to specify additional command arguments.
189+
190+
Note:
191+
It's essential to ensure that the provided arguments are valid and supported
192+
by the 'barman-cloud-restore' command, to avoid potential errors or unintended
193+
behavior during execution.
194+
items:
195+
type: string
196+
type: array
178197
type: object
179198
destinationPath:
180199
description: |-

0 commit comments

Comments
 (0)