Skip to content

Commit 29e71d5

Browse files
kvapsclaude
andcommitted
fix(apiserver): resolve VD GET cache-miss via live RD re-read (aligns with RD GET)
Volume-definitions are embedded in the RD CRD spec, so after the v0.1.15 RD-404 fix the SAME committed RD answered differently by path on a multi-replica apiserver: `GET /v1/resource-definitions/{rd}` resolved through the RD store's uncached fallback while `GET .../volume-definitions/{vn}` read only the lagging informer cache and 404'd — observed live on the cozystack e2e stand (`GET .../volume-definitions/0 -> 404`, REST cache-retry budget exhausted). volumeDefinitions.Get now does one live re-read via the existing fetchRDLive on a cache miss OR a stale cached RD revision, mirroring resourceDefinitions.Get. Deliberately NOT extended to the other stores: raw store Gets are cache-only by design — REST existence probes rely on a fast cached NotFound, and get*WithCacheRetry absorbs cross-replica lag at the REST layer while preserving read-your-writes for subsequent cached Lists. A store-level fallback short-circuits that convergence wait: trying it across all stores made the CI snapshot-pagination test regress (the create flow's read-back resolved uncached while the cached List still under-reported). NewWithAPIReader's doc now pins this boundary. Regression tests (L1, fail-on-bug proven — both FAIL on the pre-fix tree): the stale-RD embedded-VD mode, the RD-cache-miss mode, and nil-reader NotFound preservation. Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
1 parent 1e9501e commit 29e71d5

3 files changed

Lines changed: 195 additions & 7 deletions

File tree

pkg/store/k8s/k8s.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,22 @@ func New(c ctrlclient.Client) *Store {
7171
// BUG-048 atomic VolumeNumber allocation, where retrying an optimistic-
7272
// lock conflict against a stale informer cache re-derives the SAME
7373
// number and the create-loop never converges (the second of two
74-
// concurrent `vd c` is dropped). Pass mgr.GetAPIReader() in production;
75-
// nil is accepted and every path falls back to the cached client
76-
// (in-memory / unit harnesses that have no informer).
74+
// concurrent `vd c` is dropped), and the RD/VD GET cache-miss fallback
75+
// (a multi-replica apiserver GET can land on a replica whose cache has
76+
// not observed a just-committed RD yet; VDs are embedded in the RD, so
77+
// both paths must answer consistently). Pass mgr.GetAPIReader() in
78+
// production; nil is accepted and every path falls back to the cached
79+
// client (in-memory / unit harnesses that have no informer).
80+
//
81+
// Do NOT extend the uncached fallback to the other stores' Gets: raw
82+
// store Gets are cache-only by design. REST existence probes rely on a
83+
// fast cached NotFound, and cross-replica lag on read endpoints is
84+
// absorbed at the REST layer by get*WithCacheRetry, which POLLS the
85+
// cached read so subsequent cached Lists stay read-your-writes
86+
// coherent. A store-level fallback short-circuits that convergence
87+
// wait (a CI snapshot-pagination regression demonstrated this when it
88+
// was tried: the create flow's read-back resolved uncached while the
89+
// cached List still under-reported).
7790
func NewWithAPIReader(c ctrlclient.Client, apiReader ctrlclient.Reader) *Store {
7891
s := &Store{c: c}
7992
s.nodes = &nodes{c: c}
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
3+
/*
4+
Copyright 2026 Cozystack contributors.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
package k8s
20+
21+
import (
22+
"context"
23+
"errors"
24+
"testing"
25+
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
29+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
30+
31+
crdv1alpha1 "github.com/cozystack/blockstor/api/v1alpha1"
32+
"github.com/cozystack/blockstor/pkg/store"
33+
)
34+
35+
// These tests align volumeDefinitions.Get with the released
36+
// resourceDefinitions.Get cache-miss behaviour (multi-replica
37+
// apiserver, no leader election). VDs are EMBEDDED in the RD CRD
38+
// spec, so before this fix the SAME committed RD answered differently
39+
// by path: `GET /v1/resource-definitions/{rd}` resolved through the
40+
// RD store's uncached fallback while
41+
// `GET .../volume-definitions/{vn}` read only the lagging cache and
42+
// 404'd (observed live on the cozystack e2e stand as
43+
// `GET .../volume-definitions/0 -> 404` with the REST-layer
44+
// cache-retry budget exhausted).
45+
//
46+
// Deliberately NOT extended to the other stores: raw store Gets are
47+
// cache-only by design — REST existence probes rely on a fast cached
48+
// NotFound, and the REST layer's get*WithCacheRetry helpers already
49+
// absorb cross-replica lag while preserving read-your-writes for
50+
// subsequent cached Lists. A store-level uncached fallback
51+
// short-circuits that convergence wait (demonstrated by the CI
52+
// snapshot-pagination regression when it was tried).
53+
54+
func cacheMissScheme(t *testing.T) *runtime.Scheme {
55+
t.Helper()
56+
57+
scheme := runtime.NewScheme()
58+
if err := crdv1alpha1.AddToScheme(scheme); err != nil {
59+
t.Fatalf("AddToScheme: %v", err)
60+
}
61+
62+
return scheme
63+
}
64+
65+
func emptyClient(t *testing.T, scheme *runtime.Scheme) ctrlclient.Client {
66+
t.Helper()
67+
68+
return fake.NewClientBuilder().WithScheme(scheme).Build()
69+
}
70+
71+
func clientWith(t *testing.T, scheme *runtime.Scheme, objs ...ctrlclient.Object) ctrlclient.Client {
72+
t.Helper()
73+
74+
return fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
75+
}
76+
77+
// TestVolumeDefinitionsGetStaleRDFallsBackToLiveRead pins the subtle VD
78+
// staleness mode: the lagging replica's cache can hold the RD itself
79+
// while still missing a just-committed volume-definition. The cached
80+
// read alone can not distinguish "VD absent" from "RD revision stale"
81+
// — Get must re-read the RD live before answering 404.
82+
func TestVolumeDefinitionsGetStaleRDFallsBackToLiveRead(t *testing.T) {
83+
t.Parallel()
84+
85+
scheme := cacheMissScheme(t)
86+
87+
stale := &crdv1alpha1.ResourceDefinition{
88+
ObjectMeta: metav1.ObjectMeta{Name: Name("pvc-stale")},
89+
}
90+
fresh := &crdv1alpha1.ResourceDefinition{
91+
ObjectMeta: metav1.ObjectMeta{Name: Name("pvc-stale")},
92+
Spec: crdv1alpha1.ResourceDefinitionSpec{
93+
VolumeDefinitions: []crdv1alpha1.ResourceDefinitionVolume{{
94+
VolumeNumber: 0,
95+
SizeKib: 1 << 20,
96+
}},
97+
},
98+
}
99+
100+
st := &volumeDefinitions{c: clientWith(t, scheme, stale), apiReader: clientWith(t, scheme, fresh)}
101+
102+
got, err := st.Get(context.Background(), "pvc-stale", 0)
103+
if err != nil {
104+
t.Fatalf("Get with live re-read fallback: unexpected error: %v", err)
105+
}
106+
107+
if got.SizeKib != 1<<20 {
108+
t.Fatalf("got SizeKib %d, want %d", got.SizeKib, 1<<20)
109+
}
110+
111+
// Truly-absent VD must still be NotFound after the live re-read.
112+
if _, err := st.Get(context.Background(), "pvc-stale", 7); !errors.Is(err, store.ErrNotFound) {
113+
t.Fatalf("Get of absent VD: got %v, want store.ErrNotFound", err)
114+
}
115+
}
116+
117+
// TestVolumeDefinitionsGetCacheMissRDFallsBackToLiveRead pins the
118+
// coarser mode: the RD itself has not reached this replica's cache yet.
119+
func TestVolumeDefinitionsGetCacheMissRDFallsBackToLiveRead(t *testing.T) {
120+
t.Parallel()
121+
122+
scheme := cacheMissScheme(t)
123+
124+
fresh := &crdv1alpha1.ResourceDefinition{
125+
ObjectMeta: metav1.ObjectMeta{Name: Name("pvc-vdmiss")},
126+
Spec: crdv1alpha1.ResourceDefinitionSpec{
127+
VolumeDefinitions: []crdv1alpha1.ResourceDefinitionVolume{{
128+
VolumeNumber: 0,
129+
SizeKib: 4096,
130+
}},
131+
},
132+
}
133+
134+
st := &volumeDefinitions{c: emptyClient(t, scheme), apiReader: clientWith(t, scheme, fresh)}
135+
136+
got, err := st.Get(context.Background(), "pvc-vdmiss", 0)
137+
if err != nil {
138+
t.Fatalf("Get with live re-read fallback: unexpected error: %v", err)
139+
}
140+
141+
if got.SizeKib != 4096 {
142+
t.Fatalf("got SizeKib %d, want %d", got.SizeKib, 4096)
143+
}
144+
145+
nilReader := &volumeDefinitions{c: emptyClient(t, scheme)}
146+
if _, err := nilReader.Get(context.Background(), "pvc-vdmiss", 0); !errors.Is(err, store.ErrNotFound) {
147+
t.Fatalf("Get with nil apiReader: got %v, want store.ErrNotFound", err)
148+
}
149+
}

pkg/store/k8s/volume_definitions.go

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,27 @@ func (s *volumeDefinitions) List(ctx context.Context, rdName string) ([]apiv1.Vo
7272

7373
func (s *volumeDefinitions) Get(ctx context.Context, rdName string, volumeNumber int32) (apiv1.VolumeDefinition, error) {
7474
rd, err := s.fetchRD(ctx, rdName)
75-
if err != nil {
75+
if err == nil {
76+
if vd := vdByNumber(rd, volumeNumber); vd != nil {
77+
return crdToWireVD(vd), nil
78+
}
79+
} else if !errors.Is(err, store.ErrNotFound) || s.apiReader == nil {
7680
return apiv1.VolumeDefinition{}, err
7781
}
7882

79-
for i := range rd.Spec.VolumeDefinitions {
80-
if rd.Spec.VolumeDefinitions[i].VolumeNumber == volumeNumber {
81-
return crdToWireVD(&rd.Spec.VolumeDefinitions[i]), nil
83+
// Cache miss — or a STALE cached RD revision: volume-definitions are
84+
// embedded in the RD spec, so a lagging replica's cache can hold the
85+
// RD while still missing a just-committed VD (observed live as
86+
// `GET .../volume-definitions/0 -> 404` on the cozystack e2e stand).
87+
// One live re-read before concluding 404.
88+
if s.apiReader != nil {
89+
rd, err = s.fetchRDLive(ctx, rdName)
90+
if err != nil {
91+
return apiv1.VolumeDefinition{}, err
92+
}
93+
94+
if vd := vdByNumber(rd, volumeNumber); vd != nil {
95+
return crdToWireVD(vd), nil
8296
}
8397
}
8498

@@ -441,3 +455,15 @@ func wireToCRDVD(vd *apiv1.VolumeDefinition) crdv1alpha1.ResourceDefinitionVolum
441455
Flags: vd.Flags,
442456
}
443457
}
458+
459+
// vdByNumber returns the embedded volume-definition with the given
460+
// VolumeNumber, or nil when the RD spec does not carry it.
461+
func vdByNumber(rd *crdv1alpha1.ResourceDefinition, volumeNumber int32) *crdv1alpha1.ResourceDefinitionVolume {
462+
for i := range rd.Spec.VolumeDefinitions {
463+
if rd.Spec.VolumeDefinitions[i].VolumeNumber == volumeNumber {
464+
return &rd.Spec.VolumeDefinitions[i]
465+
}
466+
}
467+
468+
return nil
469+
}

0 commit comments

Comments
 (0)