Skip to content

Commit efbcc39

Browse files
committed
Implement Mixed Mode State Detection
Replace the global boolean cf.CoeConfig.EnableVPCNetwork with namespace-driven mixed-mode state: HasT1Namespaces and HasVPCNamespaces. New module: pkg/config/mixed_mode.go - Checks SupervisorCapabilities CRD for supports_per_namespace_network_providers capability. - If supported: scans namespace annotations `nsx.vmware.com/vpc_network_config` to derive HasVPCNamespaces and HasT1Namespaces. - If not supported (legacy/pre-9.2): falls back to EnableVPCNetwork config flag. This enables NSX Operator to run in mixed mode where both T1 and VPC namespaces coexist, as required for VDS->VPC and T1->VPC migration. NOTE: This patch only ensures that the existing pure T1 or pure VPC envs preserve existing behaviours. The full functionality will be implemented in the follow-up patches. Testing done: https://jenkins-vcf-wcp-dev.devops.broadcom.net/job/dev-integ-nsxt/5639/ https://jenkins-vcf-wcp-dev.devops.broadcom.net/job/dev-nsxvpc/16738/
1 parent 6f7ddad commit efbcc39

12 files changed

Lines changed: 1038 additions & 72 deletions

File tree

cmd/main.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ func init() {
113113

114114
func startServiceController(mgr manager.Manager, nsxClient *nsx.Client) {
115115
// Generate webhook certificates and start refreshing webhook certificates periodically
116-
if cf.CoeConfig.EnableVPCNetwork {
116+
if config.HasVPCNamespaces() {
117117
if err := pkgutil.GenerateWebhookCerts(); err != nil {
118118
log.Error(err, "Failed to generate webhook certificates")
119119
os.Exit(1)
@@ -123,7 +123,7 @@ func startServiceController(mgr manager.Manager, nsxClient *nsx.Client) {
123123
}
124124

125125
// Initialize and start the system health reporter
126-
if cf.CoeConfig.EnableVPCNetwork && cf.EnableInventory && cf.CoeConfig.EnableSha {
126+
if config.HasVPCNamespaces() && cf.EnableInventory && cf.CoeConfig.EnableSha {
127127
health.Start(nsxClient, cf, mgr.GetClient())
128128
}
129129

@@ -136,7 +136,7 @@ func startServiceController(mgr manager.Manager, nsxClient *nsx.Client) {
136136

137137
checkLicense(nsxClient, cf.LicenseValidationInterval)
138138

139-
if cf.K8sConfig.EnableRestore && cf.CoeConfig.EnableVPCNetwork {
139+
if cf.K8sConfig.EnableRestore && config.HasVPCNamespaces() {
140140
var err error
141141
restoreMode, err = pkgutil.CompareNSXRestore(mgr.GetClient(), nsxClient)
142142
if err != nil {
@@ -153,7 +153,7 @@ func startServiceController(mgr manager.Manager, nsxClient *nsx.Client) {
153153
var hookServer webhook.Server
154154
var subnetSetReconcile *subnetset.SubnetSetReconciler
155155

156-
if cf.CoeConfig.EnableVPCNetwork {
156+
if config.HasVPCNamespaces() {
157157
// Check NSX version for VPC networking mode
158158
if !commonService.NSXClient.NSXCheckVersion(nsx.VPC) {
159159
log.Error(nil, "VPC mode cannot be enabled if NSX version is lower than 4.1.1")
@@ -284,6 +284,21 @@ func startServiceController(mgr manager.Manager, nsxClient *nsx.Client) {
284284
log.Error(err, "Failed to update Pod labels")
285285
panic(err)
286286
}
287+
288+
// Watch for mixed-mode state changes (e.g. T1-only → T1+VPC when the migration starts).
289+
// If the state changes, exit so the operator restarts with the new configuration
290+
// — this is simpler and safer than hot-initializing VPC services after startup.
291+
config.SetMixedModeNamespaceRefreshReader(mgr.GetClient())
292+
go func() {
293+
ticker := time.NewTicker(30 * time.Second)
294+
defer ticker.Stop()
295+
for range ticker.C {
296+
if config.RefreshMixedModeState(context.Background()) {
297+
log.Info("Mixed-mode state changed; restarting NSX Operator to pick up new configuration")
298+
os.Exit(0)
299+
}
300+
}
301+
}()
287302
}
288303

289304
func electMaster(mgr manager.Manager, nsxClient *nsx.Client) {
@@ -329,6 +344,12 @@ func main() {
329344
os.Exit(1)
330345
}
331346

347+
if err := config.InitMixedMode(context.Background(), cfg, cf.CoeConfig.EnableVPCNetwork); err != nil {
348+
log.Error(err, "Failed to initialize mixed mode state")
349+
os.Exit(1)
350+
}
351+
util.SetHasVPCNamespacesFunc(config.HasVPCNamespaces)
352+
332353
if cf.HAEnabled() {
333354
go electMaster(mgr, nsxClient)
334355
} else {

pkg/config/mixed_mode.go

Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
/* Copyright © 2026 VMware, Inc. All Rights Reserved.
2+
SPDX-License-Identifier: Apache-2.0 */
3+
4+
package config
5+
6+
import (
7+
"context"
8+
"strings"
9+
"sync"
10+
"time"
11+
12+
v1 "k8s.io/api/core/v1"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
15+
"k8s.io/apimachinery/pkg/runtime/schema"
16+
"k8s.io/client-go/dynamic"
17+
"k8s.io/client-go/kubernetes"
18+
"k8s.io/client-go/rest"
19+
"sigs.k8s.io/controller-runtime/pkg/client"
20+
21+
"github.com/vmware-tanzu/nsx-operator/pkg/logger"
22+
)
23+
24+
const (
25+
VPCNetworkConfigAnnotation = "nsx.vmware.com/vpc_network_config"
26+
27+
supervisorCapabilitiesName = "supervisor-capabilities"
28+
)
29+
30+
var (
31+
supervisorCapabilitiesGVR = schema.GroupVersionResource{
32+
Group: "iaas.vmware.com",
33+
Version: "v1alpha1",
34+
Resource: "supervisorcapabilities",
35+
}
36+
37+
stateMu sync.RWMutex
38+
hasT1Namespaces bool
39+
hasVPCNamespaces bool
40+
perNamespaceProvidersSupported *bool
41+
stateInitialized bool
42+
43+
// retryInitialInterval and retryMaxInterval control the exponential
44+
// backoff used when a transient error prevents reading
45+
// SupervisorCapabilities or listing namespaces. Overridable in tests.
46+
retryInitialInterval = 2 * time.Second
47+
retryMaxInterval = 30 * time.Second
48+
49+
// storedClientset is kept from InitMixedMode so that RefreshMixedModeState
50+
// can re-scan without requiring the caller to pass it each time.
51+
storedClientset kubernetes.Interface
52+
53+
// namespaceRefreshReader, when non-nil, is used by RefreshMixedModeState to list
54+
// namespaces from the controller-runtime cache (mgr.GetClient()) instead of
55+
// a direct API list on storedClientset — reducing apiserver load on the 30s
56+
// refresh ticker. Set via SetMixedModeNamespaceRefreshReader from cmd after
57+
// controllers are registered on the manager.
58+
namespaceRefreshReader client.Reader
59+
refreshReaderMu sync.RWMutex
60+
)
61+
62+
var log = logger.Log
63+
64+
// checkPerNamespaceProvidersSupported fetches the SupervisorCapabilities object and
65+
// returns whether per-namespace network providers are activated. It retries
66+
// all errors with exponential backoff (starting at retryInitialInterval,
67+
// doubling each attempt, capped at retryMaxInterval). The SupervisorCapabilities
68+
// CR is guaranteed to exist; all failures are treated as transient (e.g. API
69+
// server not yet ready at operator startup). Returns false only when the
70+
// context is cancelled.
71+
func checkPerNamespaceProvidersSupported(ctx context.Context, dynClient dynamic.Interface) bool {
72+
interval := retryInitialInterval
73+
for {
74+
obj, err := dynClient.Resource(supervisorCapabilitiesGVR).Get(
75+
ctx, supervisorCapabilitiesName, metav1.GetOptions{})
76+
if err == nil {
77+
return extractCapability(obj)
78+
}
79+
log.Info("Failed to get SupervisorCapabilities, will retry", "error", err, "retryIn", interval)
80+
select {
81+
case <-ctx.Done():
82+
log.Info("Context cancelled while waiting for SupervisorCapabilities, falling back to legacy config")
83+
return false
84+
case <-time.After(interval):
85+
}
86+
interval = min(interval*2, retryMaxInterval)
87+
}
88+
}
89+
90+
func extractCapability(obj *unstructured.Unstructured) bool {
91+
status, found, err := unstructured.NestedMap(obj.Object, "status")
92+
if err != nil || !found {
93+
return false
94+
}
95+
services, found, err := unstructured.NestedMap(status, "services")
96+
if err != nil || !found {
97+
return false
98+
}
99+
for _, svcCaps := range services {
100+
capsMap, ok := svcCaps.(map[string]interface{})
101+
if !ok {
102+
continue
103+
}
104+
cap, ok := capsMap["supports_per_namespace_network_providers"]
105+
if !ok {
106+
continue
107+
}
108+
capMap, ok := cap.(map[string]interface{})
109+
if !ok {
110+
continue
111+
}
112+
activated, ok := capMap["activated"]
113+
if ok {
114+
if b, ok := activated.(bool); ok && b {
115+
return true
116+
}
117+
}
118+
}
119+
return false
120+
}
121+
122+
func namespaceHasVPCNetworkConfig(ns *v1.Namespace) bool {
123+
if ns == nil {
124+
return false
125+
}
126+
v := strings.TrimSpace(ns.Annotations[VPCNetworkConfigAnnotation])
127+
return v != ""
128+
}
129+
130+
func accumulateMixedModeFlagsFromNamespaces(items []v1.Namespace) (hasT1 bool, hasVPC bool) {
131+
for i := range items {
132+
if namespaceHasVPCNetworkConfig(&items[i]) {
133+
hasVPC = true
134+
} else {
135+
hasT1 = true
136+
}
137+
}
138+
return hasT1, hasVPC
139+
}
140+
141+
func scanNamespaceProviders(ctx context.Context, clientset kubernetes.Interface) (hasT1 bool, hasVPC bool, err error) {
142+
nsList, err := clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
143+
if err != nil {
144+
return false, false, err
145+
}
146+
hasT1, hasVPC = accumulateMixedModeFlagsFromNamespaces(nsList.Items)
147+
return hasT1, hasVPC, nil
148+
}
149+
150+
func scanNamespaceProvidersWithClient(ctx context.Context, reader client.Reader) (hasT1 bool, hasVPC bool, err error) {
151+
nsList := &v1.NamespaceList{}
152+
if err := reader.List(ctx, nsList); err != nil {
153+
return false, false, err
154+
}
155+
hasT1, hasVPC = accumulateMixedModeFlagsFromNamespaces(nsList.Items)
156+
return hasT1, hasVPC, nil
157+
}
158+
159+
// SetMixedModeNamespaceRefreshReader registers a cache-backed client.Reader
160+
// (typically mgr.GetClient()) for periodic mixed-mode rescans. When nil,
161+
// RefreshMixedModeState keeps using the kubernetes.Interface from InitMixedMode.
162+
// Call once from cmd after controllers are set up on the manager.
163+
func SetMixedModeNamespaceRefreshReader(r client.Reader) {
164+
refreshReaderMu.Lock()
165+
defer refreshReaderMu.Unlock()
166+
namespaceRefreshReader = r
167+
}
168+
169+
func currentNamespaceRefreshReader() client.Reader {
170+
refreshReaderMu.RLock()
171+
defer refreshReaderMu.RUnlock()
172+
return namespaceRefreshReader
173+
}
174+
175+
// waitForNamespaceProviders retries scanNamespaceProviders with exponential
176+
// backoff on transient errors (e.g. API server not yet ready at operator startup).
177+
func waitForNamespaceProviders(ctx context.Context, clientset kubernetes.Interface) (bool, bool) {
178+
interval := retryInitialInterval
179+
for {
180+
hasT1, hasVPC, err := scanNamespaceProviders(ctx, clientset)
181+
if err == nil {
182+
return hasT1, hasVPC
183+
}
184+
log.Warn("Failed to list namespaces for mixed-mode scan, will retry", "error", err, "retryIn", interval)
185+
select {
186+
case <-ctx.Done():
187+
log.Info("Context cancelled during mixed-mode namespace scan, returning empty state")
188+
return false, false
189+
case <-time.After(interval):
190+
}
191+
interval = min(interval*2, retryMaxInterval)
192+
}
193+
}
194+
195+
// InitMixedMode initializes mixed-mode state by checking SupervisorCapabilities
196+
// and scanning namespaces (non-empty nsx.vmware.com/vpc_network_config to VPC,
197+
// otherwise T1 for mixed-mode aggregation). If per-namespace providers are not
198+
// activated, falls back to the legacy EnableVPCNetwork flag.
199+
//
200+
// The SupervisorCapabilities lookup is performed outside the state mutex so
201+
// that transient API errors can be retried without blocking readers for an
202+
// extended period.
203+
func InitMixedMode(ctx context.Context, cfg *rest.Config, enableVPCNetwork bool) error {
204+
clientset, err := kubernetes.NewForConfig(cfg)
205+
if err != nil {
206+
return err
207+
}
208+
dynClient, err := dynamic.NewForConfig(cfg)
209+
if err != nil {
210+
return err
211+
}
212+
initMixedModeWithClients(ctx, clientset, dynClient, enableVPCNetwork)
213+
return nil
214+
}
215+
216+
func initMixedModeWithClients(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, enableVPCNetwork bool) {
217+
// checkPerNamespaceProvidersSupported retries on transient errors; runs outside
218+
// the mutex to avoid holding the lock during potentially many retries.
219+
supported := checkPerNamespaceProvidersSupported(ctx, dynClient)
220+
221+
var t1, vpc bool
222+
if supported {
223+
log.Info("Per-namespace network providers are supported, scanning namespaces for mixed-mode")
224+
t1, vpc = waitForNamespaceProviders(ctx, clientset)
225+
} else {
226+
log.Info("Per-namespace network providers not supported, using legacy config", "enableVPCNetwork", enableVPCNetwork)
227+
if enableVPCNetwork {
228+
t1, vpc = false, true
229+
} else {
230+
t1, vpc = true, false
231+
}
232+
}
233+
stateMu.Lock()
234+
defer stateMu.Unlock()
235+
storedClientset = clientset
236+
perNamespaceProvidersSupported = &supported
237+
hasT1Namespaces = t1
238+
hasVPCNamespaces = vpc
239+
stateInitialized = true
240+
log.Info("Mixed-mode state initialized", "hasT1Namespaces", t1, "hasVPCNamespaces", vpc)
241+
}
242+
243+
// RefreshMixedModeState re-scans namespaces using the clientset stored during
244+
// InitMixedMode and updates the global state. Returns true if the state
245+
// changed; the caller should then restart the operator so that VPC services
246+
// and controllers are initialized for the new mode.
247+
func RefreshMixedModeState(ctx context.Context) bool {
248+
stateMu.Lock()
249+
defer stateMu.Unlock()
250+
251+
if perNamespaceProvidersSupported == nil || !*perNamespaceProvidersSupported {
252+
return false
253+
}
254+
if storedClientset == nil {
255+
return false
256+
}
257+
258+
oldT1, oldVPC := hasT1Namespaces, hasVPCNamespaces
259+
var newT1, newVPC bool
260+
var err error
261+
if r := currentNamespaceRefreshReader(); r != nil {
262+
newT1, newVPC, err = scanNamespaceProvidersWithClient(ctx, r)
263+
} else {
264+
newT1, newVPC, err = scanNamespaceProviders(ctx, storedClientset)
265+
}
266+
if err != nil {
267+
log.Warn("Failed to scan namespaces during mixed-mode refresh, keeping current state", "error", err)
268+
return false
269+
}
270+
hasT1Namespaces = newT1
271+
hasVPCNamespaces = newVPC
272+
273+
changed := oldT1 != hasT1Namespaces || oldVPC != hasVPCNamespaces
274+
if changed {
275+
log.Info("Mixed-mode state changed",
276+
"oldHasT1Namespaces", oldT1, "hasT1Namespaces", hasT1Namespaces,
277+
"oldHasVPCNamespaces", oldVPC, "hasVPCNamespaces", hasVPCNamespaces)
278+
}
279+
return changed
280+
}
281+
282+
// HasT1Namespaces returns true when at least one namespace uses T1 networking.
283+
func HasT1Namespaces() bool {
284+
stateMu.RLock()
285+
defer stateMu.RUnlock()
286+
return hasT1Namespaces
287+
}
288+
289+
// HasVPCNamespaces returns true when at least one namespace uses VPC (or VDS in migration).
290+
func HasVPCNamespaces() bool {
291+
stateMu.RLock()
292+
defer stateMu.RUnlock()
293+
return hasVPCNamespaces
294+
}
295+
296+
// IsMixedModeStateInitialized returns true after InitMixedMode has been called.
297+
func IsMixedModeStateInitialized() bool {
298+
stateMu.RLock()
299+
defer stateMu.RUnlock()
300+
return stateInitialized
301+
}
302+
303+
// SetMixedModeStateForTest sets hasT1Namespaces and hasVPCNamespaces for unit tests.
304+
// Must only be used from test code so production always goes through InitMixedMode.
305+
func SetMixedModeStateForTest(hasT1, hasVPC bool) {
306+
stateMu.Lock()
307+
defer stateMu.Unlock()
308+
hasT1Namespaces = hasT1
309+
hasVPCNamespaces = hasVPC
310+
stateInitialized = true
311+
}
312+
313+
// IsPerNamespaceProvidersSupported returns true when SupervisorCapabilities
314+
// advertises per-namespace network providers.
315+
func IsPerNamespaceProvidersSupported() bool {
316+
stateMu.RLock()
317+
defer stateMu.RUnlock()
318+
return perNamespaceProvidersSupported != nil && *perNamespaceProvidersSupported
319+
}

0 commit comments

Comments
 (0)