Skip to content

Commit 4c94957

Browse files
authored
fix(proxyCfg): propogate meshGatewayConfig to upstreams of API gateway (#23369)
* fix(proxyCfg): propogate meshGatewayConfig to upstreams of API gateway - Fixed handleRouteConfigUpdate to properly propagate meshGatewayConfig to API gateway upstreams, which is required during XDS endpoint and cluster config generation. - Added TestStateChangedAPIGateway test cases in state_test.go to validate API gateway update handling. - Added API gateway-specific logging prefix (similar to mesh gateway) to help in debugging. * fix(xds): correct endpoint config generation for API gateway in peered setups (#23370) * fix(xds): correct endpoint config generation for API gateway in peered setups - Previously, API gateway XDS endpoint generation incorrectly relied on cfgSnap.ConnectProxy config (instead of cgfSnap.APIGateway), which caused wrong/no endpoint configuration for peered environments. - Changes made: - Updated makeUpstreamLoadAssignmentForPeerService to fetch localGatewayEndpoint based on cfgSnap kind instead of always using cfgSnap.ConnectProxy. - Updated endpointsFromDiscoveryChain to derive meshGatewayMode based on cfgSnap kind instead of always using cfgSnap.ConnectProxy. - Recompiled golden test file to reflect fix. * removed comment * fix(xds): correct cluster config generation for API gateway in peered setups (#23371) * fix(xds): correct cluster config generation for API gateway in peered setups - Updated makeUpstreamClustersForDiscoveryChain to generate cluster config based on upstream endpoint type. Before this fix, it always generated cluster configs without endpoints, which is incorrect when the upstream endpoint type is hostname and mesh-gateway mode is remote; in such cases, endpoints must also be included in the cluster config. - Added recompiled golden test file to reflect the fix. * fix lint error
1 parent 5481864 commit 4c94957

11 files changed

Lines changed: 388 additions & 46 deletions

agent/proxycfg/api_gateway.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,19 @@ func (h *handlerAPIGateway) handleRouteConfigUpdate(ctx context.Context, u Updat
360360

361361
for _, rule := range route.Rules {
362362
for _, service := range rule.Services {
363+
364+
// Retrieving the meshGatewayConfig from handlerAPIGateway instance.
365+
// `handlerAPIGateway` embeds `handlerState`, which exposes `serviceInstance.proxyCfg`.
366+
// serviceInstance.proxyCfg.MeshGateway is replicated from NodeService during state setup/update.
367+
// and NodeService populated for all gateway's during service resistration `AgentRegisterService`.
368+
//
369+
// So, Whenever any change happens in NodeService, proxyCfg manager will recreate
370+
// the state where it copies NodeService to serviceInstance and
371+
// then calls this api_gateway handleUpdates method.
372+
// which will update the Mesh-Gateway config to api_gateway upstreams (below).
373+
// h.service = <name of api-gateway>
374+
meshGatewayConfig := h.proxyCfg.MeshGateway
375+
363376
for _, listener := range snap.APIGateway.Listeners {
364377
shouldBind := false
365378
for _, parent := range route.Parents {
@@ -382,6 +395,10 @@ func (h *handlerAPIGateway) handleRouteConfigUpdate(ctx context.Context, u Updat
382395
Config: map[string]interface{}{
383396
"protocol": "http",
384397
},
398+
// Propogate the meshGatewayConfig in api gateway upstreams
399+
// so that meshGatewayMode can be used in XDS for
400+
// endpoints and cluster config generation.
401+
MeshGateway: meshGatewayConfig,
385402
}
386403

387404
listenerKey := APIGatewayListenerKeyFromListener(listener)
@@ -410,6 +427,7 @@ func (h *handlerAPIGateway) handleRouteConfigUpdate(ctx context.Context, u Updat
410427
snap.APIGateway.TCPRoutes.Set(ref, route)
411428

412429
for _, service := range route.Services {
430+
meshGatewayConfig := h.proxyCfg.MeshGateway
413431
upstreamID := NewUpstreamIDFromServiceName(service.ServiceName())
414432
seenUpstreamIDs.add(upstreamID)
415433

@@ -436,6 +454,7 @@ func (h *handlerAPIGateway) handleRouteConfigUpdate(ctx context.Context, u Updat
436454
Config: map[string]interface{}{
437455
"protocol": "tcp",
438456
},
457+
MeshGateway: meshGatewayConfig,
439458
}
440459

441460
listenerKey := APIGatewayListenerKeyFromListener(listener)

agent/proxycfg/state.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ func newKindHandler(config stateConfig, s serviceInstance, ch chan UpdateEvent)
225225
case structs.ServiceKindIngressGateway:
226226
handler = &handlerIngressGateway{handlerState: h}
227227
case structs.ServiceKindAPIGateway:
228+
h.logger = config.logger.Named(logging.APIGateway)
228229
handler = &handlerAPIGateway{handlerState: h}
229230
default:
230231
return nil, errors.New("not a connect-proxy, terminating-gateway, mesh-gateway, or ingress-gateway")

agent/proxycfg/state_test.go

Lines changed: 126 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import (
2626
"github.com/hashicorp/consul/sdk/testutil"
2727
)
2828

29-
func TestStateChanged(t *testing.T) {
29+
func TestStateChangedConnectProxy(t *testing.T) {
3030
tests := []struct {
3131
name string
3232
ns *structs.NodeService
@@ -108,6 +108,131 @@ func TestStateChanged(t *testing.T) {
108108
},
109109
want: true,
110110
},
111+
{
112+
name: "different proxy mesh gateway mode",
113+
ns: structs.TestNodeServiceAPIGateway(t),
114+
token: "foo",
115+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
116+
ns.Proxy.MeshGateway.Mode = structs.MeshGatewayModeLocal
117+
return &ns, token
118+
},
119+
want: true,
120+
},
121+
}
122+
123+
for _, tt := range tests {
124+
t.Run(tt.name, func(t *testing.T) {
125+
proxyID := ProxyID{ServiceID: tt.ns.CompoundServiceID()}
126+
state, err := newState(proxyID, tt.ns, testSource, tt.token, stateConfig{logger: hclog.New(nil)}, rate.NewLimiter(rate.Inf, 1))
127+
require.NoError(t, err)
128+
otherNS, otherToken := tt.mutate(*tt.ns, tt.token)
129+
require.Equal(t, tt.want, state.Changed(otherNS, otherToken))
130+
})
131+
}
132+
}
133+
134+
func TestStateChangedAPIGateway(t *testing.T) {
135+
tests := []struct {
136+
name string
137+
ns *structs.NodeService
138+
token string
139+
mutate func(ns structs.NodeService, token string) (*structs.NodeService, string)
140+
want bool
141+
}{
142+
{
143+
name: "nil node service",
144+
ns: structs.TestNodeServiceAPIGateway(t),
145+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
146+
return nil, token
147+
},
148+
want: true,
149+
},
150+
{
151+
name: "same service",
152+
ns: structs.TestNodeServiceAPIGateway(t),
153+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
154+
return &ns, token
155+
}, want: false,
156+
},
157+
{
158+
name: "same service, different token",
159+
ns: structs.TestNodeServiceAPIGateway(t),
160+
token: "foo",
161+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
162+
return &ns, "bar"
163+
},
164+
want: true,
165+
},
166+
{
167+
name: "different address",
168+
ns: structs.TestNodeServiceAPIGateway(t),
169+
token: "foo",
170+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
171+
ns.Address = "10.10.10.10"
172+
return &ns, token
173+
},
174+
want: true,
175+
},
176+
{
177+
name: "different port",
178+
ns: structs.TestNodeServiceAPIGateway(t),
179+
token: "foo",
180+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
181+
ns.Port = 12345
182+
return &ns, token
183+
},
184+
want: true,
185+
},
186+
{
187+
name: "different service kind",
188+
ns: structs.TestNodeServiceAPIGateway(t),
189+
token: "foo",
190+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
191+
ns.Kind = ""
192+
return &ns, token
193+
},
194+
want: true,
195+
},
196+
{
197+
name: "different proxy target",
198+
ns: structs.TestNodeServiceAPIGateway(t),
199+
token: "foo",
200+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
201+
ns.Proxy.DestinationServiceName = "badger"
202+
return &ns, token
203+
},
204+
want: true,
205+
},
206+
{
207+
name: "different proxy upstreams",
208+
ns: structs.TestNodeServiceAPIGateway(t),
209+
token: "foo",
210+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
211+
ns.Proxy.Upstreams = nil
212+
return &ns, token
213+
},
214+
want: true,
215+
},
216+
{
217+
name: "different mesh gateway mode (local)",
218+
ns: structs.TestNodeServiceAPIGateway(t),
219+
token: "foo",
220+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
221+
ns.Proxy.MeshGateway.Mode = structs.MeshGatewayModeLocal
222+
return &ns, token
223+
},
224+
want: true,
225+
},
226+
{
227+
name: "different mesh gateway mode (remote)",
228+
ns: structs.TestNodeServiceAPIGateway(t),
229+
token: "foo",
230+
mutate: func(ns structs.NodeService, token string) (*structs.NodeService, string) {
231+
ns.Proxy.MeshGateway.Mode = structs.MeshGatewayModeRemote
232+
return &ns, token
233+
},
234+
want: true,
235+
},
111236
}
112237

113238
for _, tt := range tests {

agent/structs/testing_catalog.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,35 @@ func TestNodeServiceMeshGateway(t testing.T) *NodeService {
176176
}
177177

178178
func TestNodeServiceAPIGateway(t testing.T) *NodeService {
179+
entMeta := DefaultEnterpriseMetaInPartition("")
179180
return &NodeService{
180181
Kind: ServiceKindAPIGateway,
181182
Service: "api-gateway",
182183
Address: "1.1.1.1",
184+
185+
// ---------------------------------------
186+
// Adding TestConnectProxyConfig to the proxy field here within TestNodeServiceAPIGateway
187+
// to test whether APIGateway is able to handle state changes within ConnectProxyConfig (TestStateChangedAPIGateway).
188+
// Please note:
189+
// The naming may suggest that ConnectProxyConfig should only be used for ConnectProxy,
190+
// but "APIGateway state" uses serviceInstance, which embeds ConnectProxyConfig as part of its state,
191+
// so any changes to ConnectProxyConfig will also impact APIGateway, such as change in "mesh gateway mode".
192+
//
193+
// For example, let's say a user updates the mesh gateway mode of an API gateway,
194+
// First NodeService.Proxy will be updated and then proxyCfg manager detects change in config
195+
// and it recreates the state for api_gateway which would copy the
196+
// NodeService.Proxy.MeshGateway to serviceInstance.proxyCfg.MeshGateway in `newServiceInstanceFromNodeService` (serviceInstance is part of state)
197+
// and then proxyCfg manager calls the api_gateway handleUpdates method which would
198+
// update the api_gateway upstreams with new meshGateway config.
199+
//
200+
// Now, this serviceInstance.proxyCfg and NodeService.Proxy
201+
// refers to same proxy configuration, which is of type ConnectProxyConfig.
202+
203+
// So, we need to test it as well.
204+
// ---------------------------------------
205+
206+
Proxy: TestConnectProxyConfig(t),
207+
EnterpriseMeta: *entMeta,
183208
}
184209
}
185210

agent/xds/clusters.go

Lines changed: 115 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1533,25 +1533,47 @@ func (s *ResourceGenerator) makeUpstreamClustersForDiscoveryChain(
15331533
for _, groupedTarget := range targetGroups {
15341534
s.Logger.Debug("generating cluster for", "cluster", groupedTarget.ClusterName)
15351535

1536+
// Now this makeUpstreamClusterForDiscoveryChain, is a generic method
1537+
// and used by connect proxy, ingress gateway and api gateway.
1538+
//
1539+
// Issue: This method always make cluster (without endpoints).
1540+
// `ClusterDiscoveryType: &envoy_cluster_v3.Cluster_Type{Type: envoy_cluster_v3.Cluster_EDS},`
1541+
//
1542+
// Envoy Exception:
1543+
// As we know that any service whose upstream endpoint is of type hostname,
1544+
// envoy cannot resolve hostname as EDS.
1545+
// So, we need to use CDS to send endpoints as well along with cluster configs.
1546+
//
1547+
// Context:
1548+
// When we have 2 consul DC peered with mesh gw ON AWS and
1549+
// we have API gw on DC1 which need to access a service X which exist on DC2.
1550+
// Also, API gateway is configured to in mesh-gw remote mode.
1551+
// In this case, API gateway upstream would be DC2's mesh-gateway and
1552+
// we configure API GW envoy upstream endpoint to it.
1553+
// The problem is envoy exception and AWS mesh-gw lb type.
1554+
// AWS generates hostname based endpoints for mesh-gw lb endpoint and
1555+
// when we have hostname based endpoints envoy cannot resolve it via EDS,
1556+
// So we configure that endpoint via CDS.
1557+
//
1558+
// If not fixed, whenever any service (gateway) whose upstream endpoint is of hostname type,
1559+
// cluster endpoints will be empty and envoy will fail to route traffic to that cluster.
1560+
//
1561+
// Fix: Add logic to check if we should create a cluster config
1562+
// without upstream endpoint or with upstream endpoint (hostnames)
1563+
// based on upstream endpoint type.
1564+
//
1565+
// You can refer makeUpstreamClusterForPeerService - used by connect proxy for similar logic.
1566+
// or makeGatewayCluster - used by mesh gw for peering services.
1567+
15361568
c := &envoy_cluster_v3.Cluster{
1537-
Name: groupedTarget.ClusterName,
1538-
AltStatName: groupedTarget.ClusterName,
1539-
ConnectTimeout: durationpb.New(node.Resolver.ConnectTimeout),
1540-
ClusterDiscoveryType: &envoy_cluster_v3.Cluster_Type{Type: envoy_cluster_v3.Cluster_EDS},
1569+
Name: groupedTarget.ClusterName,
1570+
AltStatName: groupedTarget.ClusterName,
1571+
ConnectTimeout: durationpb.New(node.Resolver.ConnectTimeout),
15411572
CommonLbConfig: &envoy_cluster_v3.Cluster_CommonLbConfig{
15421573
HealthyPanicThreshold: &envoy_type_v3.Percent{
15431574
Value: 0, // disable panic threshold
15441575
},
15451576
},
1546-
EdsClusterConfig: &envoy_cluster_v3.Cluster_EdsClusterConfig{
1547-
EdsConfig: &envoy_core_v3.ConfigSource{
1548-
InitialFetchTimeout: cfgSnap.GetXDSCommonConfig(s.Logger).GetXDSFetchTimeout(),
1549-
ResourceApiVersion: envoy_core_v3.ApiVersion_V3,
1550-
ConfigSourceSpecifier: &envoy_core_v3.ConfigSource_Ads{
1551-
Ads: &envoy_core_v3.AggregatedConfigSource{},
1552-
},
1553-
},
1554-
},
15551577
// TODO(peering): make circuit breakers or outlier detection work?
15561578
CircuitBreakers: &envoy_cluster_v3.CircuitBreakers{
15571579
Thresholds: makeThresholdsIfNeeded(upstreamConfig.Limits),
@@ -1582,7 +1604,60 @@ func (s *ResourceGenerator) makeUpstreamClustersForDiscoveryChain(
15821604
return nil, fmt.Errorf("cannot have more than one target")
15831605
}
15841606

1585-
if targetInfo := groupedTarget.Targets[0]; targetInfo.TLSContext != nil {
1607+
targetInfo := groupedTarget.Targets[0]
1608+
targetUID := proxycfg.NewUpstreamIDFromTargetID(targetInfo.TargetID)
1609+
1610+
meshGatewayMode, err := s.getMeshGatewayMode(cfgSnap, upstream, targetUID, groupedTarget.ClusterName)
1611+
if err != nil {
1612+
s.Logger.Error(err.Error(), "cluster", groupedTarget.ClusterName)
1613+
}
1614+
1615+
// Check if cluster need to be configured with hostnames or not.
1616+
useEDS := true
1617+
if targetUID.Peer != "" {
1618+
if _, ok := upstreamsSnapshot.PeerUpstreamEndpointsUseHostnames[targetUID]; ok {
1619+
// If we're using local mesh gw, the fact that upstreams use hostnames doesn't matter.
1620+
// If we're not using local mesh gw, then resort to CDS/DNS.
1621+
if meshGatewayMode != structs.MeshGatewayModeLocal {
1622+
useEDS = false
1623+
}
1624+
}
1625+
}
1626+
1627+
if useEDS {
1628+
c.ClusterDiscoveryType = &envoy_cluster_v3.Cluster_Type{Type: envoy_cluster_v3.Cluster_EDS}
1629+
c.EdsClusterConfig = &envoy_cluster_v3.Cluster_EdsClusterConfig{
1630+
EdsConfig: &envoy_core_v3.ConfigSource{
1631+
InitialFetchTimeout: cfgSnap.GetXDSCommonConfig(s.Logger).GetXDSFetchTimeout(),
1632+
ResourceApiVersion: envoy_core_v3.ApiVersion_V3,
1633+
ConfigSourceSpecifier: &envoy_core_v3.ConfigSource_Ads{
1634+
Ads: &envoy_core_v3.AggregatedConfigSource{},
1635+
},
1636+
},
1637+
}
1638+
} else {
1639+
hostnameEndpoints, ok := upstreamsSnapshot.PeerUpstreamEndpoints.Get(targetUID)
1640+
if !ok || len(hostnameEndpoints) == 0 {
1641+
// The upstream snapshot should deliver hostname endpoints soon; skip this cluster until then.
1642+
s.Logger.Debug("peer hostname endpoints not ready for discovery chain target",
1643+
"target", targetInfo.TargetID,
1644+
"upstream", targetUID,
1645+
"cluster", groupedTarget.ClusterName,
1646+
)
1647+
continue
1648+
}
1649+
c.EdsClusterConfig = nil
1650+
configureClusterWithHostnames(
1651+
s.Logger,
1652+
c,
1653+
"", /*TODO: should make configurable ? */
1654+
hostnameEndpoints,
1655+
true, /*isRemote*/
1656+
false, /*onlyPassing*/
1657+
)
1658+
}
1659+
1660+
if targetInfo.TLSContext != nil {
15861661
transportSocket, err := makeUpstreamTLSTransportSocket(targetInfo.TLSContext)
15871662
if err != nil {
15881663
return nil, err
@@ -1609,6 +1684,32 @@ func (s *ResourceGenerator) makeUpstreamClustersForDiscoveryChain(
16091684
return out, nil
16101685
}
16111686

1687+
func (s *ResourceGenerator) getMeshGatewayMode(
1688+
cfgSnap *proxycfg.ConfigSnapshot, upstream *structs.Upstream,
1689+
targetUID proxycfg.UpstreamID, clusterName string,
1690+
) (structs.MeshGatewayMode, error) {
1691+
defaultMode := structs.MeshGatewayModeDefault
1692+
switch cfgSnap.Kind {
1693+
case structs.ServiceKindConnectProxy:
1694+
upstreamConfig, _ := cfgSnap.ConnectProxy.
1695+
GetUpstream(targetUID, &cfgSnap.ProxyID.EnterpriseMeta)
1696+
if upstreamConfig != nil {
1697+
return upstreamConfig.MeshGateway.Mode, nil
1698+
}
1699+
return defaultMode, nil
1700+
case structs.ServiceKindAPIGateway,
1701+
structs.ServiceKindIngressGateway:
1702+
if upstream != nil {
1703+
return upstream.MeshGateway.Mode, nil
1704+
}
1705+
return defaultMode, nil
1706+
case structs.ServiceKindMeshGateway:
1707+
// Mesh Gateway mesh mode will always be remote.
1708+
return structs.MeshGatewayModeRemote, nil
1709+
}
1710+
return structs.MeshGatewayModeDefault, fmt.Errorf("unexpected service kind %q when determining mesh gateway mode for cluster %q", cfgSnap.Kind, clusterName)
1711+
}
1712+
16121713
func (s *ResourceGenerator) makeExportedUpstreamClustersForMeshGateway(cfgSnap *proxycfg.ConfigSnapshot) ([]proto.Message, error) {
16131714
// NOTE: Despite the mesh gateway already having one cluster per service
16141715
// (and subset) in the local datacenter we cannot reliably use those to

0 commit comments

Comments
 (0)