Skip to content

Commit eeee27e

Browse files
TeoSlayerteovl
andauthored
Fix NAT keepalive dropped by src-binding; flow poll retry; recover test (#303)
Co-authored-by: Teodor Calin <teodor@vulturelabs.io>
1 parent 572e14c commit eeee27e

4 files changed

Lines changed: 96 additions & 6 deletions

File tree

cmd/pilotctl/verify_flow.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,12 +155,22 @@ func cmdVerifyProvider(flags map[string]string, provider string) {
155155

156156
deadline := time.Now().Add(10 * time.Minute)
157157
var badge, badgeSig string
158+
// Tolerate transient poll failures (a dropped frame, a relay blip) rather
159+
// than aborting a flow the user may have already half-completed. Only give
160+
// up after several consecutive errors.
161+
const maxPollErrors = 5
162+
pollErrors := 0
158163
for time.Now().Before(deadline) {
159164
time.Sleep(5 * time.Second)
160165
poll, err := verifierRoundtrip(d, vaddr, verifierRequest{Op: "poll", FlowID: begin.FlowID})
161166
if err != nil {
162-
fatalCode("connection_failed", "verify poll: %v", err)
167+
pollErrors++
168+
if pollErrors >= maxPollErrors {
169+
fatalCode("connection_failed", "verify poll: %d consecutive failures: %v", pollErrors, err)
170+
}
171+
continue
163172
}
173+
pollErrors = 0
164174
switch poll.Status {
165175
case "ready":
166176
badge, badgeSig = poll.Badge, poll.BadgeSig

cmd/pilotctl/zz_verify_cmds_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,52 @@ func TestNodeArgToID(t *testing.T) {
126126
t.Errorf("address: got %d, want 99", got)
127127
}
128128
}
129+
130+
// TestCmdRecoveryRecoverInstallsNewIdentity covers the most destructive command:
131+
// keyless force-rotate via the registry, then install the new identity locally.
132+
func TestCmdRecoveryRecoverInstallsNewIdentity(t *testing.T) {
133+
dir := t.TempDir()
134+
newKey := filepath.Join(dir, "new.json")
135+
id, err := crypto.GenerateIdentity()
136+
if err != nil {
137+
t.Fatalf("keygen: %v", err)
138+
}
139+
if err := crypto.SaveIdentity(newKey, id); err != nil {
140+
t.Fatalf("save new key: %v", err)
141+
}
142+
newPub := crypto.EncodePublicKey(id.PublicKey)
143+
idPath := filepath.Join(dir, "installed.json")
144+
145+
r := newFakeRegistry(t)
146+
var gotPub, gotRecovery string
147+
r.on("recover_identity", func(req map[string]interface{}) map[string]interface{} {
148+
gotPub, _ = req["new_public_key"].(string)
149+
gotRecovery, _ = req["recovery"].(string)
150+
return map[string]interface{}{"type": "recover_identity_ok", "ok": true, "node_id": float64(99)}
151+
})
152+
useRegistry(t, r)
153+
154+
prev := jsonOutput
155+
defer func() { jsonOutput = prev }()
156+
jsonOutput = true
157+
_ = captureStdout(t, func() {
158+
cmdRecovery([]string{"recover", "--node", "99", "--new-key", newKey,
159+
"--recovery", "pilotrecover:v1:99:bmV3:Y29t:9999999999:nn:rec-v1", "--recovery-sig", "c2ln",
160+
"--identity", idPath})
161+
})
162+
163+
if gotRecovery == "" {
164+
t.Fatal("registry never received recover_identity")
165+
}
166+
if gotPub != newPub {
167+
t.Errorf("registry got new_public_key=%q, want the new-key pubkey %q", gotPub, newPub)
168+
}
169+
// The recovered key must be installed at the daemon identity path.
170+
installed, err := crypto.LoadIdentity(idPath)
171+
if err != nil {
172+
t.Fatalf("new identity not installed at %s: %v", idPath, err)
173+
}
174+
if crypto.EncodePublicKey(installed.PublicKey) != newPub {
175+
t.Errorf("installed identity does not match the recovered key")
176+
}
177+
}

pkg/daemon/tunnel.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,19 @@ var TunnelKeepaliveInterval = 25 * time.Second
791791
// "ping" packet. The encrypted payload still authenticates as us
792792
// (peer's AEAD verifies our nodeID AAD), so an attacker can't forge
793793
// keepalives to keep stale entries warm.
794+
// newKeepalivePacket builds the tiny ProtoControl/PortPing keepalive. It
795+
// stamps our own node id as the inner source: the receiver enforces
796+
// pkt.Src.Node == the AEAD-authenticated peer, so a zero-Src keepalive would
797+
// be dropped as spoofed and the peer's NAT mapping would then expire.
798+
func (tm *TunnelManager) newKeepalivePacket() *protocol.Packet {
799+
return &protocol.Packet{
800+
Version: protocol.Version,
801+
Protocol: protocol.ProtoControl,
802+
DstPort: protocol.PortPing,
803+
Src: protocol.Addr{Node: tm.loadNodeID()},
804+
}
805+
}
806+
794807
func (tm *TunnelManager) keepaliveSweep(now time.Time) int {
795808
type peerInfo struct {
796809
id uint32
@@ -814,11 +827,7 @@ func (tm *TunnelManager) keepaliveSweep(now time.Time) int {
814827

815828
sent := 0
816829
for _, p := range stale {
817-
ka := &protocol.Packet{
818-
Version: protocol.Version,
819-
Protocol: protocol.ProtoControl,
820-
DstPort: protocol.PortPing,
821-
}
830+
ka := tm.newKeepalivePacket()
822831
plaintext, err := ka.Marshal()
823832
if err != nil {
824833
continue

pkg/daemon/zz_nat_keepalive_bug_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,28 @@ func TestHandleEncryptedDropsKeepaliveBeforeRecvCh(t *testing.T) {
233233
}
234234
}
235235

236+
// TestKeepalivePacketStampsOwnSource pins the fix for the regression where the
237+
// transport src-binding check dropped NAT keepalives: keepaliveSweep left
238+
// Src.Node=0, but the receiver enforces pkt.Src.Node == the authenticated peer,
239+
// so a zero-Src keepalive was dropped and the NAT mapping then expired. The
240+
// keepalive must carry our own node id (= the peer id from the receiver's view).
241+
func TestKeepalivePacketStampsOwnSource(t *testing.T) {
242+
t.Parallel()
243+
tm := NewTunnelManager()
244+
t.Cleanup(func() { tm.Close() })
245+
246+
const selfID uint32 = 0xAB00CD01
247+
tm.SetNodeID(selfID)
248+
249+
ka := tm.newKeepalivePacket()
250+
if ka.Src.Node != selfID {
251+
t.Fatalf("keepalive Src.Node = %#x, want own id %#x — a zero/wrong Src is dropped by the receiver's spoof check, expiring the NAT mapping", ka.Src.Node, selfID)
252+
}
253+
if ka.Protocol != protocol.ProtoControl || ka.DstPort != protocol.PortPing {
254+
t.Fatalf("keepalive shape wrong: protocol=%d dstport=%d", ka.Protocol, ka.DstPort)
255+
}
256+
}
257+
236258
// TestRecordOutboundSendStampsTimestamp pins the half of the fix that
237259
// already lands in RED: every successful writeFrame call must update
238260
// tm.lastOutboundSend[nodeID] so the eventual keepaliveSweep can tell

0 commit comments

Comments
 (0)