From 8d3d8250e892060743d186b14f8dd3b90b15dddb Mon Sep 17 00:00:00 2001 From: Pawan Kalyan <91543630+pawannn@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:04:06 -0700 Subject: [PATCH] Fix: Next-Hop MTU in ICMPv4 "Fragmentation Needed" Packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes https://github.com/google/gvisor/issues/12568 ### Problem - When gVisor sends an ICMPv4 “Fragmentation Needed” message (Type 3, Code 4), it was leaving the Next-Hop MTU field as 0. - According to RFC 1191, this field must contain the MTU of the next-hop link. Sending 0 is incorrect. ### Impact - When a Linux host receives a Fragmentation Needed message with MTU set to 0, it falls back to an old default value from the RFC plateau table. In practice, this results in the system caching an MTU of 552 bytes. - That causes connections to containers behind gVisor to degrade significantly, especially for larger packets. It can also create long-lived MTU locks in the routing cache, which impacts performance to external services like GitHub. ### Root cause **In the IPv4 forwarding path**: If a packet: - Has the Don’t Fragment bit set - Exceeds the outgoing link MTU gVisor correctly generates an ICMP Destination Unreachable with Code 4. However, while constructing the ICMP header, it never sets the Next-Hop MTU field using icmpHdr.SetMTU(). As a result, the MTU remains 0. The method to set it already exists. It just wasn’t being used. ### What this change does 1. Introduces a new ICMP reason type that carries the next-hop MTU value. 2. Updates sendICMPv4 to: - Set the correct ICMP code for Fragmentation Needed. - Populate the MTU field with the actual link MTU. - Safely cap it to uint16 maximum. 3. Updates the forwarding path to pass the correct MTU when generating the ICMP error. ### Result Now, when a packet is too large and cannot be forwarded: - gVisor correctly includes the actual next-hop MTU in the ICMP response. - Hosts update their Path MTU properly. - No fallback to 552 bytes. - No unnecessary performance degradation. FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/12634 from pawannn:master 2067f61703c2d05f952cf66bec6300dd04549fa4 PiperOrigin-RevId: 892405056 --- pkg/tcpip/network/ipv4/icmp.go | 39 ++++++++++++++++++++++------------ pkg/tcpip/network/ipv4/ipv4.go | 4 +++- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index e3d79c3ee9..980fade11c 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -16,6 +16,7 @@ package ipv4 import ( "fmt" + "math" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" @@ -599,7 +600,12 @@ func (*icmpReasonNetworkUnreachable) isICMPReason() {} // icmpReasonFragmentationNeeded is an error where a packet requires // fragmentation while also having the Don't Fragment flag set, as per RFC 792 // page 3, Destination Unreachable Message. -type icmpReasonFragmentationNeeded struct{} +type icmpReasonFragmentationNeeded struct { + // mtu is the MTU of the next-hop link. Per RFC 1191 §4, this value + // must be included in the ICMP Fragmentation Needed message so the + // sender can update its path MTU cache. + mtu uint32 +} func (*icmpReasonFragmentationNeeded) isICMPReason() {} @@ -704,30 +710,36 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer, deliv } sent := netEP.stats.icmp.packetsSent - icmpType, icmpCode, counter, pointer := func() (header.ICMPv4Type, header.ICMPv4Code, tcpip.MultiCounterStat, byte) { + icmpType, icmpCode, counter, pointer, nextHopMTU := func() (header.ICMPv4Type, header.ICMPv4Code, tcpip.MultiCounterStat, byte, uint16) { switch reason := reason.(type) { case *icmpReasonNetworkProhibited: - return header.ICMPv4DstUnreachable, header.ICMPv4NetProhibited, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4NetProhibited, sent.dstUnreachable, 0, 0 case *icmpReasonHostProhibited: - return header.ICMPv4DstUnreachable, header.ICMPv4HostProhibited, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4HostProhibited, sent.dstUnreachable, 0, 0 case *icmpReasonAdministrativelyProhibited: - return header.ICMPv4DstUnreachable, header.ICMPv4AdminProhibited, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4AdminProhibited, sent.dstUnreachable, 0, 0 case *icmpReasonPortUnreachable: - return header.ICMPv4DstUnreachable, header.ICMPv4PortUnreachable, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4PortUnreachable, sent.dstUnreachable, 0, 0 case *icmpReasonProtoUnreachable: - return header.ICMPv4DstUnreachable, header.ICMPv4ProtoUnreachable, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4ProtoUnreachable, sent.dstUnreachable, 0, 0 case *icmpReasonNetworkUnreachable: - return header.ICMPv4DstUnreachable, header.ICMPv4NetUnreachable, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4NetUnreachable, sent.dstUnreachable, 0, 0 case *icmpReasonHostUnreachable: - return header.ICMPv4DstUnreachable, header.ICMPv4HostUnreachable, sent.dstUnreachable, 0 + return header.ICMPv4DstUnreachable, header.ICMPv4HostUnreachable, sent.dstUnreachable, 0, 0 case *icmpReasonFragmentationNeeded: - return header.ICMPv4DstUnreachable, header.ICMPv4FragmentationNeeded, sent.dstUnreachable, 0 + // Per RFC 1191 §4, include the next-hop MTU in the ICMP message. + // Cap at MaxUint16 since the field is 16 bits wide. + mtu := reason.mtu + if mtu > math.MaxUint16 { + mtu = math.MaxUint16 + } + return header.ICMPv4DstUnreachable, header.ICMPv4FragmentationNeeded, sent.dstUnreachable, 0, uint16(mtu) case *icmpReasonTTLExceeded: - return header.ICMPv4TimeExceeded, header.ICMPv4TTLExceeded, sent.timeExceeded, 0 + return header.ICMPv4TimeExceeded, header.ICMPv4TTLExceeded, sent.timeExceeded, 0, 0 case *icmpReasonReassemblyTimeout: - return header.ICMPv4TimeExceeded, header.ICMPv4ReassemblyTimeout, sent.timeExceeded, 0 + return header.ICMPv4TimeExceeded, header.ICMPv4ReassemblyTimeout, sent.timeExceeded, 0, 0 case *icmpReasonParamProblem: - return header.ICMPv4ParamProblem, header.ICMPv4UnusedCode, sent.paramProblem, reason.pointer + return header.ICMPv4ParamProblem, header.ICMPv4UnusedCode, sent.paramProblem, reason.pointer, 0 default: panic(fmt.Sprintf("unsupported ICMP type %T", reason)) } @@ -796,6 +808,7 @@ func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer, deliv icmpHdr.SetCode(icmpCode) icmpHdr.SetType(icmpType) icmpHdr.SetPointer(pointer) + icmpHdr.SetMTU(nextHopMTU) icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data().Checksum())) if err := route.WritePacket( diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index ff12d2fd6b..f774dabf41 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -760,7 +760,9 @@ func (e *endpoint) forwardPacketWithRoute(route *stack.Route, pkt *stack.PacketB // WriteHeaderIncludedPacket checks for the presence of the Don't Fragment bit // while sending the packet and returns this error iff fragmentation is // necessary and the bit is also set. - _ = e.protocol.returnError(&icmpReasonFragmentationNeeded{}, pkt, false /* deliveredLocally */) + _ = e.protocol.returnError(&icmpReasonFragmentationNeeded{ + mtu: e.nic.MTU(), + }, pkt, false /* deliveredLocally */) return &ip.ErrMessageTooLong{} case *tcpip.ErrNoBufferSpace: return &ip.ErrOutgoingDeviceNoBufferSpace{}