Skip to content

Commit 03548dc

Browse files
committed
net: Support all valid IP_MTU_DISCOVER values for UDP and TCP
The UDP endpoint only accepted PMTUDiscoveryDont and the TCP endpoint rejected PMTUDiscoveryProbe, causing QUIC libraries (quic-go) to fail when setting IP_PMTUDISC_DO or IP_PMTUDISC_PROBE on UDP sockets. Accept all valid PMTUD strategies (DONT/WANT/DO/PROBE) for both UDP and TCP. Since gVisor does not implement ICMP-based PMTU feedback, `PROBE` and `DO` are functionally equivalent — both set DF but neither updates a cached route PMTU. Test: UdpSocketTest.SetPMTUD, TcpSocketTest.SetPMTUD Signed-off-by: Tan Yifeng <yiftan@tencent.com>
1 parent 4eeb780 commit 03548dc

6 files changed

Lines changed: 88 additions & 40 deletions

File tree

pkg/tcpip/tcpip.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -955,9 +955,8 @@ const (
955955

956956
// MTUDiscoverOption is used to set/get the path MTU discovery setting.
957957
//
958-
// NOTE: Setting this option to any other value than PMTUDiscoveryDont
959-
// is not supported and will fail as such, and getting this option will
960-
// always return PMTUDiscoveryDont.
958+
// The value controls whether the Don't Fragment (DF) bit is set on
959+
// outgoing IPv4 packets.
961960
MTUDiscoverOption
962961

963962
// MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control

pkg/tcpip/transport/internal/network/endpoint.go

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ type Endpoint struct {
7474
ipv4TOS uint8
7575
// +checklocks:mu
7676
ipv6TClass uint8
77+
// +checklocks:mu
78+
pmtud tcpip.PMTUDStrategy
7779

7880
// Lock ordering: mu > infoMu.
7981
infoMu sync.RWMutex `state:"nosave"`
@@ -234,6 +236,7 @@ type WriteContext struct {
234236
route *stack.Route
235237
ttl uint8
236238
tos uint8
239+
df bool
237240
}
238241

239242
func (c *WriteContext) MTU() uint32 {
@@ -362,6 +365,7 @@ func (c *WriteContext) WritePacket(pkt *stack.PacketBuffer, headerIncluded bool)
362365
Protocol: c.e.transProto,
363366
TTL: c.ttl,
364367
TOS: c.tos,
368+
DF: c.df,
365369
ExperimentOptionValue: expOptVal,
366370
}, pkt)
367371

@@ -568,11 +572,28 @@ func (e *Endpoint) AcquireContextForWrite(opts tcpip.WriteOptions) (WriteContext
568572
panic(fmt.Sprintf("invalid protocol number = %d", netProto))
569573
}
570574

575+
// Set the DF (Don't Fragment) bit based on the PMTUD strategy,
576+
// matching TCP behavior in connect.go.
577+
// Note: In gVisor, WANT and DO are treated identically (both set DF).
578+
// The Linux kernel differentiates them (WANT allows local fragmentation,
579+
// DO returns EMSGSIZE), but gVisor's IPv4 layer always allows local
580+
// fragmentation for locally-generated packets regardless of DF
581+
// (see gvisor.dev/issue/5919).
582+
//
583+
// PROBE also sets DF, matching Linux ip_dont_fragment(). In Linux,
584+
// PROBE differs from DO only in that it ignores incoming ICMP
585+
// "Fragmentation Needed" messages (i.e. does not update the cached
586+
// route PMTU). Since gVisor does not implement ICMP-based PMTU
587+
// feedback for transport sockets, PROBE and DO are functionally
588+
// equivalent here.
589+
df := e.pmtud == tcpip.PMTUDiscoveryWant || e.pmtud == tcpip.PMTUDiscoveryDo || e.pmtud == tcpip.PMTUDiscoveryProbe
590+
571591
return WriteContext{
572592
e: e,
573593
route: route,
574594
ttl: ttl,
575595
tos: tos,
596+
df: df,
576597
}, nil
577598
}
578599

@@ -840,9 +861,18 @@ func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, bool) {
840861
func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
841862
switch opt {
842863
case tcpip.MTUDiscoverOption:
843-
// Return not supported if the value is not disabling path
844-
// MTU discovery.
845-
if tcpip.PMTUDStrategy(v) != tcpip.PMTUDiscoveryDont {
864+
// Store PMTU discovery settings. The DF bit on outgoing
865+
// packets is set accordingly in AcquireContextForWrite.
866+
// PROBE is accepted alongside DO/WANT/DONT. In Linux,
867+
// PROBE sets DF but ignores ICMP-based PMTU updates;
868+
// since gVisor lacks ICMP PMTU feedback, it behaves
869+
// identically to DO.
870+
switch tcpip.PMTUDStrategy(v) {
871+
case tcpip.PMTUDiscoveryWant, tcpip.PMTUDiscoveryDont, tcpip.PMTUDiscoveryDo, tcpip.PMTUDiscoveryProbe:
872+
e.mu.Lock()
873+
e.pmtud = tcpip.PMTUDStrategy(v)
874+
e.mu.Unlock()
875+
default:
846876
return &tcpip.ErrNotSupported{}
847877
}
848878

@@ -891,8 +921,10 @@ func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
891921
func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
892922
switch opt {
893923
case tcpip.MTUDiscoverOption:
894-
// The only supported setting is path MTU discovery disabled.
895-
return int(tcpip.PMTUDiscoveryDont), nil
924+
e.mu.Lock()
925+
v := int(e.pmtud)
926+
e.mu.Unlock()
927+
return v, nil
896928

897929
case tcpip.MulticastTTLOption:
898930
e.mu.Lock()

pkg/tcpip/transport/tcp/connect.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,8 @@ func (e *Endpoint) sendRaw(pkt *stack.PacketBuffer, flags header.TCPFlags, seq,
10391039
ack: ack,
10401040
rcvWnd: rcvWnd,
10411041
opts: options,
1042-
df: e.pmtud == tcpip.PMTUDiscoveryWant || e.pmtud == tcpip.PMTUDiscoveryDo,
1042+
// PROBE sets DF like DO; see network/endpoint.go for details.
1043+
df: e.pmtud == tcpip.PMTUDiscoveryWant || e.pmtud == tcpip.PMTUDiscoveryDo || e.pmtud == tcpip.PMTUDiscoveryProbe,
10431044
expOptVal: expOptVal,
10441045
}, pkt, e.gso)
10451046
}

pkg/tcpip/transport/tcp/endpoint.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,15 +1901,15 @@ func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
19011901
e.UnlockUser()
19021902

19031903
case tcpip.MTUDiscoverOption:
1904+
// PROBE is accepted alongside DO/WANT/DONT. In Linux,
1905+
// PROBE sets DF but ignores ICMP-based PMTU updates;
1906+
// since gVisor lacks ICMP PMTU feedback, it behaves
1907+
// identically to DO.
19041908
switch v := tcpip.PMTUDStrategy(v); v {
1905-
case tcpip.PMTUDiscoveryWant, tcpip.PMTUDiscoveryDont, tcpip.PMTUDiscoveryDo:
1909+
case tcpip.PMTUDiscoveryWant, tcpip.PMTUDiscoveryDont, tcpip.PMTUDiscoveryDo, tcpip.PMTUDiscoveryProbe:
19061910
e.LockUser()
19071911
e.pmtud = v
19081912
e.UnlockUser()
1909-
case tcpip.PMTUDiscoveryProbe:
1910-
// We don't support a way to ignore MTU updates; it's
1911-
// either on or it's off.
1912-
return &tcpip.ErrNotSupported{}
19131913
default:
19141914
return &tcpip.ErrNotSupported{}
19151915
}

test/syscalls/linux/tcp_socket.cc

Lines changed: 13 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2342,38 +2342,25 @@ TEST_P(TcpSocketTest, SetPMTUD) {
23422342
SyscallSucceeds());
23432343
EXPECT_EQ(got, IP_PMTUDISC_WANT);
23442344

2345-
int set = IP_PMTUDISC_DO;
2346-
ASSERT_THAT(
2347-
setsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &set, length),
2348-
SyscallSucceeds());
2349-
ASSERT_THAT(
2350-
getsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &got, &length),
2351-
SyscallSucceeds());
2352-
EXPECT_EQ(got, IP_PMTUDISC_DO);
2353-
set = IP_PMTUDISC_DONT;
2354-
ASSERT_THAT(
2355-
setsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &set, length),
2356-
SyscallSucceeds());
2357-
ASSERT_THAT(
2358-
getsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &got, &length),
2359-
SyscallSucceeds());
2360-
EXPECT_EQ(got, IP_PMTUDISC_DONT);
2361-
2362-
// IP_PMTUDISC_PROBE is not supported by gVisor.
2363-
set = IP_PMTUDISC_PROBE;
2364-
if (IsRunningOnGvisor() && !IsRunningWithHostinet()) {
2345+
// Set and verify each valid PMTUD strategy.
2346+
int strategies[] = {IP_PMTUDISC_DONT, IP_PMTUDISC_WANT, IP_PMTUDISC_DO,
2347+
IP_PMTUDISC_PROBE};
2348+
for (int strategy : strategies) {
23652349
ASSERT_THAT(
2366-
setsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &set, length),
2367-
SyscallFailsWithErrno(ENOTSUP));
2368-
} else {
2369-
ASSERT_THAT(
2370-
setsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &set, length),
2350+
setsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &strategy,
2351+
length),
23712352
SyscallSucceeds());
23722353
ASSERT_THAT(
23732354
getsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &got, &length),
23742355
SyscallSucceeds());
2375-
EXPECT_EQ(got, IP_PMTUDISC_PROBE);
2356+
EXPECT_EQ(got, strategy);
23762357
}
2358+
2359+
// Invalid value should fail.
2360+
int invalid = 99;
2361+
EXPECT_THAT(
2362+
setsockopt(accepted_.get(), SOL_IP, IP_MTU_DISCOVER, &invalid, length),
2363+
SyscallFails());
23772364
}
23782365

23792366
TEST_P(SimpleTcpSocketTest, GetSocketAcceptConnWithShutdown) {

test/syscalls/linux/udp_socket.cc

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2048,6 +2048,35 @@ TEST_P(UdpSocketTest, ConnectToZeroPortConnected) {
20482048
SyscallFailsWithErrno(ENOTCONN));
20492049
}
20502050

2051+
TEST_P(UdpSocketTest, SetPMTUD) {
2052+
// IP_PMTUDISC_WANT should be default.
2053+
int got = -1;
2054+
socklen_t length = sizeof(got);
2055+
ASSERT_THAT(
2056+
getsockopt(bind_.get(), SOL_IP, IP_MTU_DISCOVER, &got, &length),
2057+
SyscallSucceeds());
2058+
EXPECT_EQ(got, IP_PMTUDISC_WANT);
2059+
2060+
// Set and verify each valid PMTUD strategy.
2061+
int strategies[] = {IP_PMTUDISC_DONT, IP_PMTUDISC_WANT, IP_PMTUDISC_DO,
2062+
IP_PMTUDISC_PROBE};
2063+
for (int strategy : strategies) {
2064+
ASSERT_THAT(
2065+
setsockopt(bind_.get(), SOL_IP, IP_MTU_DISCOVER, &strategy, length),
2066+
SyscallSucceeds());
2067+
ASSERT_THAT(
2068+
getsockopt(bind_.get(), SOL_IP, IP_MTU_DISCOVER, &got, &length),
2069+
SyscallSucceeds());
2070+
EXPECT_EQ(got, strategy);
2071+
}
2072+
2073+
// Invalid value should fail.
2074+
int invalid = 99;
2075+
EXPECT_THAT(
2076+
setsockopt(bind_.get(), SOL_IP, IP_MTU_DISCOVER, &invalid, length),
2077+
SyscallFails());
2078+
}
2079+
20512080
INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest,
20522081
::testing::Values(AF_INET, AF_INET6));
20532082

0 commit comments

Comments
 (0)