Skip to content

Commit 297cdc4

Browse files
authored
fix: Eliminate flaky parallel test failures caused by port collisions and timing races
- Assign each Ginkgo parallel process a dedicated port range in [61000,65534] (above the Linux ephemeral range) to prevent cross-process port collisions - Add `ReservePort`/`ReleasePort`/`ReleaseAllPorts` to hold ports open until external processes (gorouter, NATS) bind them, eliminating the TOCTOU race - Apply correct reserve→release→start ordering across all 6 NATS call sites and all gorouter `exec.Command` launch sites - Fix flaky timing assertions in 4 test files by bracketing with `before`/`after` timestamps instead of comparing to `time.Now()` after the fact - Fix HTTP 100 Continue test: replace `NewResponse(100)` with a header-less `http.Response` literal to avoid spurious Connection: close on the proxy - Stop gorouter before NATS in AfterEach to prevent `log.Fatal` → `os.Exit(1)` killing the test binary before Ginkgo captures the result
1 parent 70f4fac commit 297cdc4

17 files changed

Lines changed: 246 additions & 77 deletions

File tree

src/code.cloudfoundry.org/gorouter/common/component_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,9 @@ var _ = Describe("Component", func() {
167167
var natsRunner *test_util.NATSRunner
168168

169169
BeforeEach(func() {
170-
natsPort := test_util.NextAvailPort()
170+
natsPort := test_util.ReservePort()
171171
natsRunner = test_util.NewNATSRunner(int(natsPort))
172+
test_util.ReleasePort(natsPort)
172173
natsRunner.Start()
173174
mbusClient = natsRunner.MessageBus
174175
mbusClient.Opts.SkipSubjectValidation = true

src/code.cloudfoundry.org/gorouter/handlers/reporter_test.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ var _ = Describe("Reporter Handler", func() {
8888
})
8989

9090
It("emits routing response metrics", func() {
91+
before := time.Now()
9192
handler.ServeHTTP(resp, req)
93+
after := time.Now()
9294

9395
Expect(fakeReporter.CaptureBadGatewayCallCount()).To(Equal(0))
9496

@@ -102,7 +104,11 @@ var _ = Describe("Reporter Handler", func() {
102104
Expect(capturedEndpoint.PrivateInstanceId).To(Equal("id"))
103105
Expect(capturedEndpoint.PrivateInstanceIndex).To(Equal("1"))
104106
Expect(capturedRespCode).To(Equal(http.StatusTeapot))
105-
Expect(startTime).To(BeTemporally("~", time.Now(), 100*time.Millisecond))
107+
// ReceivedAt is set to timeNow-1ms where timeNow is captured inside
108+
// the handler (between before and after), so the exact bracket is:
109+
// before-1ms <= startTime <= after-1ms
110+
Expect(startTime).To(BeTemporally(">=", before.Add(-1*time.Millisecond)))
111+
Expect(startTime).To(BeTemporally("<=", after.Add(-1*time.Millisecond)))
106112
Expect(latency).To(BeNumerically(">", 0))
107113
Expect(latency).To(BeNumerically("<", 10*time.Millisecond))
108114

src/code.cloudfoundry.org/gorouter/handlers/requestinfo_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,17 @@ var _ = Describe("RequestInfoHandler", func() {
5959
})
6060

6161
It("sets RequestInfo with StartTime on the context", func() {
62+
before := time.Now()
6263
handler.ServeHTTP(resp, req, nextHandler)
6364
var contextReq *http.Request
6465
Eventually(reqChan).Should(Receive(&contextReq))
65-
66-
expectedStartTime := time.Now()
66+
after := time.Now()
6767

6868
ri, err := handlers.ContextRequestInfo(contextReq)
6969
Expect(err).ToNot(HaveOccurred())
7070
Expect(ri).ToNot(BeNil())
71-
Expect(ri.ReceivedAt).To(BeTemporally("~", expectedStartTime, 10*time.Millisecond))
71+
Expect(ri.ReceivedAt).To(BeTemporally(">=", before))
72+
Expect(ri.ReceivedAt).To(BeTemporally("<=", after))
7273

7374
})
7475
})

src/code.cloudfoundry.org/gorouter/integration/common_integration_test.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,17 @@ func (s *testState) SetOnlyTrustClientCACertsTrue() {
6363

6464
func NewTestState() *testState {
6565
// TODO: don't hide so much behind these test_util methods
66-
cfg, clientTLSConfig := test_util.SpecSSLConfig(test_util.NextAvailPort(), test_util.NextAvailPort(), test_util.NextAvailPort(), test_util.NextAvailPort(), test_util.NextAvailPort(), test_util.NextAvailPort(), test_util.NextAvailPort())
66+
// Use ReservePort to keep listeners open until the gorouter process
67+
// starts, preventing other processes from grabbing these ports.
68+
cfg, clientTLSConfig := test_util.SpecSSLConfig(test_util.ReservePort(), test_util.ReservePort(), test_util.ReservePort(), test_util.ReservePort(), test_util.ReservePort(), test_util.ReservePort(), test_util.ReservePort())
6769
cfg.SkipSSLValidation = false
6870
cfg.RouteServicesHairpinning = false
6971
cfg.CipherString = "ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384"
7072

7173
// TODO: why these magic numbers?
7274
cfg.PruneStaleDropletsInterval = 2 * time.Second
7375
cfg.DropletStaleThreshold = 10 * time.Second
74-
cfg.StartResponseDelayInterval = 1 * time.Second
76+
cfg.StartResponseDelayInterval = 0
7577
cfg.EndpointTimeout = 15 * time.Second
7678
cfg.EndpointDialTimeout = 500 * time.Millisecond
7779
cfg.DrainTimeout = 200 * time.Millisecond
@@ -258,6 +260,10 @@ func (s *testState) registerAndWait(rm mbus.RegistryMessage) {
258260
func (s *testState) StartGorouter() *Session {
259261
Expect(s.cfg).NotTo(BeNil(), "set up test cfg before calling this function")
260262

263+
// Release NATS port first so the NATS server can bind it, while keeping
264+
// the other ports reserved until the gorouter starts.
265+
test_util.ReleasePort(s.cfg.Nats.Hosts[0].Port)
266+
261267
s.natsRunner = test_util.NewNATSRunner(int(s.cfg.Nats.Hosts[0].Port))
262268
s.natsRunner.Start()
263269

@@ -271,6 +277,10 @@ func (s *testState) StartGorouter() *Session {
271277
Expect(err).ToNot(HaveOccurred())
272278
Expect(os.WriteFile(cfgFile, cfgBytes, 0644)).To(Succeed())
273279

280+
// Release remaining reserved ports just before the gorouter process
281+
// starts, minimizing the TOCTOU window between release and bind.
282+
test_util.ReleaseAllPorts()
283+
274284
cmd := exec.Command(gorouterPath, "-c", cfgFile)
275285
s.gorouterSession, err = Start(cmd, GinkgoWriter, GinkgoWriter)
276286
Expect(err).ToNot(HaveOccurred())
@@ -297,6 +307,12 @@ func (s *testState) StartGorouterOrFail() {
297307
}
298308

299309
func (s *testState) StopAndCleanup() {
310+
// Stop router before NATS to prevent subscriber's ClosedCB from
311+
// firing log.Fatal → os.Exit(1), which kills the test proc.
312+
if s.gorouterSession != nil && s.gorouterSession.ExitCode() == -1 {
313+
Eventually(s.gorouterSession.Terminate(), 5).Should(Exit(0))
314+
}
315+
300316
if s.natsRunner != nil {
301317
s.natsRunner.Stop()
302318
}
@@ -308,10 +324,6 @@ func (s *testState) StopAndCleanup() {
308324

309325
os.RemoveAll(s.tmpdir)
310326

311-
if s.gorouterSession != nil && s.gorouterSession.ExitCode() == -1 {
312-
Eventually(s.gorouterSession.Terminate(), 5).Should(Exit(0))
313-
}
314-
315327
if s.fakeMetron != nil {
316328
s.StopMetron()
317329
}

src/code.cloudfoundry.org/gorouter/integration/gdpr_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ var _ = Describe("GDPR", func() {
6565
testState.EnableAccessLog()
6666
testState.cfg.Status.Pass = "pass"
6767
testState.cfg.Status.User = "user"
68-
testState.cfg.Status.Routes.Port = 6705
68+
testState.cfg.Status.Routes.Port = test_util.ReservePort()
6969
testState.cfg.Logging.DisableLogForwardedFor = true
7070
testState.StartGorouterOrFail()
7171

@@ -136,7 +136,7 @@ var _ = Describe("GDPR", func() {
136136
testState.EnableAccessLog()
137137
testState.cfg.Status.Pass = "pass"
138138
testState.cfg.Status.User = "user"
139-
testState.cfg.Status.Routes.Port = 6706
139+
testState.cfg.Status.Routes.Port = test_util.ReservePort()
140140
testState.cfg.Logging.DisableLogSourceIP = true
141141
testState.StartGorouterOrFail()
142142

src/code.cloudfoundry.org/gorouter/integration/main_test.go

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -65,29 +65,33 @@ var _ = Describe("Router Integration", func() {
6565
Expect(err).ToNot(HaveOccurred())
6666
cfgFile = filepath.Join(tmpdir, "config.yml")
6767

68-
statusPort = test_util.NextAvailPort()
69-
statusTLSPort = test_util.NextAvailPort()
70-
statusRoutesPort = test_util.NextAvailPort()
71-
proxyPort = test_util.NextAvailPort()
72-
natsPort = test_util.NextAvailPort()
73-
sslPort = test_util.NextAvailPort()
74-
routeServiceServerPort = test_util.NextAvailPort()
68+
statusPort = test_util.ReservePort()
69+
statusTLSPort = test_util.ReservePort()
70+
statusRoutesPort = test_util.ReservePort()
71+
proxyPort = test_util.ReservePort()
72+
natsPort = test_util.ReservePort()
73+
sslPort = test_util.ReservePort()
74+
routeServiceServerPort = test_util.ReservePort()
7575

7676
natsRunner = test_util.NewNATSRunner(int(natsPort))
77+
test_util.ReleasePort(natsPort)
7778
natsRunner.Start()
7879
oauthServerURL = oauthServer.Addr()
7980
})
8081

8182
AfterEach(func() {
83+
test_util.ReleaseAllPorts()
84+
// Stop router before NATS to prevent subscriber's ClosedCB from
85+
// firing log.Fatal → os.Exit(1), which kills the test proc.
86+
if gorouterSession != nil && gorouterSession.ExitCode() == -1 {
87+
stopGorouter(gorouterSession)
88+
}
89+
8290
if natsRunner != nil {
8391
natsRunner.Stop()
8492
}
8593

8694
os.RemoveAll(tmpdir)
87-
88-
if gorouterSession != nil && gorouterSession.ExitCode() == -1 {
89-
stopGorouter(gorouterSession)
90-
}
9195
})
9296

9397
Context("when config is invalid", func() {
@@ -609,6 +613,7 @@ var _ = Describe("Router Integration", func() {
609613
tempCfg.Logging.MetronAddress = ""
610614
writeConfig(tempCfg, cfgFile)
611615

616+
test_util.ReleaseAllPorts()
612617
gorouterCmd := exec.Command(gorouterPath, "-c", cfgFile)
613618
gorouterSession, _ = Start(gorouterCmd, GinkgoWriter, GinkgoWriter)
614619
Eventually(gorouterSession, 5*time.Second).Should(Exit(1))
@@ -635,7 +640,7 @@ var _ = Describe("Router Integration", func() {
635640

636641
BeforeEach(func() {
637642
testState = NewTestState()
638-
testState.cfg.DebugAddr = fmt.Sprintf("127.0.0.1:%d", test_util.NextAvailPort())
643+
testState.cfg.DebugAddr = fmt.Sprintf("127.0.0.1:%d", test_util.ReservePort())
639644
testState.StartGorouterOrFail()
640645
gorouterSession = testState.gorouterSession
641646

@@ -1047,7 +1052,7 @@ var _ = Describe("Router Integration", func() {
10471052
Describe("prometheus metrics", func() {
10481053
It("starts a prometheus https server", func() {
10491054
c := createConfig(statusPort, statusTLSPort, statusRoutesPort, proxyPort, routeServiceServerPort, cfgFile, defaultPruneInterval, defaultPruneThreshold, 0, false, 0, natsPort)
1050-
metricsPort := test_util.NextAvailPort()
1055+
metricsPort := test_util.ReservePort()
10511056
serverCAPath, serverCertPath, serverKeyPath, clientCert := tls_helpers.GenerateCaAndMutualTlsCerts()
10521057

10531058
c.Prometheus.Enabled = true
@@ -1421,6 +1426,7 @@ var _ = Describe("Router Integration", func() {
14211426
It("does not exit", func() {
14221427
writeConfig(cfg, cfgFile)
14231428

1429+
test_util.ReleaseAllPorts()
14241430
gorouterCmd := exec.Command(gorouterPath, "-c", cfgFile)
14251431
session, err := Start(gorouterCmd, GinkgoWriter, GinkgoWriter)
14261432
Expect(err).ToNot(HaveOccurred())
@@ -1436,6 +1442,7 @@ var _ = Describe("Router Integration", func() {
14361442
It("gorouter exits with non-zero code", func() {
14371443
writeConfig(cfg, cfgFile)
14381444

1445+
test_util.ReleaseAllPorts()
14391446
gorouterCmd := exec.Command(gorouterPath, "-c", cfgFile)
14401447
session, err := Start(gorouterCmd, GinkgoWriter, GinkgoWriter)
14411448
Expect(err).ToNot(HaveOccurred())
@@ -1453,6 +1460,7 @@ var _ = Describe("Router Integration", func() {
14531460
routingApiServer.Close()
14541461
writeConfig(cfg, cfgFile)
14551462

1463+
test_util.ReleaseAllPorts()
14561464
gorouterCmd := exec.Command(gorouterPath, "-c", cfgFile)
14571465
session, err := Start(gorouterCmd, GinkgoWriter, GinkgoWriter)
14581466
Expect(err).ToNot(HaveOccurred())
@@ -1468,6 +1476,7 @@ var _ = Describe("Router Integration", func() {
14681476
cfg.OAuth.Port = 0
14691477
writeConfig(cfg, cfgFile)
14701478

1479+
test_util.ReleaseAllPorts()
14711480
gorouterCmd := exec.Command(gorouterPath, "-c", cfgFile)
14721481
session, err := Start(gorouterCmd, GinkgoWriter, GinkgoWriter)
14731482
Expect(err).ToNot(HaveOccurred())

src/code.cloudfoundry.org/gorouter/integration/nats_test.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,27 +36,31 @@ var _ = Describe("NATS Integration", func() {
3636
Expect(err).ToNot(HaveOccurred())
3737
cfgFile = filepath.Join(tmpdir, "config.yml")
3838

39-
statusPort = test_util.NextAvailPort()
40-
statusTLSPort = test_util.NextAvailPort()
41-
statusRoutesPort = test_util.NextAvailPort()
42-
proxyPort = test_util.NextAvailPort()
43-
natsPort = test_util.NextAvailPort()
44-
routeServiceServerPort = test_util.NextAvailPort()
39+
statusPort = test_util.ReservePort()
40+
statusTLSPort = test_util.ReservePort()
41+
statusRoutesPort = test_util.ReservePort()
42+
proxyPort = test_util.ReservePort()
43+
natsPort = test_util.ReservePort()
44+
routeServiceServerPort = test_util.ReservePort()
4545

4646
natsRunner = test_util.NewNATSRunner(int(natsPort))
47+
test_util.ReleasePort(natsPort)
4748
natsRunner.Start()
4849
})
4950

5051
AfterEach(func() {
52+
test_util.ReleaseAllPorts()
53+
// Stop router before NATS to prevent subscriber's ClosedCB from
54+
// firing log.Fatal → os.Exit(1), which kills the test proc.
55+
if gorouterSession != nil && gorouterSession.ExitCode() == -1 {
56+
stopGorouter(gorouterSession)
57+
}
58+
5159
if natsRunner != nil {
5260
natsRunner.Stop()
5361
}
5462

5563
os.RemoveAll(tmpdir)
56-
57-
if gorouterSession != nil && gorouterSession.ExitCode() == -1 {
58-
stopGorouter(gorouterSession)
59-
}
6064
})
6165

6266
It("has Nats connectivity", func() {
@@ -162,7 +166,7 @@ var _ = Describe("NATS Integration", func() {
162166
)
163167

164168
BeforeEach(func() {
165-
natsPort2 = test_util.NextAvailPort()
169+
natsPort2 = test_util.ReservePort()
166170
natsRunner2 = test_util.NewNATSRunner(int(natsPort2))
167171

168172
pruneInterval = 2 * time.Second
@@ -206,6 +210,7 @@ var _ = Describe("NATS Integration", func() {
206210
time.Sleep(heartbeatInterval * 2)
207211

208212
natsRunner.Stop()
213+
test_util.ReleasePort(natsPort2)
209214
natsRunner2.Start()
210215

211216
// Give router time to make a bad decision (i.e. prune routes)
@@ -222,7 +227,7 @@ var _ = Describe("NATS Integration", func() {
222227
Context("when suspend_pruning_if_nats_unavailable enabled", func() {
223228

224229
BeforeEach(func() {
225-
natsPort2 = test_util.NextAvailPort()
230+
natsPort2 = test_util.ReservePort()
226231
natsRunner2 = test_util.NewNATSRunner(int(natsPort2))
227232

228233
pruneInterval = 200 * time.Millisecond

src/code.cloudfoundry.org/gorouter/integration/test_utils_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func configDrainSetup(cfg *config.Config, pruneInterval, pruneThreshold time.Dur
4646
// as part of pausing
4747
cfg.PruneStaleDropletsInterval = pruneInterval
4848
cfg.DropletStaleThreshold = pruneThreshold
49-
cfg.StartResponseDelayInterval = 1 * time.Second
49+
cfg.StartResponseDelayInterval = 0
5050
cfg.EndpointTimeout = 5 * time.Second
5151
cfg.EndpointDialTimeout = 500 * time.Millisecond
5252
cfg.DrainTimeout = 200 * time.Millisecond
@@ -60,6 +60,7 @@ func writeConfig(cfg *config.Config, cfgFile string) {
6060
}
6161

6262
func startGorouterSession(cfgFile string) *Session {
63+
test_util.ReleaseAllPorts()
6364
gorouterCmd := exec.Command(gorouterPath, "-c", cfgFile)
6465
session, err := Start(gorouterCmd, GinkgoWriter, GinkgoWriter)
6566
Expect(err).ToNot(HaveOccurred())

src/code.cloudfoundry.org/gorouter/logger/logger.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package logger
22

33
import (
4+
"fmt"
45
"io"
56
"log/slog"
67
"os"
@@ -235,6 +236,8 @@ via os.Exit(1) after writing the log message.
235236
*/
236237
func Fatal(logger *slog.Logger, message string, slogAttrs ...any) {
237238
logger.Error(message, slogAttrs...)
239+
// Write to stderr so the message survives os.Exit (stderr is unbuffered).
240+
fmt.Fprintf(os.Stderr, "FATAL: %s %v\n", message, slogAttrs)
238241
os.Exit(1)
239242
}
240243

src/code.cloudfoundry.org/gorouter/mbus/subscriber_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ var _ = Describe("Subscriber", func() {
3838
)
3939

4040
BeforeEach(func() {
41-
natsPort = test_util.NextAvailPort()
42-
41+
natsPort = test_util.ReservePort()
4342
natsRunner = test_util.NewNATSRunner(int(natsPort))
43+
test_util.ReleasePort(natsPort)
4444
natsRunner.Start()
4545
natsClient = natsRunner.MessageBus
4646

@@ -60,13 +60,13 @@ var _ = Describe("Subscriber", func() {
6060
})
6161

6262
AfterEach(func() {
63-
if natsRunner != nil {
64-
natsRunner.Stop()
65-
}
6663
if process != nil {
6764
process.Signal(os.Interrupt)
6865
}
6966
process = nil
67+
if natsRunner != nil {
68+
natsRunner.Stop()
69+
}
7070
})
7171

7272
It("exits when signaled", func() {

0 commit comments

Comments
 (0)