Skip to content

Commit 79ccfa2

Browse files
authored
Go: Fix flaky TestScriptKillWithRoute race condition (#5950)
* Fix flaky TestScriptKillWithRoute by increasing timeout and replacing sleep with polling The test was flaky because: 1. The script invoked in a goroutine hadn't started executing on the server before SCRIPT KILL was attempted, causing 'NotBusy' errors. 2. The 5-second timeout was insufficient for slow CI environments. 3. The fixed time.Sleep(1s) at the end was unreliable for confirming the script was no longer running. Changes: - Increase script duration from 6s to 10s to ensure it's still running when the kill succeeds. - Increase kill polling timeout from 5s to 8s to accommodate slow script startup in CI. - Replace time.Sleep(1s) at the end with a polling loop that waits for the 'notbusy' state, making the test deterministic. Signed-off-by: Thomas Zhou <thomas.zhou@improving.com> Signed-off-by: Thomas Zhou <thomaszhou64@gmail.com> * Fix flaky TestScriptKillWithRoute by blocking on script execution Restructure the test to match the working testFunctionKillNoWrite pattern: - Run InvokeScriptWithRoute in the main thread (blocking) to guarantee the script is executing on the server before kill is attempted - Run ScriptKill polling in a goroutine - Use a longer request timeout (12s) so the invoke call blocks until killed This eliminates the race condition where ScriptKill was called before the script started, causing 'NotBusy' errors. Verified: 250/250 sequential runs with 0 failures. Fixes #5576 Signed-off-by: Thomas Zhou <thomaszhou64@gmail.com> --------- Signed-off-by: Thomas Zhou <thomas.zhou@improving.com> Signed-off-by: Thomas Zhou <thomaszhou64@gmail.com>
1 parent cfd91a5 commit 79ccfa2

1 file changed

Lines changed: 28 additions & 28 deletions

File tree

go/integTest/cluster_commands_test.go

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2616,10 +2616,13 @@ func (suite *GlideTestSuite) TestScriptKillWithoutRoute() {
26162616
}
26172617

26182618
func (suite *GlideTestSuite) TestScriptKillWithRoute() {
2619-
invokeClient, err := suite.clusterClient(suite.defaultClusterClientConfig())
2620-
require.NoError(suite.T(), err)
26212619
killClient := suite.defaultClusterClient()
26222620

2621+
// Use a longer request timeout so InvokeScript blocks until killed
2622+
invokeConfig := suite.defaultClusterClientConfig().WithRequestTimeout(12 * time.Second)
2623+
invokeClient, err := suite.clusterClient(invokeConfig)
2624+
require.NoError(suite.T(), err)
2625+
26232626
// key for routing to a primary node
26242627
randomKey := uuid.NewString()
26252628
route := options.RouteOption{
@@ -2632,42 +2635,39 @@ func (suite *GlideTestSuite) TestScriptKillWithRoute() {
26322635
assert.True(suite.T(), strings.Contains(strings.ToLower(err.Error()), "notbusy"))
26332636

26342637
// Kill Running Code
2635-
code := CreateLongRunningLuaScript(6, true)
2638+
code := CreateLongRunningLuaScript(10, true)
26362639
script := options.NewScript(code)
26372640

2638-
go invokeClient.InvokeScriptWithRoute(context.Background(), *script, route)
2639-
2640-
timeout := time.After(5 * time.Second)
2641-
ticker := time.NewTicker(100 * time.Millisecond)
2642-
defer ticker.Stop()
2643-
2641+
// Kill in a goroutine - polls until the script is running, matching testFunctionKillNoWrite pattern
2642+
var killErr error
26442643
var result string
2645-
killed := false
2646-
2647-
for !killed {
2648-
select {
2649-
case <-timeout:
2650-
suite.T().Fatal("Timeout: SCRIPT KILL failed to execute in 5 seconds")
2651-
case <-ticker.C:
2652-
result, err = killClient.ScriptKillWithRoute(context.Background(), route)
2653-
if err == nil {
2654-
killed = true
2655-
continue
2656-
}
2644+
go func() {
2645+
timeout := time.After(8 * time.Second)
2646+
ticker := time.NewTicker(500 * time.Millisecond)
2647+
defer ticker.Stop()
26572648

2658-
if !strings.Contains(strings.ToLower(err.Error()), "notbusy") {
2659-
assert.NoError(suite.T(), err)
2660-
killed = true
2649+
for {
2650+
select {
2651+
case <-timeout:
2652+
return
2653+
case <-ticker.C:
2654+
result, killErr = killClient.ScriptKillWithRoute(context.Background(), route)
2655+
if killErr == nil {
2656+
return
2657+
}
26612658
}
26622659
}
2663-
}
2660+
}()
26642661

2665-
assert.NoError(suite.T(), err)
2662+
// Block until script is killed (returns error) - guarantees script is running on server
2663+
_, err = invokeClient.InvokeScriptWithRoute(context.Background(), *script, route)
2664+
assert.Error(suite.T(), err)
2665+
assert.True(suite.T(), strings.Contains(strings.ToLower(err.Error()), "script killed"))
2666+
2667+
assert.NoError(suite.T(), killErr)
26662668
assert.Equal(suite.T(), "OK", result)
26672669
script.Close()
26682670

2669-
time.Sleep(1 * time.Second)
2670-
26712671
// Ensure no script is running at the end
26722672
_, err = killClient.ScriptKillWithRoute(context.Background(), route)
26732673
assert.Error(suite.T(), err)

0 commit comments

Comments
 (0)