Skip to content

Commit dffb926

Browse files
authored
feat(runner): add timeout support (#134)
1 parent 6356129 commit dffb926

13 files changed

Lines changed: 268 additions & 10 deletions

File tree

config.example.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,9 @@ runner:
298298
# Useful for clients like Erigon that need time to complete internal sync pipelines
299299
# after their RPC endpoint becomes available.
300300
# wait_after_rpc_ready: 30s
301+
# Optional: Maximum duration for the test execution phase.
302+
# If exceeded, the run is cancelled with "timeout" status. Partial results are kept.
303+
# run_timeout: 2h
301304
# Optional: Retry engine_newPayload calls when client returns SYNCING status.
302305
# Useful for clients with internal sync pipelines (e.g., Erigon) that may return
303306
# SYNCING while still processing blocks internally.
@@ -443,6 +446,7 @@ runner:
443446
# tmpfs_max_size: "16g"
444447
# wait_after_tcp_drop_connections: "10s"
445448
# wait_after_rpc_ready: 60s # Instance-level override (optional)
449+
# run_timeout: 1h # Instance-level override (optional)
446450
# retry_new_payloads_syncing_state: # Instance-level override (optional)
447451
# enabled: true
448452
# max_retries: 10

docs/configuration.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@ runner:
506506
| `rollback_strategy` | string | `rpc-debug-setHead` | Rollback strategy after each test (see below) |
507507
| `checkpoint_restore_strategy_options` | object | - | Options for the checkpoint-restore rollback strategy (see [Checkpoint Restore Strategy Options](#checkpoint-restore-strategy-options)) |
508508
| `wait_after_rpc_ready` | string | - | Duration to wait after RPC becomes ready (see below) |
509+
| `run_timeout` | string | - | Maximum duration for test execution before the run is timed out (see below) |
509510
| `retry_new_payloads_syncing_state` | object | - | Retry config for SYNCING responses (see below) |
510511
| `resource_limits` | object | - | Container resource constraints (see [Resource Limits](#resource-limits)) |
511512
| `post_test_rpc_calls` | []object | - | Arbitrary RPC calls to execute after each test step (see [Post-Test RPC Calls](#post-test-rpc-calls)) |
@@ -666,6 +667,26 @@ The value is a Go duration string (e.g., `30s`, `1m`, `500ms`). If not set, no a
666667
- When you observe `SYNCING` responses from Engine API calls despite the RPC being available
667668
- When starting from pre-populated data directories where clients may need time to validate state
668669

670+
##### Run Timeout
671+
672+
The `run_timeout` option sets a maximum duration for the test execution phase of a run. If the timeout is exceeded, the run is cancelled with a `timeout` status. Partial results collected before the timeout are still written and published.
673+
674+
```yaml
675+
runner:
676+
client:
677+
config:
678+
run_timeout: 2h
679+
```
680+
681+
The value is a Go duration string (e.g., `30m`, `1h`, `2h30m`). If not set, no timeout is applied.
682+
683+
The timeout covers only the test execution phase — container setup, image pulling, and RPC readiness checks are not included.
684+
685+
**When to use:**
686+
- When running large test suites that may hang or take unexpectedly long
687+
- When you want to enforce a maximum wall-clock time for benchmark runs
688+
- When running in CI/CD environments with time constraints
689+
669690
##### Retry New Payloads Syncing State
670691

671692
When `engine_newPayload` returns a `SYNCING` status, it indicates the client hasn't fully processed the parent block yet. The `retry_new_payloads_syncing_state` option configures automatic retries with exponential backoff.
@@ -833,6 +854,7 @@ runner:
833854
| `rollback_strategy` | string | No | From `runner.client.config` | Instance-specific rollback strategy |
834855
| `checkpoint_restore_strategy_options` | object | No | From `runner.client.config` | Instance-specific checkpoint-restore strategy options (replaces global) |
835856
| `wait_after_rpc_ready` | string | No | From `runner.client.config` | Instance-specific RPC ready wait duration |
857+
| `run_timeout` | string | No | From `runner.client.config` | Instance-specific run timeout duration |
836858
| `retry_new_payloads_syncing_state` | object | No | From `runner.client.config` | Instance-specific retry config for SYNCING responses |
837859
| `resource_limits` | object | No | From `runner.client.config` | Instance-specific resource limits |
838860
| `post_test_rpc_calls` | []object | No | From `runner.client.config` | Instance-specific post-test RPC calls (replaces global) |

pkg/api/indexstore/indexstore.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ func (s *store) ListRunIDs(
327327
}
328328

329329
// terminalStatuses are run statuses that will not change.
330-
var terminalStatuses = []string{"completed", "failed", "cancelled", "container_died"}
330+
var terminalStatuses = []string{"completed", "failed", "cancelled", "container_died", "timeout"}
331331

332332
// ListIncompleteRunIDs returns run IDs where the result has not been indexed
333333
// and the run is still potentially in progress. A run is considered

pkg/config/config.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,7 @@ type ClientDefaults struct {
593593
ResourceLimits *ResourceLimits `yaml:"resource_limits,omitempty" mapstructure:"resource_limits"`
594594
RetryNewPayloadsSyncingState *RetryNewPayloadsSyncingConfig `yaml:"retry_new_payloads_syncing_state,omitempty" mapstructure:"retry_new_payloads_syncing_state"`
595595
WaitAfterRPCReady string `yaml:"wait_after_rpc_ready,omitempty" mapstructure:"wait_after_rpc_ready"`
596+
RunTimeout string `yaml:"run_timeout,omitempty" mapstructure:"run_timeout"`
596597
PostTestRPCCalls []PostTestRPCCall `yaml:"post_test_rpc_calls,omitempty" mapstructure:"post_test_rpc_calls"`
597598
BootstrapFCU *BootstrapFCUConfig `yaml:"bootstrap_fcu,omitempty" mapstructure:"bootstrap_fcu"`
598599
CheckpointRestoreStrategyOptions *CheckpointRestoreStrategyOptions `yaml:"checkpoint_restore_strategy_options,omitempty" mapstructure:"checkpoint_restore_strategy_options"`
@@ -616,6 +617,7 @@ type ClientInstance struct {
616617
ResourceLimits *ResourceLimits `yaml:"resource_limits,omitempty" mapstructure:"resource_limits"`
617618
RetryNewPayloadsSyncingState *RetryNewPayloadsSyncingConfig `yaml:"retry_new_payloads_syncing_state,omitempty" mapstructure:"retry_new_payloads_syncing_state"`
618619
WaitAfterRPCReady string `yaml:"wait_after_rpc_ready,omitempty" mapstructure:"wait_after_rpc_ready"`
620+
RunTimeout string `yaml:"run_timeout,omitempty" mapstructure:"run_timeout"`
619621
PostTestRPCCalls []PostTestRPCCall `yaml:"post_test_rpc_calls,omitempty" mapstructure:"post_test_rpc_calls"`
620622
BootstrapFCU *BootstrapFCUConfig `yaml:"bootstrap_fcu,omitempty" mapstructure:"bootstrap_fcu"`
621623
CheckpointRestoreStrategyOptions *CheckpointRestoreStrategyOptions `yaml:"checkpoint_restore_strategy_options,omitempty" mapstructure:"checkpoint_restore_strategy_options"`
@@ -1038,6 +1040,11 @@ func (c *Config) Validate(opts ...ValidateOpts) error {
10381040
return err
10391041
}
10401042

1043+
// Validate run_timeout settings.
1044+
if err := c.validateRunTimeout(); err != nil {
1045+
return err
1046+
}
1047+
10411048
// Validate post_test_rpc_calls settings.
10421049
if err := c.validatePostTestRPCCalls(); err != nil {
10431050
return err
@@ -1282,6 +1289,29 @@ func (c *Config) GetWaitAfterRPCReady(instance *ClientInstance) time.Duration {
12821289
return d
12831290
}
12841291

1292+
// GetRunTimeout returns the maximum duration for test execution.
1293+
// Instance-level config takes precedence over global defaults. Returns 0 if not set or if the value cannot be parsed as a duration.
1294+
func (c *Config) GetRunTimeout(instance *ClientInstance) time.Duration {
1295+
var s string
1296+
1297+
if instance.RunTimeout != "" {
1298+
s = instance.RunTimeout
1299+
} else {
1300+
s = c.Runner.Client.Config.RunTimeout
1301+
}
1302+
1303+
if s == "" {
1304+
return 0
1305+
}
1306+
1307+
d, err := time.ParseDuration(s)
1308+
if err != nil {
1309+
return 0
1310+
}
1311+
1312+
return d
1313+
}
1314+
12851315
// GetPostTestRPCCalls returns the post-test RPC calls for an instance.
12861316
// Instance-level config completely replaces the global default.
12871317
// Returns nil if not configured at either level.
@@ -1641,6 +1671,25 @@ func (c *Config) validateWaitAfterRPCReady() error {
16411671
return nil
16421672
}
16431673

1674+
// validateRunTimeout validates run_timeout settings.
1675+
func (c *Config) validateRunTimeout() error {
1676+
for _, instance := range c.Runner.Instances {
1677+
s := instance.RunTimeout
1678+
if s == "" {
1679+
s = c.Runner.Client.Config.RunTimeout
1680+
}
1681+
1682+
if s != "" {
1683+
if _, err := time.ParseDuration(s); err != nil {
1684+
return fmt.Errorf("instance %q: invalid run_timeout %q: %w",
1685+
instance.ID, s, err)
1686+
}
1687+
}
1688+
}
1689+
1690+
return nil
1691+
}
1692+
16441693
// validatePostTestRPCCalls validates post_test_rpc_calls settings.
16451694
func (c *Config) validatePostTestRPCCalls() error {
16461695
// Validate global-level calls.

pkg/config/config_test.go

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"os"
77
"path/filepath"
88
"testing"
9+
"time"
910

1011
"github.com/stretchr/testify/assert"
1112
"github.com/stretchr/testify/require"
@@ -2370,3 +2371,125 @@ func TestValidateAPIIndexing(t *testing.T) {
23702371
})
23712372
}
23722373
}
2374+
2375+
func TestGetRunTimeout(t *testing.T) {
2376+
tests := []struct {
2377+
name string
2378+
global string
2379+
instance string
2380+
expected time.Duration
2381+
}{
2382+
{
2383+
name: "empty returns zero",
2384+
global: "",
2385+
instance: "",
2386+
expected: 0,
2387+
},
2388+
{
2389+
name: "global value used",
2390+
global: "30m",
2391+
instance: "",
2392+
expected: 30 * time.Minute,
2393+
},
2394+
{
2395+
name: "instance overrides global",
2396+
global: "30m",
2397+
instance: "1h",
2398+
expected: 1 * time.Hour,
2399+
},
2400+
{
2401+
name: "instance only",
2402+
global: "",
2403+
instance: "45m",
2404+
expected: 45 * time.Minute,
2405+
},
2406+
{
2407+
name: "invalid returns zero",
2408+
global: "not-a-duration",
2409+
instance: "",
2410+
expected: 0,
2411+
},
2412+
}
2413+
2414+
for _, tt := range tests {
2415+
t.Run(tt.name, func(t *testing.T) {
2416+
cfg := &Config{
2417+
Runner: RunnerConfig{
2418+
Client: ClientConfig{
2419+
Config: ClientDefaults{
2420+
RunTimeout: tt.global,
2421+
},
2422+
},
2423+
},
2424+
}
2425+
instance := &ClientInstance{
2426+
RunTimeout: tt.instance,
2427+
}
2428+
assert.Equal(t, tt.expected, cfg.GetRunTimeout(instance))
2429+
})
2430+
}
2431+
}
2432+
2433+
func TestValidateRunTimeout(t *testing.T) {
2434+
tests := []struct {
2435+
name string
2436+
global string
2437+
instance string
2438+
wantErr bool
2439+
errSubstr string
2440+
}{
2441+
{
2442+
name: "empty is valid",
2443+
global: "",
2444+
instance: "",
2445+
},
2446+
{
2447+
name: "valid global",
2448+
global: "30m",
2449+
},
2450+
{
2451+
name: "valid instance",
2452+
instance: "1h",
2453+
},
2454+
{
2455+
name: "invalid global",
2456+
global: "bad",
2457+
wantErr: true,
2458+
errSubstr: "invalid run_timeout",
2459+
},
2460+
{
2461+
name: "invalid instance",
2462+
instance: "bad",
2463+
wantErr: true,
2464+
errSubstr: "invalid run_timeout",
2465+
},
2466+
}
2467+
2468+
for _, tt := range tests {
2469+
t.Run(tt.name, func(t *testing.T) {
2470+
cfg := &Config{
2471+
Runner: RunnerConfig{
2472+
Client: ClientConfig{
2473+
Config: ClientDefaults{
2474+
RunTimeout: tt.global,
2475+
},
2476+
},
2477+
Instances: []ClientInstance{
2478+
{
2479+
ID: "test",
2480+
Client: "geth",
2481+
RunTimeout: tt.instance,
2482+
},
2483+
},
2484+
},
2485+
}
2486+
err := cfg.validateRunTimeout()
2487+
if tt.wantErr {
2488+
require.Error(t, err)
2489+
assert.Contains(t, err.Error(), tt.errSubstr)
2490+
} else {
2491+
require.NoError(t, err)
2492+
}
2493+
})
2494+
}
2495+
}

pkg/runner/runner.go

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ const (
115115
RunStatusFailed = "failed"
116116
RunStatusContainerDied = "container_died"
117117
RunStatusCancelled = "cancelled"
118+
RunStatusTimedOut = "timeout"
118119
)
119120

120121
// SystemInfo contains system hardware and OS information.
@@ -181,6 +182,7 @@ type ResolvedInstance struct {
181182
RollbackStrategy string `json:"rollback_strategy,omitempty"`
182183
DropMemoryCaches string `json:"drop_memory_caches,omitempty"`
183184
WaitAfterRPCReady string `json:"wait_after_rpc_ready,omitempty"`
185+
RunTimeout string `json:"run_timeout,omitempty"`
184186
RetryNewPayloadsSyncingState *config.RetryNewPayloadsSyncingConfig `json:"retry_new_payloads_syncing_state,omitempty"`
185187
ResourceLimits *ResolvedResourceLimits `json:"resource_limits,omitempty"`
186188
PostTestRPCCalls []config.PostTestRPCCall `json:"post_test_rpc_calls,omitempty"`
@@ -1040,6 +1042,16 @@ func (r *runner) runContainerLifecycle(
10401042
}
10411043
}
10421044

1045+
// Resolve run_timeout for config output and timeout enforcement.
1046+
var runTimeoutStr string
1047+
var runTimeout time.Duration
1048+
if r.cfg.FullConfig != nil {
1049+
runTimeout = r.cfg.FullConfig.GetRunTimeout(instance)
1050+
if runTimeout > 0 {
1051+
runTimeoutStr = runTimeout.String()
1052+
}
1053+
}
1054+
10431055
// Write run configuration with resolved values.
10441056
runConfig := &RunConfig{
10451057
Timestamp: params.RunTimestamp,
@@ -1070,6 +1082,7 @@ func (r *runner) runContainerLifecycle(
10701082
}(),
10711083
DropMemoryCaches: dropMemoryCaches,
10721084
WaitAfterRPCReady: waitAfterRPCReadyStr,
1085+
RunTimeout: runTimeoutStr,
10731086
RetryNewPayloadsSyncingState: func() *config.RetryNewPayloadsSyncingConfig {
10741087
if r.cfg.FullConfig != nil {
10751088
return r.cfg.FullConfig.GetRetryNewPayloadsSyncingState(instance)
@@ -1232,8 +1245,19 @@ func (r *runner) runContainerLifecycle(
12321245

12331246
log.Info("Container started")
12341247

1248+
// Apply run timeout if configured.
1249+
testCtx := ctx
1250+
var timeoutCancel context.CancelFunc
1251+
1252+
if runTimeout > 0 {
1253+
testCtx, timeoutCancel = context.WithTimeout(ctx, runTimeout)
1254+
defer timeoutCancel()
1255+
1256+
log.WithField("timeout", runTimeout).Info("Run timeout configured")
1257+
}
1258+
12351259
// Start container death monitoring.
1236-
execCtx, execCancel := context.WithCancel(ctx)
1260+
execCtx, execCancel := context.WithCancel(testCtx)
12371261
defer execCancel()
12381262

12391263
var containerDied bool
@@ -1460,14 +1484,14 @@ func (r *runner) runContainerLifecycle(
14601484
switch rollbackStrategy {
14611485
case config.RollbackStrategyCheckpointRestore:
14621486
result, execErr = r.runTestsWithCheckpointRestore(
1463-
ctx, params, spec, containerID, containerIP,
1487+
testCtx, params, spec, containerID, containerIP,
14641488
dropMemoryCaches, dropCachesPath,
14651489
runResultsDir, &logCancel, &logDone, benchmarkoorLogFile,
14661490
&localCleanupFuncs, localCleanupStarted,
14671491
)
14681492
default:
14691493
result, execErr = r.runTestsWithContainerStrategy(
1470-
ctx, params, spec, containerID, containerIP,
1494+
testCtx, params, spec, containerID, containerIP,
14711495
rollbackStrategy, dropMemoryCaches, dropCachesPath,
14721496
runResultsDir, &logCancel, &logDone, benchmarkoorLogFile,
14731497
&localCleanupFuncs, localCleanupStarted,
@@ -1569,8 +1593,13 @@ func (r *runner) runContainerLifecycle(
15691593
}
15701594

15711595
// Determine final run status (don't overwrite if already set by executor).
1596+
// Timeout is checked first because when the deadline fires, the context
1597+
// cancellation can cause the container to stop, which sets containerDied.
15721598
mu.Lock()
1573-
if containerDied {
1599+
if timeoutCancel != nil && testCtx.Err() == context.DeadlineExceeded {
1600+
runConfig.Status = RunStatusTimedOut
1601+
runConfig.TerminationReason = fmt.Sprintf("the run_timeout of %s was reached", runTimeout)
1602+
} else if containerDied {
15741603
runConfig.Status = RunStatusContainerDied
15751604
runConfig.TerminationReason = "container exited during test execution"
15761605
runConfig.ContainerExitCode = containerExitCode

ui/src/api/types.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ export interface Index {
55
}
66

77
// Run status type
8-
export type RunStatus = 'completed' | 'container_died' | 'cancelled'
8+
export type RunStatus = 'completed' | 'container_died' | 'cancelled' | 'timeout'
99

1010
export interface IndexEntry {
1111
run_id: string
@@ -235,6 +235,7 @@ export interface InstanceConfig {
235235
rollback_strategy?: string
236236
drop_memory_caches?: string
237237
wait_after_rpc_ready?: string
238+
run_timeout?: string
238239
retry_new_payloads_syncing_state?: RetryNewPayloadsSyncingConfig
239240
resource_limits?: ResourceLimitsConfig
240241
post_test_rpc_calls?: PostTestRPCCallConfig[]

0 commit comments

Comments
 (0)