Skip to content

Commit 149c044

Browse files
aster-voidclaude
andcommitted
refactor(job): remove retry feature entirely
Retry logic should be handled in user space rather than in rollcron. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c578b79 commit 149c044

File tree

4 files changed

+28
-279
lines changed

4 files changed

+28
-279
lines changed

CLAUDE.md

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ src/
1717
│ └── job/ # Job Actor - single job control
1818
│ ├── mod.rs # Actor definition, state machine
1919
│ ├── tick.rs # cron schedule evaluation
20-
│ └── executor.rs # command execution, retry, timeout
20+
│ └── executor.rs # command execution, timeout
2121
├── config.rs # YAML config parsing, Job struct
2222
├── git.rs # Git operations (clone, pull, archive)
2323
├── env.rs # Environment variable handling
@@ -53,7 +53,6 @@ enum RunConfigRaw {
5353
sh: String,
5454
timeout: String, // Default: "1h"
5555
concurrency: Concurrency,
56-
retry: Option<RetryConfigRaw>,
5756
working_dir: Option<String>,
5857
env_file: Option<String>,
5958
env: Option<HashMap<String, String>>,
@@ -98,7 +97,6 @@ struct Job {
9897
command: String, // From run.sh
9998
timeout: Duration, // From run.timeout
10099
concurrency: Concurrency,
101-
retry: Option<RetryConfig>,
102100
working_dir: Option<String>, // run.working_dir || job.working_dir
103101
log_file: Option<String>, // From log.file
104102
log_max_size: u64, // From log.max_size
@@ -114,13 +112,6 @@ struct WebhookConfig {
114112
url: String, // Webhook URL (supports $ENV_VAR expansion)
115113
}
116114

117-
struct RetryConfig {
118-
max: u32, // Max retry attempts
119-
delay: Duration, // Initial delay (exponential backoff)
120-
jitter: Option<Duration>, // Random variation added to retry delay (0 to jitter)
121-
// Auto-inferred as 25% of delay when not set
122-
}
123-
124115
struct RunnerConfig {
125116
timezone: TimezoneConfig,
126117
env_file: Option<String>, // Path to .env file (relative to repo root)
@@ -194,8 +185,7 @@ jobs:
194185
name: "Display Name"
195186
schedule: { cron: "*/5 * * * *", timezone: Asia/Tokyo }
196187
build: { sh: cargo build, timeout: 30m, working_dir: ./subdir }
197-
run: { sh: ./app, timeout: 10s, concurrency: skip, working_dir: ./subdir,
198-
retry: { max: 3, delay: 1s, jitter: 500ms } }
188+
run: { sh: ./app, timeout: 10s, concurrency: skip, working_dir: ./subdir }
199189
log: { file: output.log, max_size: 10M }
200190
working_dir: ./subdir
201191
env_file: .env
@@ -260,7 +250,7 @@ mise exec -- cargo test # Run tests
260250
### Job Execution
261251
1. Each job calculates next occurrence and sleeps until scheduled time
262252
2. When scheduled time arrives: spawn task in run/ directory with timeout
263-
3. On failure: apply exponential backoff + retry jitter before retry
253+
3. On failure: send webhook notification
264254
4. After job completes: try to copy pending build if any
265255

266256
### Shutdown (Ctrl+C)

README.md

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,6 @@ jobs:
121121
timeout: 5m
122122
concurrency: skip
123123
working_dir: ./dist
124-
retry:
125-
max: 3
126-
delay: 5s
127-
jitter: 1s
128124
env_file: .env.run
129125
env:
130126
NODE_ENV: production
@@ -239,14 +235,6 @@ Full form:
239235
| `env_file` | string, optional | - | Run-specific .env file |
240236
| `env` | map, optional | - | Run-specific environment variables |
241237

242-
#### `jobs.<job-id>.run.retry` (optional)
243-
244-
| Field | Type | Default | Description |
245-
|-------|------|---------|-------------|
246-
| `max` | int | **required** | Max retry attempts (must be ≥ 1) |
247-
| `delay` | duration, optional | `1s` | Initial delay (doubles each retry) |
248-
| `jitter` | duration, optional | 25% of delay | Random variation added to delay |
249-
250238
#### `jobs.<job-id>.log` (optional)
251239

252240
Shorthand: `log: "output.log"`

src/actor/job/executor.rs

Lines changed: 23 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,17 @@
11
use chrono::{Local, Utc};
2-
use rand::Rng;
32
use std::collections::HashMap;
43
use std::fs::{self, File, OpenOptions};
54
use std::io::Write;
65
use std::path::{Path, PathBuf};
76
use std::time::{Duration, Instant};
87
use tokio::process::Command;
9-
use tokio::time::sleep;
108
use tracing::{debug, error, info, warn};
119

12-
use crate::config::{Job, RetryConfig, RunnerConfig, TimezoneConfig};
10+
use crate::config::{Job, RunnerConfig, TimezoneConfig};
1311
use crate::env;
1412
use crate::git;
1513
use crate::webhook::{self, BuildFailure, JobFailure};
1614

17-
/// Default jitter ratio when not explicitly configured (25% of base delay)
18-
const AUTO_JITTER_RATIO: u32 = 25;
19-
2015
/// Grace period to wait after SIGTERM before sending SIGKILL
2116
const GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(10);
2217

@@ -329,79 +324,47 @@ pub async fn execute_job(job: &Job, sot_path: &Path, runner: &RunnerConfig) -> b
329324
.as_ref()
330325
.and_then(|p| create_log_file(&job_dir, p, job.log_max_size));
331326

332-
let max_attempts = job.retry.as_ref().map(|r| r.max + 1).unwrap_or(1);
333-
let mut last_result: Option<CommandResult> = None;
334-
335-
for attempt in 0..max_attempts {
336-
if attempt > 0 {
337-
if let Some(retry) = job.retry.as_ref() {
338-
let delay = calculate_backoff(retry, attempt - 1);
339-
info!(
340-
target: "rollcron::job",
341-
job_id = %job.id,
342-
attempt,
343-
max_retries = max_attempts - 1,
344-
delay = ?delay,
345-
"Retrying"
346-
);
347-
sleep(delay).await;
348-
}
349-
}
350-
351-
info!(
352-
target: "rollcron::job",
353-
job_id = %job.id,
354-
name = %job.name,
355-
command = %job.command,
356-
"Starting job"
357-
);
358-
359-
if let Some(ref mut file) = log_file {
360-
let marker = if attempt > 0 {
361-
format!("Job started (retry {}/{})", attempt, max_attempts - 1)
362-
} else {
363-
"Job started".to_string()
364-
};
365-
write_log_marker(file, &runner.timezone, job.timezone.as_ref(), &marker);
366-
}
367-
368-
let start_time = Instant::now();
369-
let result = run_command(job, &work_dir, sot_path, runner).await;
370-
let duration = start_time.elapsed();
371-
let success = handle_result(job, &result, log_file.as_mut(), &runner.timezone, duration);
327+
info!(
328+
target: "rollcron::job",
329+
job_id = %job.id,
330+
name = %job.name,
331+
command = %job.command,
332+
"Starting job"
333+
);
372334

373-
if success {
374-
return true;
375-
}
335+
if let Some(ref mut file) = log_file {
336+
write_log_marker(file, &runner.timezone, job.timezone.as_ref(), "Job started");
337+
}
376338

377-
last_result = Some(result);
339+
let start_time = Instant::now();
340+
let result = run_command(job, &work_dir, sot_path, runner).await;
341+
let duration = start_time.elapsed();
342+
let success = handle_result(job, &result, log_file.as_mut(), &runner.timezone, duration);
378343

379-
if attempt + 1 < max_attempts {
380-
debug!(target: "rollcron::job", job_id = %job.id, "Will retry...");
381-
}
344+
if success {
345+
return true;
382346
}
383347

384-
// All retries exhausted - send webhook notifications if configured
348+
// Job failed - send webhook notifications if configured
385349
if !job.webhook.is_empty() {
386-
let (error, stderr) = match &last_result {
387-
Some(CommandResult::Completed(output)) => {
350+
let (error, stderr) = match &result {
351+
CommandResult::Completed(output) => {
388352
let err = format!("exit code {:?}", output.status.code());
389353
let stderr = String::from_utf8_lossy(&output.stderr).to_string();
390354
(err, stderr)
391355
}
392-
Some(CommandResult::ExecError(e)) => (format!("exec error: {}", e), String::new()),
393-
Some(CommandResult::Timeout) => {
356+
CommandResult::ExecError(e) => (format!("exec error: {}", e), String::new()),
357+
CommandResult::Timeout => {
394358
(format!("timeout after {:?}", job.timeout), String::new())
395359
}
396-
None => ("unknown error".to_string(), String::new()),
397360
};
398361

399362
let failure = JobFailure {
400363
job_id: &job.id,
401364
job_name: &job.name,
402365
error,
403366
stderr,
404-
attempts: max_attempts,
367+
attempts: 1,
405368
};
406369

407370
let runner_env = load_runner_env_vars(sot_path, runner);
@@ -714,27 +677,6 @@ fn handle_result(job: &Job, result: &CommandResult, log_file: Option<&mut File>,
714677
}
715678
}
716679

717-
// === Backoff ===
718-
719-
fn calculate_backoff(retry: &RetryConfig, attempt: u32) -> Duration {
720-
let base_delay = retry.delay.saturating_mul(2u32.saturating_pow(attempt));
721-
let jitter_max =
722-
retry.jitter.unwrap_or_else(|| retry.delay.saturating_mul(AUTO_JITTER_RATIO) / 100);
723-
base_delay.saturating_add(generate_jitter(jitter_max))
724-
}
725-
726-
fn generate_jitter(max: Duration) -> Duration {
727-
if max.is_zero() {
728-
return Duration::ZERO;
729-
}
730-
let millis = max.as_millis();
731-
if millis == 0 {
732-
return Duration::ZERO;
733-
}
734-
let jitter_millis = rand::thread_rng().gen_range(0..=millis);
735-
Duration::from_millis(jitter_millis as u64)
736-
}
737-
738680
// === Logging ===
739681

740682
fn rotate_log_file(path: &Path, max_size: u64) {
@@ -819,7 +761,6 @@ mod tests {
819761
command: cmd.to_string(),
820762
timeout: Duration::from_secs(timeout_secs),
821763
concurrency: Concurrency::Skip,
822-
retry: None,
823764
working_dir: None,
824765
enabled: true,
825766
timezone: None,
@@ -858,37 +799,6 @@ mod tests {
858799
execute_job(&job, &dir.path().to_path_buf(), &runner).await;
859800
}
860801

861-
#[test]
862-
fn exponential_backoff_calculation() {
863-
let retry = RetryConfig {
864-
max: 5,
865-
delay: Duration::from_secs(1),
866-
jitter: None,
867-
};
868-
let backoff_0 = calculate_backoff(&retry, 0);
869-
assert!(backoff_0 >= Duration::from_secs(1));
870-
assert!(backoff_0 <= Duration::from_millis(1250));
871-
872-
let backoff_1 = calculate_backoff(&retry, 1);
873-
assert!(backoff_1 >= Duration::from_secs(2));
874-
assert!(backoff_1 <= Duration::from_millis(2250));
875-
}
876-
877-
#[test]
878-
fn generate_jitter_bounds() {
879-
let max = Duration::from_millis(100);
880-
for _ in 0..10 {
881-
let jitter = generate_jitter(max);
882-
assert!(jitter <= max);
883-
}
884-
}
885-
886-
#[test]
887-
fn generate_jitter_zero() {
888-
let jitter = generate_jitter(Duration::ZERO);
889-
assert_eq!(jitter, Duration::ZERO);
890-
}
891-
892802
#[test]
893803
fn format_duration_milliseconds() {
894804
assert_eq!(format_duration(Duration::from_millis(0)), "0ms");

0 commit comments

Comments
 (0)