Skip to content

Commit 4451698

Browse files
Surface retrying activity age on operator metrics
Freezes `operator_metrics.activities.oldest_retrying_started_at` (ISO-8601 or null) and `operator_metrics.activities.max_retrying_age_ms` (integer ms) on `OperatorMetrics::snapshot()`. The pair mirrors the existing task-path age signals (`tasks.oldest_lease_expired_at` / `max_lease_expired_age_ms`, `tasks.oldest_dispatch_overdue_since` / `max_dispatch_overdue_age_ms`, `tasks.oldest_claim_failed_at` / `max_claim_failed_age_ms`) but on the activity path, so operators can read "how long has the worst-case activity been chewing retries?" — the primary retry-rate age indicator on the activity path — from the metric alone without walking `activity_executions`. The retry-window predicate matches `retryingActivities()` exactly: Pending status with `attempt_count > 0`. Activities on their first attempt (`attempt_count = 0`), Running attempts, and closed (Completed/Failed/Cancelled) executions are not counted, so the signal isolates the retry-backoff cohort from in-flight first attempts and from completed work. Pins the row on `docs/architecture/rollout-safety.md` and adds a retrying-activity-age bullet, guarded by `RolloutSafetyDocumentationTest::testContractDocumentFreezesRetryingActivityAgeRow`.
1 parent 6089777 commit 4451698

4 files changed

Lines changed: 193 additions & 3 deletions

File tree

docs/architecture/rollout-safety.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ change.
414414
| `tasks` | `oldest_dispatch_overdue_since`, `max_dispatch_overdue_age_ms` | earliest `COALESCE(last_dispatched_at, created_at)` among dispatch-overdue tasks — the timestamp the worst-case ready-but-unclaimed task has been waiting for a successful dispatch wake since (either its last attempted dispatch that didn't stick or its creation time if it was never dispatched) — and the largest age in milliseconds, mirroring the `oldest_ready_due_at` / `max_ready_due_age_ms` shape so operators can read wake-latency ("how long has the oldest ready-but-unclaimed task been waiting for a working dispatch wake?") from the metric alone without walking `workflow_tasks` |
415415
| `tasks` | `oldest_claim_failed_at`, `max_claim_failed_age_ms` | earliest `last_claim_failed_at` among claim-failed tasks (Ready tasks whose most recent claim attempt recorded an uncleared `last_claim_error`) and the largest claim-failed age in milliseconds, mirroring the `oldest_dispatch_overdue_since` / `max_dispatch_overdue_age_ms` shape for the dispatch path so operators can read "how long has the worst-case task been sitting with an uncleared claim error?" — the primary lease-conflict and duplicate-risk age indicator for the claim path — from the metric alone without walking `workflow_tasks` |
416416
| `tasks` | `unhealthy` | sum of transport failure and lease expiry counts (the primary duplicate-risk indicator) |
417+
| `activities` | `retrying`, `oldest_retrying_started_at`, `max_retrying_age_ms` | activity executions currently in the retry window (Pending status with `attempt_count > 0`), the earliest `started_at` among them, and the largest retrying age in milliseconds, mirroring the `tasks.oldest_lease_expired_at` / `max_lease_expired_age_ms` shape on the task path so operators can answer "how long has the worst-case activity been chewing retries?" — the primary retry-rate age indicator on the activity path — from the metric alone without walking `activity_executions` |
417418
| `backlog` | `runnable_tasks`, `delayed_tasks`, `leased_tasks` | authoritative backlog counts |
418419
| `backlog` | `unhealthy_tasks`, `repair_needed_runs`, `claim_failed_runs`, `compatibility_blocked_runs` | stuck/blocked roll-ups |
419420
| `backlog` | `oldest_compatibility_blocked_started_at`, `max_compatibility_blocked_age_ms` | earliest wait-start timestamp among compatibility-blocked runs and the largest blocked age in milliseconds, mirroring the `repair.oldest_missing_run_started_at` / `max_missing_run_age_ms` shape so operators can answer "how stale is the worst mixed-build block?" from the metric alone |

src/V2/Support/OperatorMetrics.php

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public static function snapshot(?CarbonInterface $now = null, ?string $namespace
4242
'generated_at' => $now->toJSON(),
4343
'runs' => self::runMetrics($now, $namespace),
4444
'tasks' => self::taskMetrics($now, $namespace),
45-
'activities' => self::activityMetrics($namespace),
45+
'activities' => self::activityMetrics($now, $namespace),
4646
'backlog' => self::backlogMetrics($now, $namespace),
4747
'repair' => $namespace === null ? TaskRepairCandidates::snapshot($now) : self::emptyRepairSnapshot(),
4848
'starts' => self::startMetrics($now, $namespace),
@@ -186,10 +186,12 @@ private static function backlogMetrics(CarbonInterface $now, ?string $namespace)
186186
}
187187

188188
/**
189-
* @return array<string, int>
189+
* @return array<string, int|string|null>
190190
*/
191-
private static function activityMetrics(?string $namespace): array
191+
private static function activityMetrics(CarbonInterface $now, ?string $namespace): array
192192
{
193+
$oldestRetryingStartedAt = self::oldestRetryingActivityStartedAt($namespace);
194+
193195
return [
194196
'open' => self::scopedRunModelQuery(self::activityExecutionModel(), $namespace)
195197
->whereIn('status', [ActivityStatus::Pending->value, ActivityStatus::Running->value])
@@ -201,6 +203,10 @@ private static function activityMetrics(?string $namespace): array
201203
->where('status', ActivityStatus::Running->value)
202204
->count(),
203205
'retrying' => self::retryingActivities($namespace),
206+
'oldest_retrying_started_at' => $oldestRetryingStartedAt?->toJSON(),
207+
'max_retrying_age_ms' => $oldestRetryingStartedAt === null
208+
? 0
209+
: (int) $oldestRetryingStartedAt->diffInMilliseconds($now),
204210
'failed_attempts' => self::scopedRunModelQuery(self::activityAttemptModel(), $namespace)
205211
->where('status', ActivityAttemptStatus::Failed->value)
206212
->count(),
@@ -780,6 +786,34 @@ private static function retryingActivities(?string $namespace): int
780786
->count();
781787
}
782788

789+
/**
790+
* Earliest `started_at` across activity executions currently in the
791+
* retry window — Pending status with `attempt_count > 0`. Rollout-safety
792+
* surfaces this alongside `activities.retrying` so operators can read
793+
* "how long has the worst-case activity been chewing retries?" — the
794+
* primary retry-rate age indicator on the activity path — from the
795+
* metric alone, mirroring the `tasks.oldest_lease_expired_at` /
796+
* `max_lease_expired_age_ms` shape on the task path. The retrying
797+
* predicate matches `retryingActivities()` exactly so the two keys
798+
* stay aligned.
799+
*/
800+
private static function oldestRetryingActivityStartedAt(?string $namespace): ?CarbonInterface
801+
{
802+
/** @var ActivityExecution|null $execution */
803+
$execution = self::scopedRunModelQuery(self::activityExecutionModel(), $namespace)
804+
->where('status', ActivityStatus::Pending->value)
805+
->where('attempt_count', '>', 0)
806+
->whereNotNull('started_at')
807+
->orderBy('started_at')
808+
->first();
809+
810+
if (! $execution instanceof ActivityExecution) {
811+
return null;
812+
}
813+
814+
return $execution->started_at;
815+
}
816+
783817
private static function pendingStartRuns(?string $namespace): int
784818
{
785819
return self::summaryQuery($namespace)

tests/Feature/V2/V2OperatorMetricsTest.php

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77
use Illuminate\Support\Carbon;
88
use Illuminate\Support\Facades\Schema;
99
use Tests\TestCase;
10+
use Workflow\V2\Enums\ActivityStatus;
1011
use Workflow\V2\Enums\CommandOutcome;
1112
use Workflow\V2\Enums\CommandStatus;
1213
use Workflow\V2\Enums\CommandType;
1314
use Workflow\V2\Enums\HistoryEventType;
1415
use Workflow\V2\Enums\ScheduleStatus;
1516
use Workflow\V2\Enums\TaskStatus;
1617
use Workflow\V2\Enums\TaskType;
18+
use Workflow\V2\Models\ActivityExecution;
1719
use Workflow\V2\Models\WorkflowCommand;
1820
use Workflow\V2\Models\WorkflowHistoryEvent;
1921
use Workflow\V2\Models\WorkflowInstance;
@@ -1320,6 +1322,125 @@ public function testSnapshotReportsRunWaitAgeAsZeroWhenNoRunsAreWaiting(): void
13201322
$this->assertSame(0, $snapshot['runs']['max_wait_age_ms']);
13211323
}
13221324

1325+
public function testSnapshotSurfacesRetryingActivityAgeFromOldestRetryingActivity(): void
1326+
{
1327+
Carbon::setTestNow('2026-04-09 12:00:00');
1328+
$this->beforeApplicationDestroyed(static function (): void {
1329+
Carbon::setTestNow();
1330+
});
1331+
1332+
$now = Carbon::now();
1333+
1334+
$run = $this->createRunWithSummary(
1335+
instanceId: 'retrying-activity-instance',
1336+
runId: '01JRETRYACTRUN00000000001',
1337+
status: 'running',
1338+
statusBucket: 'running',
1339+
livenessState: 'running',
1340+
);
1341+
1342+
// Worst-case: Pending activity in retry backoff for 90s (started_at 90s ago, attempt_count = 2).
1343+
$this->createActivityExecution($run, '01JRETRYACTEXEC0000000001', [
1344+
'sequence' => 1,
1345+
'status' => ActivityStatus::Pending->value,
1346+
'attempt_count' => 2,
1347+
'started_at' => $now->copy()
1348+
->subSeconds(90),
1349+
]);
1350+
1351+
// Newer Pending retry — attempt_count > 0, but started 30s ago, so it
1352+
// must not win the "oldest retrying since".
1353+
$this->createActivityExecution($run, '01JRETRYACTEXEC0000000002', [
1354+
'sequence' => 2,
1355+
'status' => ActivityStatus::Pending->value,
1356+
'attempt_count' => 1,
1357+
'started_at' => $now->copy()
1358+
->subSeconds(30),
1359+
]);
1360+
1361+
// Pending first attempt — attempt_count = 0, NOT counted as retrying
1362+
// even though it has been waiting longer than 90s.
1363+
$this->createActivityExecution($run, '01JRETRYACTEXEC0000000003', [
1364+
'sequence' => 3,
1365+
'status' => ActivityStatus::Pending->value,
1366+
'attempt_count' => 0,
1367+
'started_at' => $now->copy()
1368+
->subSeconds(120),
1369+
]);
1370+
1371+
// Running attempt — `retrying` predicate excludes Running so this is
1372+
// not counted, even with attempt_count > 0.
1373+
$this->createActivityExecution($run, '01JRETRYACTEXEC0000000004', [
1374+
'sequence' => 4,
1375+
'status' => ActivityStatus::Running->value,
1376+
'attempt_count' => 3,
1377+
'started_at' => $now->copy()
1378+
->subSeconds(150),
1379+
]);
1380+
1381+
// Closed activity — Completed executions are not retrying.
1382+
$this->createActivityExecution($run, '01JRETRYACTEXEC0000000005', [
1383+
'sequence' => 5,
1384+
'status' => ActivityStatus::Completed->value,
1385+
'attempt_count' => 4,
1386+
'started_at' => $now->copy()
1387+
->subSeconds(300),
1388+
'closed_at' => $now->copy()
1389+
->subSeconds(60),
1390+
]);
1391+
1392+
$snapshot = OperatorMetrics::snapshot($now);
1393+
1394+
$expectedOldestRetryingStartedAt = $now->copy()
1395+
->subSeconds(90)
1396+
->toJSON();
1397+
1398+
$this->assertSame(2, $snapshot['activities']['retrying']);
1399+
$this->assertSame($expectedOldestRetryingStartedAt, $snapshot['activities']['oldest_retrying_started_at']);
1400+
$this->assertSame(90 * 1000, $snapshot['activities']['max_retrying_age_ms']);
1401+
}
1402+
1403+
public function testSnapshotReportsRetryingActivityAgeAsZeroWhenNoActivitiesAreRetrying(): void
1404+
{
1405+
Carbon::setTestNow('2026-04-09 12:00:00');
1406+
$this->beforeApplicationDestroyed(static function (): void {
1407+
Carbon::setTestNow();
1408+
});
1409+
1410+
$now = Carbon::now();
1411+
1412+
$run = $this->createRunWithSummary(
1413+
instanceId: 'retrying-none-instance',
1414+
runId: '01JRETRYNONERUN0000000001',
1415+
status: 'running',
1416+
statusBucket: 'running',
1417+
livenessState: 'running',
1418+
);
1419+
1420+
// Pending first attempt — attempt_count = 0, not retrying.
1421+
$this->createActivityExecution($run, '01JRETRYNONEEXEC000000001', [
1422+
'sequence' => 1,
1423+
'status' => ActivityStatus::Pending->value,
1424+
'attempt_count' => 0,
1425+
'started_at' => null,
1426+
]);
1427+
1428+
// Running attempt — `retrying` predicate excludes Running.
1429+
$this->createActivityExecution($run, '01JRETRYNONEEXEC000000002', [
1430+
'sequence' => 2,
1431+
'status' => ActivityStatus::Running->value,
1432+
'attempt_count' => 2,
1433+
'started_at' => $now->copy()
1434+
->subSeconds(45),
1435+
]);
1436+
1437+
$snapshot = OperatorMetrics::snapshot($now);
1438+
1439+
$this->assertSame(0, $snapshot['activities']['retrying']);
1440+
$this->assertNull($snapshot['activities']['oldest_retrying_started_at']);
1441+
$this->assertSame(0, $snapshot['activities']['max_retrying_age_ms']);
1442+
}
1443+
13231444
public function testSnapshotReportsInWorkerMatchingRoleShapeByDefault(): void
13241445
{
13251446
config()->set('workflows.v2.matching_role.queue_wake_enabled', true);
@@ -1499,6 +1620,27 @@ private function createStartCommand(WorkflowRun $run, Carbon $acceptedAt): Workf
14991620
]);
15001621
}
15011622

1623+
/**
1624+
* @param array<string, mixed> $attributes
1625+
*/
1626+
private function createActivityExecution(WorkflowRun $run, string $id, array $attributes = []): ActivityExecution
1627+
{
1628+
/** @var ActivityExecution $execution */
1629+
$execution = ActivityExecution::query()->create(array_merge([
1630+
'id' => $id,
1631+
'workflow_run_id' => $run->id,
1632+
'sequence' => 1,
1633+
'activity_class' => 'WorkflowActivityClass',
1634+
'activity_type' => 'workflow.activity.test',
1635+
'status' => ActivityStatus::Pending->value,
1636+
'connection' => 'redis',
1637+
'queue' => 'default',
1638+
'attempt_count' => 0,
1639+
], $attributes));
1640+
1641+
return $execution;
1642+
}
1643+
15021644
/**
15031645
* @param array<string, mixed> $attributes
15041646
*/

tests/Unit/V2/RolloutSafetyDocumentationTest.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ final class RolloutSafetyDocumentationTest extends TestCase
130130
'max_ready_due_age_ms',
131131
'oldest_claim_failed_at',
132132
'max_claim_failed_age_ms',
133+
'oldest_retrying_started_at',
134+
'max_retrying_age_ms',
133135
'unhealthy',
134136
'runnable_tasks',
135137
'delayed_tasks',
@@ -385,6 +387,17 @@ public function testContractDocumentFreezesClaimFailedAgeRow(): void
385387
);
386388
}
387389

390+
public function testContractDocumentFreezesRetryingActivityAgeRow(): void
391+
{
392+
$contents = $this->documentContents();
393+
394+
$this->assertMatchesRegularExpression(
395+
'/\|\s*`activities`\s*\|[^|]*`retrying`[^|]*`oldest_retrying_started_at`[^|]*`max_retrying_age_ms`/',
396+
$contents,
397+
'Rollout safety contract must pin the activities retrying-age row so operators can read "how long has the worst-case activity been chewing retries?" from OperatorMetrics::snapshot() without walking activity_executions.',
398+
);
399+
}
400+
388401
public function testContractDocumentFreezesHealthCheckNames(): void
389402
{
390403
$contents = $this->documentContents();

0 commit comments

Comments
 (0)