Skip to content

Commit 2781637

Browse files
Expose recent backlog task flow on operator metrics
Expose recent backlog task flow on operator metrics
1 parent 169d2ec commit 2781637

4 files changed

Lines changed: 50 additions & 0 deletions

File tree

docs/architecture/rollout-safety.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,7 @@ change.
419419
| `activities` | `retrying`, `oldest_retrying_started_at`, `max_retrying_age_ms` | activity executions currently in the retry window (Pending status with `attempt_count > 0`), the earliest `started_at` among them, and the largest retrying age in milliseconds, mirroring the `tasks.oldest_lease_expired_at` / `max_lease_expired_age_ms` shape on the task path so operators can answer "how long has the worst-case activity been chewing retries?" — the primary retry-rate age indicator on the activity path — from the metric alone without walking `activity_executions` |
420420
| `activities` | `timeout_overdue`, `oldest_timeout_overdue_at`, `max_timeout_overdue_age_ms` | open activity executions whose `schedule_deadline_at` (Pending), `close_deadline_at` (Running), `schedule_to_close_deadline_at`, or `heartbeat_deadline_at` (Running) is `<= $now` and is therefore waiting for `ActivityTimeoutEnforcer` to enforce the timeout, the earliest such expired deadline timestamp across the four enforcement-relevant columns, and the largest overdue age in milliseconds. The `timeout_overdue` predicate mirrors `ActivityTimeoutEnforcer::expiredExecutionIds()` exactly, so the count is the operator-visible view of the same enforcement backlog — the activity-path counterpart of `tasks.lease_expired`. Both surface stuck work that the corresponding sweep has not yet reclaimed; sustained non-zero readings indicate the activity-timeout sweep is lagging or stalled and that worker liveness via heartbeat or start-to-close has stopped on at least one execution. The age data lets operators read "how long has the worst-case activity been past a timeout deadline without enforcement?" — the primary stuck-activity duplicate-risk age indicator on the activity path — from the metric alone without walking `activity_executions` |
421421
| `backlog` | `runnable_tasks`, `delayed_tasks`, `leased_tasks` | authoritative backlog counts |
422+
| `backlog` | `tasks_added_last_minute`, `tasks_dispatched_last_minute` | trailing-60-second queue-flow facts: distinct durable task rows created in the last minute and distinct durable task rows whose latest successful `last_dispatched_at` falls in the last minute. These are intentionally task-row facts, not a transport-attempt counter stream; repeated redispatches of the same durable task collapse to one count because `workflow_tasks` retains only the latest successful dispatch timestamp |
422423
| `backlog` | `unhealthy_tasks`, `repair_needed_runs`, `claim_failed_runs`, `compatibility_blocked_runs` | stuck/blocked roll-ups |
423424
| `backlog` | `oldest_compatibility_blocked_started_at`, `max_compatibility_blocked_age_ms` | earliest wait-start timestamp among compatibility-blocked runs and the largest blocked age in milliseconds, mirroring the `repair.oldest_missing_run_started_at` / `max_missing_run_age_ms` shape so operators can answer "how stale is the worst mixed-build block?" from the metric alone |
424425
| `repair` | `missing_task_candidates`, `selected_missing_task_candidates`, `oldest_missing_run_started_at`, `max_missing_run_age_ms` | stuck-run detectors per `TaskRepairCandidates` |

src/V2/Support/OperatorMetrics.php

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,8 @@ private static function backlogMetrics(CarbonInterface $now, ?string $namespace)
188188
'runnable_tasks' => self::readyDueTasks($now, $namespace),
189189
'delayed_tasks' => self::delayedTasks($now, $namespace),
190190
'leased_tasks' => self::leasedTasks($namespace),
191+
'tasks_added_last_minute' => self::tasksAddedLastMinute($now, $namespace),
192+
'tasks_dispatched_last_minute' => self::tasksDispatchedLastMinute($now, $namespace),
191193
'retrying_activities' => self::retryingActivities($namespace),
192194
'unhealthy_tasks' => self::dispatchFailedTasks($namespace)
193195
+ self::claimFailedTasks($namespace)
@@ -1141,6 +1143,38 @@ private static function leasedTasks(?string $namespace): int
11411143
->count();
11421144
}
11431145

1146+
/**
1147+
* Distinct durable task rows created inside the trailing 60-second window.
1148+
* This is the best stable "tasks added" fact the durable model exposes
1149+
* without inventing a second transport-only counter stream.
1150+
*/
1151+
private static function tasksAddedLastMinute(CarbonInterface $now, ?string $namespace): int
1152+
{
1153+
$windowStart = $now->copy()
1154+
->subMinute();
1155+
1156+
return self::scopedRunModelQuery(self::taskModel(), $namespace)
1157+
->where('created_at', '>=', $windowStart)
1158+
->count();
1159+
}
1160+
1161+
/**
1162+
* Distinct durable task rows whose latest successful dispatch landed
1163+
* inside the trailing 60-second window. Repeated redispatches of the same
1164+
* durable task collapse to one row because `workflow_tasks` retains only
1165+
* the latest successful `last_dispatched_at`.
1166+
*/
1167+
private static function tasksDispatchedLastMinute(CarbonInterface $now, ?string $namespace): int
1168+
{
1169+
$windowStart = $now->copy()
1170+
->subMinute();
1171+
1172+
return self::scopedRunModelQuery(self::taskModel(), $namespace)
1173+
->whereNotNull('last_dispatched_at')
1174+
->where('last_dispatched_at', '>=', $windowStart)
1175+
->count();
1176+
}
1177+
11441178
private static function leaseExpiredTasks(CarbonInterface $now, ?string $namespace): int
11451179
{
11461180
return self::scopedRunModelQuery(self::taskModel(), $namespace)

tests/Feature/V2/V2OperatorMetricsTest.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,8 @@ public function testSnapshotSummarizesDurableBacklogRepairCompatibilityAndWorker
260260
$this->assertSame(4, $snapshot['backlog']['runnable_tasks']);
261261
$this->assertSame(1, $snapshot['backlog']['delayed_tasks']);
262262
$this->assertSame(2, $snapshot['backlog']['leased_tasks']);
263+
$this->assertSame(6, $snapshot['backlog']['tasks_added_last_minute']);
264+
$this->assertSame(2, $snapshot['backlog']['tasks_dispatched_last_minute']);
263265
$this->assertSame(4, $snapshot['backlog']['unhealthy_tasks']);
264266
$this->assertSame(1, $snapshot['backlog']['repair_needed_runs']);
265267
$this->assertSame(1, $snapshot['backlog']['claim_failed_runs']);

tests/Unit/V2/RolloutSafetyDocumentationTest.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ final class RolloutSafetyDocumentationTest extends TestCase
145145
'runnable_tasks',
146146
'delayed_tasks',
147147
'leased_tasks',
148+
'tasks_added_last_minute',
149+
'tasks_dispatched_last_minute',
148150
'unhealthy_tasks',
149151
'repair_needed_runs',
150152
'claim_failed_runs',
@@ -356,6 +358,17 @@ public function testContractDocumentFreezesCompatibilityBlockedAgeRow(): void
356358
);
357359
}
358360

361+
public function testContractDocumentFreezesBacklogRateRow(): void
362+
{
363+
$contents = $this->documentContents();
364+
365+
$this->assertMatchesRegularExpression(
366+
'/\|\s*`backlog`\s*\|[^|]*`tasks_added_last_minute`[^|]*`tasks_dispatched_last_minute`/',
367+
$contents,
368+
'Rollout safety contract must pin the backlog add/dispatch throughput row so operators can read recent queue flow from OperatorMetrics::snapshot() without inferring it from unrelated counters.',
369+
);
370+
}
371+
359372
public function testContractDocumentFreezesLeaseExpiredAgeRow(): void
360373
{
361374
$contents = $this->documentContents();

0 commit comments

Comments
 (0)