Skip to content

Commit 1f3e7c7

Browse files
GitHub #584: server + workflow + waterline: v2 architecture Phase 6: rollout safety enforcement and coordination health (#506)
1 parent c2a0671 commit 1f3e7c7

4 files changed

Lines changed: 143 additions & 10 deletions

File tree

docs/architecture/rollout-safety.md

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -563,12 +563,26 @@ serving the snapshot — without re-aggregating metrics across the
563563
`matching_role` block. `wake_owner` is `worker_loop` on nodes that
564564
still run the in-worker broad-poll wake and `dedicated_repair_pass`
565565
on nodes that have opted out so the broad sweep runs as
566-
`php artisan workflow:v2:repair-pass` instead. Adding a new check
567-
is allowed; renaming or removing one is a protocol-level change.
568-
The canonical check names above match the strings emitted by
569-
`Workflow\V2\Support\HealthCheck::snapshot()` verbatim, and a runtime
570-
pinning test in the workflow package asserts the match so doc/code
571-
drift fails loudly.
566+
`php artisan workflow:v2:repair-pass` instead. The check also forwards
567+
`required_compatibility`, `active_workers`,
568+
`active_workers_supporting_required`, and the convenience boolean
569+
`fleet_supports_required` from the `workers` block so operators can
570+
tell whether a compatibility block reflects "no live worker advertises
571+
the required marker" (the canonical fail-closed admission case
572+
escalated under `DW_V2_FLEET_VALIDATION_MODE=fail`) vs a transient
573+
compatibility race that still has fleet coverage — without joining
574+
the `worker_compatibility` check separately.
575+
`fleet_supports_required` is `true` when no marker is required (the
576+
unscoped case) or at least one heartbeat advertises the required
577+
marker. When `compatibility_blocked_runs > 0` and
578+
`fleet_supports_required = false`, the check's `message` names the
579+
missing-coverage case explicitly so the operator action ("ensure a
580+
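worker advertising the required marker rolls in") is unambiguous. This applies when the compatibility block is the only routing signal; if dispatch lag or uncleared claim failures are visible at the same time, the generic multi-signal degraded message is emitted instead, and `fleet_supports_required` in the check data remains the authoritative indicator.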
581+
Adding a new check is allowed; renaming or removing one is a
582+
protocol-level change. The canonical check names above match the
583+
strings emitted by `Workflow\V2\Support\HealthCheck::snapshot()`
584+
verbatim, and a runtime pinning test in the workflow package asserts
585+
the match so doc/code drift fails loudly.
572586

573587
### Queue visibility
574588

src/V2/Support/HealthCheck.php

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -351,24 +351,36 @@ private static function routingHealthCheck(
351351
? $matchingRole['task_dispatch_mode']
352352
: 'queue';
353353
$activeWorkerScopes = self::integer($workers['active_worker_scopes'] ?? 0);
354+
$requiredCompatibility = is_string($workers['required_compatibility'] ?? null)
355+
? $workers['required_compatibility']
356+
: null;
357+
$activeWorkers = self::integer($workers['active_workers'] ?? 0);
358+
$activeWorkersSupportingRequired = self::integer($workers['active_workers_supporting_required'] ?? 0);
359+
// True when no marker is required (unscoped) or at least one heartbeat advertises it.
360+
$fleetSupportsRequired = $requiredCompatibility === null
361+
|| $activeWorkersSupportingRequired > 0;
362+
363+
$compatibilityBlocked = $compatibilityBlockedRuns > 0;
364+
$compatibilityBlockedWithoutFleetCoverage = $compatibilityBlocked && ! $fleetSupportsRequired;
354365

355366
$message = 'No routing drains, compatibility blocks, or uncleared claim failures are currently projected.';
356-
if ($compatibilityBlockedRuns > 0 || $dispatchOverdueTasks > 0 || $claimFailedTasks > 0) {
357-
$signalCount = ($compatibilityBlockedRuns > 0 ? 1 : 0)
367+
if ($compatibilityBlocked || $dispatchOverdueTasks > 0 || $claimFailedTasks > 0) {
368+
$signalCount = ($compatibilityBlocked ? 1 : 0)
358369
+ ($dispatchOverdueTasks > 0 ? 1 : 0)
359370
+ ($claimFailedTasks > 0 ? 1 : 0);
360371

361372
$message = match (true) {
362373
$signalCount > 1 => 'Routing health is degraded: compatibility blocks, dispatch lag, or uncleared claim failures are visible in durable state.',
363-
$compatibilityBlockedRuns > 0 => 'One or more runs are ready but waiting for a compatible worker in the active fleet.',
374+
$compatibilityBlockedWithoutFleetCoverage => 'One or more runs are blocked because no active worker heartbeat advertises the required compatibility marker.',
375+
$compatibilityBlocked => 'One or more runs are ready but waiting for a compatible worker in the active fleet.',
364376
$dispatchOverdueTasks > 0 => 'One or more ready tasks have waited past the redispatch window without a successful dispatch wake.',
365377
default => 'One or more ready tasks still carry an uncleared claim failure.',
366378
};
367379
}
368380

369381
return self::check(
370382
'routing_health',
371-
($compatibilityBlockedRuns > 0 || $dispatchOverdueTasks > 0 || $claimFailedTasks > 0) ? 'warning' : 'ok',
383+
($compatibilityBlocked || $dispatchOverdueTasks > 0 || $claimFailedTasks > 0) ? 'warning' : 'ok',
372384
$message,
373385
self::CATEGORY_CORRECTNESS,
374386
[
@@ -394,6 +406,10 @@ private static function routingHealthCheck(
394406
'wake_owner' => $wakeOwner,
395407
'task_dispatch_mode' => $taskDispatchMode,
396408
'active_worker_scopes' => $activeWorkerScopes,
409+
'required_compatibility' => $requiredCompatibility,
410+
'active_workers' => $activeWorkers,
411+
'active_workers_supporting_required' => $activeWorkersSupportingRequired,
412+
'fleet_supports_required' => $fleetSupportsRequired,
397413
],
398414
);
399415
}

tests/Unit/V2/HealthCheckTest.php

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,13 @@ public function testSnapshotReportsRoutingHealthOkWhenNoRoutingRisksAreVisible()
810810
$this->assertSame('queue', $routing['data']['task_dispatch_mode']);
811811
$this->assertTrue($routing['data']['queue_wake_enabled']);
812812
$this->assertSame(0, $routing['data']['active_worker_scopes']);
813+
// No marker is required (fresh test config) and the fleet has no
814+
// heartbeats yet, but `fleet_supports_required` is still true because
815+
// the unscoped case never blocks routing.
816+
$this->assertNull($routing['data']['required_compatibility']);
817+
$this->assertSame(0, $routing['data']['active_workers']);
818+
$this->assertSame(0, $routing['data']['active_workers_supporting_required']);
819+
$this->assertTrue($routing['data']['fleet_supports_required']);
813820
}
814821

815822
public function testSnapshotWarnsWhenRoutingHealthSeesCompatibilityDispatchAndClaimDrains(): void
@@ -934,6 +941,81 @@ public function testSnapshotWarnsWhenRoutingHealthSeesCompatibilityDispatchAndCl
934941
$this->assertSame('dedicated_repair_pass', $routing['data']['wake_owner']);
935942
$this->assertSame('poll', $routing['data']['task_dispatch_mode']);
936943
$this->assertSame(0, $routing['data']['active_worker_scopes']);
944+
$this->assertNull($routing['data']['required_compatibility']);
945+
$this->assertSame(0, $routing['data']['active_workers']);
946+
$this->assertSame(0, $routing['data']['active_workers_supporting_required']);
947+
$this->assertTrue($routing['data']['fleet_supports_required']);
948+
}
949+
950+
public function testRoutingHealthMessageNamesMissingFleetCoverageWhenCompatibilityBlocksHaveZeroSupportingWorkers(): void
951+
{
952+
Carbon::setTestNow('2026-04-09 12:00:00');
953+
$this->beforeApplicationDestroyed(static function (): void {
954+
Carbon::setTestNow();
955+
});
956+
957+
config()->set('queue.default', 'redis');
958+
config()->set('queue.connections.redis.driver', 'redis');
959+
config()->set('cache.default', 'array');
960+
config()->set('cache.stores.array.driver', 'array');
961+
config()->set('workflows.v2.compatibility.current', 'release-2026-04-09');
962+
963+
$instance = WorkflowInstance::query()->create([
964+
'id' => 'health-routing-fleet-instance',
965+
'workflow_class' => 'WorkflowClass',
966+
'workflow_type' => 'workflow.test',
967+
'run_count' => 1,
968+
]);
969+
970+
$run = WorkflowRun::query()->create([
971+
'id' => '01JHEALTHROUTINGFLEET0001',
972+
'workflow_instance_id' => $instance->id,
973+
'run_number' => 1,
974+
'workflow_class' => 'WorkflowClass',
975+
'workflow_type' => 'workflow.test',
976+
'status' => 'running',
977+
'compatibility' => 'release-2026-04-09',
978+
'started_at' => now()->subMinutes(8),
979+
'last_progress_at' => now()->subMinute(),
980+
]);
981+
982+
$instance->forceFill([
983+
'current_run_id' => $run->id,
984+
])->save();
985+
986+
WorkflowRunSummary::query()->create([
987+
'id' => $run->id,
988+
'workflow_instance_id' => $instance->id,
989+
'run_number' => 1,
990+
'is_current_run' => true,
991+
'engine_source' => 'v2',
992+
'class' => 'WorkflowClass',
993+
'workflow_type' => 'workflow.test',
994+
'status' => 'running',
995+
'status_bucket' => 'running',
996+
'compatibility' => 'release-2026-04-09',
997+
'started_at' => now()->subMinutes(8),
998+
'next_task_at' => now()->subMinutes(5),
999+
'liveness_state' => 'workflow_task_waiting_for_compatible_worker',
1000+
'liveness_reason' => 'No active worker heartbeat advertises the required compatibility marker.',
1001+
'created_at' => now()->subMinutes(8),
1002+
'updated_at' => now(),
1003+
]);
1004+
1005+
$snapshot = HealthCheck::snapshot();
1006+
$routing = collect($snapshot['checks'])->firstWhere('name', 'routing_health');
1007+
1008+
$this->assertNotNull($routing);
1009+
$this->assertSame('warning', $routing['status']);
1010+
$this->assertStringContainsString(
1011+
'no active worker heartbeat advertises the required compatibility marker',
1012+
$routing['message'],
1013+
);
1014+
$this->assertSame('release-2026-04-09', $routing['data']['required_compatibility']);
1015+
$this->assertSame(0, $routing['data']['active_workers']);
1016+
$this->assertSame(0, $routing['data']['active_workers_supporting_required']);
1017+
$this->assertFalse($routing['data']['fleet_supports_required']);
1018+
$this->assertSame(1, $routing['data']['compatibility_blocked_runs']);
9371019
}
9381020

9391021
public function testSnapshotClassifiesEveryCheckAsCorrectnessOrAcceleration(): void

tests/Unit/V2/RolloutSafetyDocumentationTest.php

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,27 @@ public function testContractDocumentFreezesRoutingHealthWakeOwnerRollup(): void
608608
);
609609
}
610610

611+
public function testContractDocumentFreezesRoutingHealthFleetCoverageRollup(): void
612+
{
613+
$contents = $this->documentContents();
614+
615+
$this->assertMatchesRegularExpression(
616+
'/`routing_health`[\s\S]{0,1200}`required_compatibility`[\s\S]{0,200}`active_workers`[\s\S]{0,200}`active_workers_supporting_required`[\s\S]{0,200}`fleet_supports_required`/',
617+
$contents,
618+
'Rollout safety contract must pin the routing_health fleet-coverage rollup quad (required_compatibility, active_workers, active_workers_supporting_required, fleet_supports_required) so operators reading routing_health alone can tell whether a compatibility block reflects "no live worker advertises the required marker" — the canonical fail-closed admission case escalated under DW_V2_FLEET_VALIDATION_MODE=fail — without joining the worker_compatibility check separately.',
619+
);
620+
$this->assertMatchesRegularExpression(
621+
'/`fleet_supports_required` is `true` when no marker is required[\s\S]{0,200}or at least one heartbeat advertises the required\s+marker/i',
622+
$contents,
623+
'Rollout safety contract must define when fleet_supports_required is true (no marker required, or at least one heartbeat advertises the required marker) so operators can interpret the routing_health convenience boolean unambiguously.',
624+
);
625+
$this->assertMatchesRegularExpression(
626+
'/`compatibility_blocked_runs > 0`[\s\S]{0,200}`fleet_supports_required = false`[\s\S]{0,400}`message`[\s\S]{0,200}missing-coverage/i',
627+
$contents,
628+
'Rollout safety contract must require the routing_health message to name the missing-coverage case (compatibility_blocked_runs > 0 AND fleet_supports_required = false) explicitly so operators do not have to cross-reference worker_compatibility to understand the block.',
629+
);
630+
}
631+
611632
public function testContractDocumentFreezesHealthCheckNames(): void
612633
{
613634
$contents = $this->documentContents();

0 commit comments

Comments
 (0)