Skip to content

Commit 5cc08bd

Browse files
committed
VPR-54 feat(scheduler): admin nav entry, area README, and 7-char audit stamp
- Adds a "Scheduler" tab to MainNav, gated by SVMSecure.CATS.scheduledJobs and pointing to /scheduler/dashboard; selectedTopNav highlights it when the admin is on the dashboard - Adds web/Areas/Scheduler/README.md covering job-onboarding recipe, configuration reference, dashboard URL, permission requirements, and an ops runbook (heartbeat verification, retry, pause/resume expectations, reconciler outcome counters, pre-escalation checklist) - Shortens ISchedulerJobsService.SchedulerActor from "__scheduler" to "__sched" so the audit stamp fits tblRoleMembers.ModBy (varchar(8)); caught while smoke-testing the RAPS role-refresh trigger end-to-end. The "__scheduler:" recurring-job-id prefix is unchanged - that string never lands in narrow legacy ModBy columns
1 parent 855e978 commit 5cc08bd

8 files changed

Lines changed: 236 additions & 8 deletions

File tree

web/Areas/RAPS/Jobs/RapsRoleRefreshScheduledJob.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ namespace Viper.Areas.RAPS.Jobs;
1010
/// First consumer of the scheduled-job abstraction: nightly role membership
1111
/// refresh. Wraps <see cref="RoleViews.UpdateRoles"/> and threads the run's
1212
/// <see cref="ScheduledJobContext.ModBy"/> through so the audit log clearly
13-
/// distinguishes scheduler-driven changes (<c>"__scheduler"</c>) from
13+
/// distinguishes scheduler-driven changes (<c>"__sched"</c>) from
1414
/// admin-driven manual runs (a real LoginId).
1515
/// </summary>
1616
[ScheduledJob(id: "raps:role-refresh", cron: "0 0 * * *", TimeZoneId = "Pacific Standard Time")]

web/Areas/RAPS/Services/RoleViews.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public async Task<List<string>> GetViewNames()
3737
/// </summary>
3838
/// <param name="modBy">
3939
/// Audit actor stamped on every <c>TblRoleMember</c> and <c>TblLog</c>
40-
/// row written by this run. Pass <c>"__scheduler"</c> for nightly
40+
/// row written by this run. Pass <c>"__sched"</c> for nightly
4141
/// recurring runs, the LoginId for manual admin runs, or rely on the
4242
/// <see cref="DefaultModBy"/> for legacy callers.
4343
/// </param>

web/Areas/Scheduler/Models/DTOs/Responses/SchedulerJobDto.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public class SchedulerJobDto
3636
/// <summary>Local timestamp the pause was recorded; null when not paused. Matches SchedulerJobState.PausedAt.</summary>
3737
public DateTime? PausedAt { get; set; }
3838

39-
/// <summary>LoginId or "__scheduler" who recorded the pause; null when not paused.</summary>
39+
/// <summary>LoginId or "__sched" who recorded the pause; null when not paused.</summary>
4040
public string? PausedBy { get; set; }
4141

4242
/// <summary>True for jobs in the reserved "__scheduler:" namespace; UI must hide pause/resume.</summary>

web/Areas/Scheduler/Models/Entities/SchedulerJobState.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public class SchedulerJobState
2828
/// <summary>Local timestamp the pause was recorded (DateTimeKind.Local per project DB convention).</summary>
2929
public DateTime PausedAt { get; set; }
3030

31-
/// <summary>LoginId of the operator, or "__scheduler" for system pauses.</summary>
31+
/// <summary>LoginId of the operator, or "__sched" for system pauses.</summary>
3232
public string PausedBy { get; set; } = string.Empty;
3333

3434
/// <summary>SQL Server rowversion for optimistic concurrency on pause/resume.</summary>

web/Areas/Scheduler/Models/ScheduledJobContext.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ public ScheduledJobContext(TriggerSource triggerSource, string modBy)
2121
public TriggerSource TriggerSource { get; }
2222

2323
/// <summary>
24-
/// Audit actor: <c>"__scheduler"</c> for scheduled runs, the LoginId for
25-
/// manual runs.
24+
/// Audit actor: <c>"__sched"</c> for scheduled runs (see
25+
/// <c>ISchedulerJobsService.SchedulerActor</c>), the LoginId for manual runs.
2626
/// </summary>
2727
public string ModBy { get; }
2828
}

web/Areas/Scheduler/README.md

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
# Scheduler
2+
3+
Cron-driven background jobs for VIPER. Built on Hangfire 1.8 with SQL Server
4+
storage; jobs are written against a thin `IScheduledJob` abstraction so they
5+
do not depend on Hangfire types directly.
6+
7+
This document is the operational source of truth for the scheduler:
8+
how to add a job, how it is configured, and how to triage incidents.
9+
10+
---
11+
12+
## Onboarding a job
13+
14+
Every recurring job is a class that implements `IScheduledJob` and carries a
15+
`[ScheduledJob]` attribute. Discovery happens at startup; there is no
16+
manifest file to update.
17+
18+
### 1. Declare the job
19+
20+
Place the file under your area's `Jobs/` folder. Example, the RAPS
21+
role-membership refresh:
22+
23+
```csharp
24+
// web/Areas/RAPS/Jobs/RapsRoleRefreshScheduledJob.cs
25+
[ScheduledJob(id: "raps:role-refresh", cron: "0 0 * * *", TimeZoneId = "Pacific Standard Time")]
26+
public sealed class RapsRoleRefreshScheduledJob : IScheduledJob
27+
{
28+
private readonly RAPSContext _rapsContext;
29+
private readonly ILogger<RapsRoleRefreshScheduledJob> _logger;
30+
31+
public RapsRoleRefreshScheduledJob(
32+
RAPSContext rapsContext,
33+
ILogger<RapsRoleRefreshScheduledJob> logger)
34+
{
35+
_rapsContext = rapsContext;
36+
_logger = logger;
37+
}
38+
39+
public async Task RunAsync(ScheduledJobContext context, CancellationToken ct)
40+
{
41+
var roleViews = new RoleViews(_rapsContext);
42+
await roleViews.UpdateRoles(modBy: context.ModBy, debugOnly: false);
43+
}
44+
}
45+
```
46+
47+
### 2. Naming rules
48+
49+
| Field | Rule |
50+
|---|---|
51+
| `id` | `area:job-name` for user jobs (e.g. `raps:role-refresh`). Must NOT start with `__scheduler:`. |
52+
| `id` (system jobs) | If `IsSystem = true`, the id MUST start with `__scheduler:`. The discovery pass refuses any combination that violates this invariant. |
53+
| `cron` | Five-field Hangfire cron (`m h dom mon dow`). |
54+
| `TimeZoneId` | Defaults to `Pacific Standard Time`, a Windows time zone id (UC Davis runs Windows). IANA ids such as `America/Los_Angeles` also work. |
55+
56+
### 3. Stamping audit rows
57+
58+
Background jobs run with no HTTP context, so `UserHelper.GetCurrentUser()`
59+
is **not available**. Every job receives a `ScheduledJobContext` carrying:
60+
61+
- `TriggerSource` &mdash; `Scheduled` for cron-driven runs, `Manual` for
62+
admin-triggered runs.
63+
- `ModBy` &mdash; the audit actor for this run. Pass it through to your
64+
service layer; do not derive it inside the job.
65+
66+
For the scheduler-triggered path, `ModBy` is `"__sched"` (7 chars; the
67+
legacy `tblRoleMembers.ModBy` column is `varchar(8)`, so the stamp is
68+
shortened to fit while staying distinct from the existing `"__system"`
69+
convention). Existing audit queries can filter on
70+
`WHERE ModBy = '__sched'` to isolate scheduler-driven changes from
71+
human-driven changes.
72+
73+
### 4. DI
74+
75+
Job dependencies are resolved from a fresh DI scope per execution. Any
76+
`Scoped` service (DbContexts, scoped services from Scrutor) works without
77+
extra wiring &mdash; the discovery pass registers your job type as
78+
`Scoped` for you.
79+
80+
### 5. What runs where
81+
82+
| Surface | Mechanism |
83+
|---|---|
84+
| Initial registration | At app startup, after Hangfire is mounted, every `[ScheduledJob]`-declared type is `AddOrUpdate`'d. Idempotent. |
85+
| Subsequent registrations | A fresh deploy with new jobs picks them up on next startup. |
86+
| Lost-registration heal | If a job is missing from Hangfire's storage and has no pause marker, the hourly `__scheduler:reconcile` recurring job re-registers it. |
87+
88+
---
89+
90+
## Configuration
91+
92+
All settings live in `appsettings.{Environment}.json` (or AWS SSM
93+
parameters in deployed environments).
94+
95+
| Key | Purpose | Default |
96+
|---|---|---|
97+
| `Hangfire:Enabled` | Master switch. When `false`, no scheduler wiring runs and the dashboard is unreachable. | `false` |
98+
| `ConnectionStrings:VIPER` | The database that hosts Hangfire's tables and our `[HangFire].[SchedulerJobState]` marker table. Required when `Hangfire:Enabled=true`. Hangfire and the marker table share VIPER; splitting them is not supported. | n/a |
99+
| `IPAddressAllowlistConfiguration:InternalAllowlist` | Source-IP gate for `/health/detail` and the HealthChecks UI. Add SVM infra ranges + your office subnet. | localhost only |
100+
101+
The dashboard does **not** read `InternalAllowlist`; it is always mounted at
102+
`/scheduler/dashboard` when Hangfire is enabled and is gated by the RAPS
103+
permission `SVMSecure.CATS.scheduledJobs`, not by source IP.
104+
105+
---
106+
107+
## Access
108+
109+
| Surface | URL | Auth |
110+
|---|---|---|
111+
| Hangfire dashboard | `/scheduler/dashboard` | Cookie auth (CAS) + RAPS permission `SVMSecure.CATS.scheduledJobs` |
112+
| Pause/resume API | `/api/scheduler/jobs`, `/api/scheduler/jobs/{id}/pause`, `/api/scheduler/jobs/{id}/resume` | Same RAPS permission |
113+
| Health (liveness) | `/health` | Anonymous (Jenkins polls it) |
114+
| Health (detail) | `/health/detail` | IP-allowlisted to `InternalAllowlist` |
115+
116+
`SVMSecure.CATS.scheduledJobs` is the same permission the legacy
117+
ColdFusion VIPER scheduler (`cats/inc_scheduledTasks.cfm`) checks &mdash;
118+
admins who already manage the legacy scheduler inherit access without a
119+
provisioning step.
120+
121+
---
122+
123+
## Pause and resume
124+
125+
Hangfire has no native "paused" state, so we deregister the recurring job
126+
and persist its definition in the `[HangFire].[SchedulerJobState]` marker
127+
table. The marker is the **declared source of truth** for "is this job
128+
paused?".
129+
130+
| Property | Behavior |
131+
|---|---|
132+
| Pause ordering | Marker write first, then `RemoveIfExists`. If `RemoveIfExists` throws, the API returns HTTP 202 with `deregistrationPending: true` and the reconciler completes the deregistration on its next pass. |
133+
| Resume ordering | Re-register first, then delete the marker. A residual marker after a successful registration is healed by the reconciler. |
134+
| Idempotency | Pause-on-already-paused returns 200 with the existing rowversion. Resume-on-already-active returns 200. |
135+
| Concurrency | `RowVersion` (SQL Server `rowversion`) guards every write. Stale rowversion &rarr; HTTP 409. |
136+
| System jobs | Ids starting with `__scheduler:` are refused (HTTP 403, `error: "system_job_not_pausable"`) before any write. They remain visible in the list and on the dashboard. |
137+
138+
---
139+
140+
## Operations runbook
141+
142+
### Heartbeat verification
143+
144+
| Symptom | Where to look |
145+
|---|---|
146+
| "Is the scheduler alive?" | `/health/detail` &mdash; the `hangfire` check reports `Healthy` (one or more servers with recent heartbeats), `Degraded` (storage reachable but no servers), or `Unhealthy` (storage error or all heartbeats > 2 minutes stale). |
147+
| "Did this job run?" | Dashboard &rarr; Recurring Jobs &rarr; row for the id; columns show last/next execution and last state. |
148+
| "Are workers processing?" | Dashboard &rarr; Servers panel; heartbeats refresh every 30 seconds. |
149+
150+
### Retrying a failed job
151+
152+
1. Open `/scheduler/dashboard`.
153+
2. Failed jobs appear in the **Failed** queue.
154+
3. Click the job &rarr; **Requeue** to retry once, or **Delete** to discard.
155+
4. Recurring jobs that fail still trigger on their next cron schedule
156+
regardless &mdash; requeue is for retrying the specific failed
157+
instance.
158+
159+
### Pause / resume expectations
160+
161+
| Scenario | Expected outcome |
162+
|---|---|
163+
| Pause an active (registered) job | Marker created, registration removed, returns 200. |
164+
| Pause when registration removal fails | Marker created, returns 202 with `deregistrationPending: true`. Reconciler finishes within an hour. |
165+
| Pause an already-paused job | Returns 200 idempotently (no marker rewrite). |
166+
| Resume a paused job | Registration restored, marker deleted, returns 200. |
167+
| Resume with stale rowversion | Returns 409. Refresh and retry. |
168+
| Pause/resume a `__scheduler:` job | Returns 403 with `error: "system_job_not_pausable"`. |
169+
170+
### Reconciler outcomes
171+
172+
The hourly `__scheduler:reconcile` job logs an outcome counter on every
173+
pass. Look for the structured log entry:
174+
175+
```text
176+
Scheduler reconciler pass: split-brain healed=N, system markers deleted=N, paused ok=N, active ok=N, lost registrations healed=N, markers=N, registrations=N
177+
```
178+
179+
Non-zero `split-brain healed`, `system markers deleted`, or
180+
`lost registrations healed` counts indicate drift was corrected this pass.
181+
Persistent non-zero values across passes mean something keeps creating
182+
drift &mdash; investigate before accepting it as normal.
183+
184+
### Pre-escalation checklist
185+
186+
Before paging a developer, verify in this order:
187+
188+
1. **Connection string** &mdash; `ConnectionStrings:VIPER` resolves
189+
(Hangfire shares this connection; there is no separate Hangfire connection string) and the SQL login has read/write
190+
on the `HangFire` schema.
191+
2. **Permission grant** &mdash; the user holds
192+
`SVMSecure.CATS.scheduledJobs` (check RAPS).
193+
3. **Server heartbeat** &mdash; `/health/detail` returns `hangfire`
194+
`Healthy`. If `Degraded` (no servers), the worker process is down or
195+
not started; confirm `Hangfire:Enabled=true` and check application
196+
startup logs for `"Hangfire is enabled but ConnectionStrings:VIPER is
197+
empty"` or DDL errors.
198+
4. **Recent deploys** &mdash; a job that disappeared after a deploy and
199+
has not yet been re-registered will be picked up by the next reconciler
200+
pass (within an hour). Force-trigger by restarting the app or by
201+
triggering `__scheduler:reconcile` from the dashboard.
202+
5. **Marker drift** &mdash; query
203+
`SELECT * FROM [HangFire].[SchedulerJobState]`. Markers with a
204+
`__scheduler:` id should not exist; the reconciler deletes them as a
205+
safety net but their presence indicates an attempted protected-prefix
206+
pause.
207+
208+
---
209+
210+
## Related code
211+
212+
| Concern | Location |
213+
|---|---|
214+
| Hangfire wiring (DI, dashboard mount, schema bootstrap) | `web/Classes/Scheduler/HangfireExtensions.cs` |
215+
| Dashboard auth filter | `web/Classes/Scheduler/HangfireDashboardAuthorizationFilter.cs` |
216+
| Per-job logging filter | `web/Classes/Scheduler/HangfireJobLoggingFilter.cs` |
217+
| Health check | `web/Classes/HealthChecks/HangfireHealthCheck.cs` |
218+
| Pause/resume API | `web/Areas/Scheduler/Controllers/JobsController.cs` |
219+
| Service layer (pause/resume/reconcile) | `web/Areas/Scheduler/Services/SchedulerJobsService.cs` |
220+
| Job abstraction | `web/Areas/Scheduler/Services/IScheduledJob.cs`, `ScheduledJobAttribute.cs`, `ScheduledJobDiscovery.cs`, `ScheduledJobRunner.cs` |
221+
| Marker table entity + DDL | `web/Areas/Scheduler/Models/Entities/SchedulerJobState.cs`, `web/Classes/Scheduler/SchedulerSchemaInitializer.cs` |

web/Areas/Scheduler/Services/ISchedulerJobsService.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,13 @@ public interface ISchedulerJobsService
1616
/// <summary>Stable id of the hourly reconciler job.</summary>
1717
public const string ReconcileJobId = SystemJobPrefix + "reconcile";
1818

19-
/// <summary>Stamp used on rows authored by scheduled (non-HTTP) executions.</summary>
20-
public const string SchedulerActor = "__scheduler";
19+
/// <summary>
20+
/// Stamp used on rows authored by scheduled (non-HTTP) executions. Kept
21+
/// at seven characters because the legacy <c>tblRoleMembers.ModBy</c>
22+
/// column is <c>varchar(8)</c> (mirrors the existing <c>"__system"</c>
23+
/// convention, distinct enough to filter on).
24+
/// </summary>
25+
public const string SchedulerActor = "__sched";
2126

2227
Task<List<SchedulerJobDto>> ListJobsAsync(CancellationToken ct = default);
2328

web/Views/Shared/Components/MainNav/MainNav.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public class MainNavViewComponent : ViewComponent
3030
new string[] { "~/Policy", "Policies", "SVMSecure" },
3131
new string[] { "/research/default.cfm", "Research", "SVMSecure.Research" },
3232
new string[] { "/schedule/default.cfm", "Schedule", "SVMSecure.Schedule" },
33+
new[] { "~/scheduler/dashboard", "Scheduler", "SVMSecure.CATS.scheduledJobs", "Hangfire scheduler dashboard" },
3334
new string[] { "/students/default.cfm", "Students", "SVMSecure.Students" },
3435
new string[] { "/Hospital/default.cfm", "VMTH", "SVMSecure" },
3536
new string[] { "https://ucdsvm.knowledgeowl.com/help", "", "", "Help" }
@@ -64,6 +65,7 @@ public async Task<IViewComponentResult> InvokeAsync(AaudUser user)
6465
{
6566
"raps" => "Computing",
6667
"policy" => "Policies",
68+
"scheduler" => "Scheduler",
6769
_ => "VIPER Home",
6870
};
6971
return await Task.Run(() => View("Default", user));

0 commit comments

Comments
 (0)