Skip to content

Commit ce1858e

Browse files
authored
Merge pull request #1984 from diffora/am/user-cleanup-caller-context
feat(account-management): cleanup caller context
2 parents d817f05 + 0e648a3 commit ce1858e

16 files changed

Lines changed: 415 additions & 121 deletions

File tree

libs/modkit-db/src/contention.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,16 @@ const MYSQL_DEADLOCK_SQLSTATE: &str = "40001";
4444
const PG_SERIALIZATION_FAILURE: &str = "40001";
4545
const PG_DEADLOCK_DETECTED: &str = "40P01";
4646

47+
/// `PostgreSQL` retryable error MESSAGE fragments. sea-orm/sqlx surface the
48+
/// condition through the error's *message text* on the `Display` path, not the
49+
/// numeric SQLSTATE — so matching the code alone (`40001` / `40P01`) misses real
50+
/// serialization failures and deadlocks (e.g. `"could not serialize access due
51+
/// to concurrent update"` carries no `"40001"` substring). Matching the message
52+
/// is the reliable signal here. Message text is `lc_messages`-dependent, so the
53+
/// numeric-code checks above remain as a locale-independent belt-and-suspenders.
54+
const PG_SERIALIZATION_MSG: &str = "could not serialize access";
55+
const PG_DEADLOCK_MSG: &str = "deadlock detected";
56+
4757
/// `SQLite` error codes for write contention.
4858
///
4959
/// sqlx surfaces these as `"error returned from database: (code: N) database is locked"`.
@@ -82,7 +92,10 @@ fn is_mysql_deadlock(msg: &str) -> bool {
8292
}
8393

8494
fn is_pg_contention(msg: &str) -> bool {
85-
msg.contains(PG_SERIALIZATION_FAILURE) || msg.contains(PG_DEADLOCK_DETECTED)
95+
msg.contains(PG_SERIALIZATION_FAILURE)
96+
|| msg.contains(PG_DEADLOCK_DETECTED)
97+
|| msg.contains(PG_SERIALIZATION_MSG)
98+
|| msg.contains(PG_DEADLOCK_MSG)
8699
}
87100

88101
fn is_sqlite_busy(msg: &str) -> bool {
@@ -126,6 +139,23 @@ mod tests {
126139
assert!(is_retryable_contention(DbBackend::Postgres, &err));
127140
}
128141

142+
#[test]
143+
fn pg_serialization_failure_by_message_detected() {
144+
// Real sea-orm/sqlx Display carries the message, not the numeric
145+
// SQLSTATE — this is the exact text Postgres emits for 40001 and the
146+
// concurrent-DELETE regression that the numeric-only match missed.
147+
let err = exec_err(
148+
"error returned from database: could not serialize access due to concurrent update",
149+
);
150+
assert!(is_retryable_contention(DbBackend::Postgres, &err));
151+
}
152+
153+
#[test]
154+
fn pg_deadlock_by_message_detected() {
155+
let err = exec_err("error returned from database: deadlock detected");
156+
assert!(is_retryable_contention(DbBackend::Postgres, &err));
157+
}
158+
129159
// ── SQLite BUSY (code 5) ─────────────────────────────────────────
130160

131161
#[test]

modules/system/account-management/account-management/src/domain/metrics.rs

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,22 @@
2020
//! call-site migration onto the typed ports proceeds family-by-family;
2121
//! the bridge and the [`emit_*`] helpers are removed in the final
2222
//! cleanup PR once every call site has moved.
23+
//!
24+
//! ## Metric naming
25+
//!
26+
//! The `AM_*` constants are the **full, literal Prometheus names** —
27+
//! the exact series names that appear in Prometheus / `VictoriaMetrics`.
28+
//! They bake in the suffix that the OTel→Prometheus translation would
29+
//! otherwise add: counters carry `_total`, and gauges / histograms that
30+
//! measure a quantity carry the unit word (`_seconds`, `_milliseconds`);
31+
//! gauges that are bare counts carry no suffix. No `.with_unit()` hint
32+
//! is set on any instrument. Because the name is already literal and
33+
//! unit-free at the instrument level, the rendered Prometheus name is
34+
//! identical whether the collector has `add_metric_suffixes` on or off
35+
//! (the exporter dedups an existing `_total` and adds no unit suffix
36+
//! when none is configured). The const VALUES equal the rendered name
37+
//! under the default `"am"` prefix, so the facade-bridge `match family`
38+
//! dispatch routes correctly.
2339
2440
use std::sync::Arc;
2541
use std::sync::LazyLock;
@@ -29,62 +45,62 @@ use modkit_macros::domain_model;
2945

3046
// @cpt-begin:cpt-cf-account-management-dod-errors-observability-metric-catalog:p1:inst-dod-metric-catalog-constants
3147
/// Dependency-call health: `IdP` / Resource Group / GTS / `AuthZ` outbound calls.
32-
pub const AM_DEPENDENCY_HEALTH: &str = "am.dependency_health";
48+
pub const AM_DEPENDENCY_HEALTH: &str = "am_dependency_health_total";
3349

3450
/// Tenant-metadata resolution operations and inheritance policy outcomes.
35-
pub const AM_METADATA_RESOLUTION: &str = "am.metadata_resolution";
51+
pub const AM_METADATA_RESOLUTION: &str = "am_metadata_resolution_total";
3652

3753
/// Root-tenant bootstrap lifecycle (phase transitions, IdP-wait timeouts).
38-
pub const AM_BOOTSTRAP_LIFECYCLE: &str = "am.bootstrap_lifecycle";
54+
pub const AM_BOOTSTRAP_LIFECYCLE: &str = "am_bootstrap_lifecycle_total";
3955

4056
/// Provisioning reaper / hard-delete / deprovision background job telemetry.
41-
pub const AM_TENANT_RETENTION: &str = "am.tenant_retention";
57+
pub const AM_TENANT_RETENTION: &str = "am_tenant_retention_total";
4258

4359
/// Invalid retention-window configuration encountered while evaluating due-ness.
44-
pub const AM_RETENTION_INVALID_WINDOW: &str = "am.retention.invalid_window";
60+
pub const AM_RETENTION_INVALID_WINDOW: &str = "am_retention_invalid_window_total";
4561

4662
/// Mode-conversion request transitions and outcomes.
47-
pub const AM_CONVERSION_LIFECYCLE: &str = "am.conversion_lifecycle";
63+
pub const AM_CONVERSION_LIFECYCLE: &str = "am_conversion_lifecycle_total";
4864

4965
/// Hierarchy-depth threshold exceedance (warning-band + hard-limit rejects).
50-
pub const AM_HIERARCHY_DEPTH_EXCEEDANCE: &str = "am.hierarchy_depth_exceedance";
66+
pub const AM_HIERARCHY_DEPTH_EXCEEDANCE: &str = "am_hierarchy_depth_exceedance_total";
5167

5268
/// Cross-tenant denial counter (security-alert candidate family).
53-
pub const AM_CROSS_TENANT_DENIAL: &str = "am.cross_tenant_denial";
69+
pub const AM_CROSS_TENANT_DENIAL: &str = "am_cross_tenant_denial_total";
5470

5571
/// Hierarchy-integrity violation telemetry (one per integrity category).
56-
pub const AM_HIERARCHY_INTEGRITY_VIOLATIONS: &str = "am.hierarchy_integrity_violations";
72+
pub const AM_HIERARCHY_INTEGRITY_VIOLATIONS: &str = "am_hierarchy_integrity_violations";
5773

5874
/// Periodic integrity-check job tick outcome (`outcome` ∈ `completed` |
5975
/// `skipped_in_progress` | `failed`). Distinguishes a clean tick from a
6076
/// never-ran job, which [`AM_HIERARCHY_INTEGRITY_VIOLATIONS`] alone cannot.
61-
pub const AM_HIERARCHY_INTEGRITY_RUNS: &str = "am.hierarchy_integrity_runs";
77+
pub const AM_HIERARCHY_INTEGRITY_RUNS: &str = "am_hierarchy_integrity_runs_total";
6278

6379
/// Periodic auto-repair tick outcome — separate family from
6480
/// [`AM_HIERARCHY_INTEGRITY_RUNS`] so its fixed-label set is not widened.
65-
pub const AM_HIERARCHY_INTEGRITY_REPAIR_RUNS: &str = "am.hierarchy_integrity_repair_runs";
81+
pub const AM_HIERARCHY_INTEGRITY_REPAIR_RUNS: &str = "am_hierarchy_integrity_repair_runs_total";
6682

6783
/// Periodic integrity-check tick wall-clock duration in milliseconds.
6884
/// The `phase` label disaggregates the check phase (`phase = "check"`)
6985
/// from the chained auto-repair phase (`phase = "repair"`) so
7086
/// dashboards can tell a slow check from a slow check + repair.
7187
/// Drives capacity-planning alerts ("p95 > 60s"), distinct from
7288
/// [`AM_HIERARCHY_INTEGRITY_RUNS`] which is a tick-outcome counter.
73-
pub const AM_HIERARCHY_INTEGRITY_DURATION: &str = "am.hierarchy_integrity_duration";
89+
pub const AM_HIERARCHY_INTEGRITY_DURATION: &str = "am_hierarchy_integrity_duration_milliseconds";
7490

7591
/// Unix-epoch seconds of the last successful integrity-check tick.
7692
/// Used for a freshness watchdog (alert when `last_success` is older
7793
/// than twice the configured interval) that the violation gauge
7894
/// cannot satisfy on its own — a stuck job and a perfectly-clean tree
7995
/// look identical at the violation-gauge level until this gauge stops
8096
/// advancing.
81-
pub const AM_HIERARCHY_INTEGRITY_LAST_SUCCESS: &str = "am.hierarchy_integrity_last_success";
97+
pub const AM_HIERARCHY_INTEGRITY_LAST_SUCCESS: &str = "am_hierarchy_integrity_last_success_seconds";
8298

8399
/// Unix-epoch seconds of the last failed integrity-check tick — paired
84100
/// with [`AM_HIERARCHY_INTEGRITY_LAST_SUCCESS`] so operators can tell
85101
/// "sustained failure" from "never ran" (the success gauge keeps the last
86102
/// good timestamp indefinitely).
87-
pub const AM_HIERARCHY_INTEGRITY_LAST_FAILURE: &str = "am.hierarchy_integrity_last_failure";
103+
pub const AM_HIERARCHY_INTEGRITY_LAST_FAILURE: &str = "am_hierarchy_integrity_last_failure_seconds";
88104

89105
/// Lock-lifecycle event counter for `integrity_check_runs`. Emitted
90106
/// from [`crate::infra::storage::integrity::lock::release`] when the
@@ -97,7 +113,7 @@ pub const AM_HIERARCHY_INTEGRITY_LAST_FAILURE: &str = "am.hierarchy_integrity_la
97113
/// scheduler-tick outcome set) so dashboards keyed on
98114
/// `RUNS{outcome=*}` stay stable; this counter exists for
99115
/// lock-health alerting.
100-
pub const AM_INTEGRITY_LOCK_EVENTS: &str = "am.integrity_lock_events";
116+
pub const AM_INTEGRITY_LOCK_EVENTS: &str = "am_integrity_lock_events_total";
101117

102118
/// Hierarchy-integrity repair telemetry. Emits one gauge sample per
103119
/// run with `category` ∈ all 10
@@ -107,13 +123,25 @@ pub const AM_INTEGRITY_LOCK_EVENTS: &str = "am.integrity_lock_events";
107123
/// did not appear). The five derivable categories carry counts only
108124
/// in `bucket = repaired`; the five operator-triage categories carry
109125
/// counts only in `bucket = deferred`.
110-
pub const AM_HIERARCHY_INTEGRITY_REPAIRED: &str = "am.hierarchy_integrity_repaired";
126+
pub const AM_HIERARCHY_INTEGRITY_REPAIRED: &str = "am_hierarchy_integrity_repaired";
111127

112128
/// SERIALIZABLE-isolation retry telemetry for the AM repo's
113129
/// `with_serializable_retry` helper.
114-
pub const AM_SERIALIZABLE_RETRY: &str = "am.serializable_retry";
130+
pub const AM_SERIALIZABLE_RETRY: &str = "am_serializable_retry_total";
115131
// @cpt-end:cpt-cf-account-management-dod-errors-observability-metric-catalog:p1:inst-dod-metric-catalog-constants
116132

133+
/// Live tenant inventory gauge: current tenant row count, broken down
134+
/// by `status` (provisioning | active | suspended | deleted) and
135+
/// `self_managed` (true | false). A bare-count gauge, so it carries no
136+
/// unit suffix. Refreshed each reaper tick.
137+
pub const AM_TENANTS: &str = "am_tenants";
138+
139+
/// Live `tenant_closure` table size gauge: total ancestor-descendant
140+
/// edge count. A bare-count gauge (no unit suffix), refreshed each
141+
/// reaper tick alongside [`AM_TENANTS`]. Grows ~O(tenants × depth);
142+
/// a divergence from that expectation flags closure bloat / stale edges.
143+
pub const AM_TENANT_CLOSURE_ROWS: &str = "am_tenant_closure_rows";
144+
117145
/// Kinds of metric samples the emitter supports.
118146
#[domain_model]
119147
#[derive(Debug, Clone, Copy, PartialEq, Eq)]

modules/system/account-management/account-management/src/domain/ports/metrics.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,14 @@
2525
//! * **Cardinality discipline.** No label accepts a free `&str`.
2626
//! Numeric labels (`threshold` on `hierarchy_depth_exceedance`) are
2727
//! typed primitives; the adapter renders them at emit time.
28-
//! * **Metric names** keep the dot-separated form
29-
//! (`am.dependency_health`, `am.bootstrap_lifecycle`) from
30-
//! [`crate::domain::metrics`]. Unit hints are attached on the
31-
//! adapter via `.with_unit("ms")`/`.with_unit("s")`, not embedded
32-
//! in the metric name.
28+
//! * **Metric names** are the full, literal Prometheus names
29+
//! (`am_dependency_health_total`, `am_bootstrap_lifecycle_total`)
30+
//! from [`crate::domain::metrics`], with the OTel→Prometheus suffix
31+
//! baked in (counters `_total`; quantity gauges / histograms carry
32+
//! the unit word, e.g. `_seconds` / `_milliseconds`). No
33+
//! `.with_unit()` hint is set on the adapter, so the rendered name is
34+
//! identical regardless of the collector's `add_metric_suffixes`
35+
//! setting.
3336
3437
use account_management_sdk::idp::{IdpDeprovisionFailure, IdpProvisionFailure};
3538
use account_management_sdk::idp_user::IdpUserOperationFailure;

modules/system/account-management/account-management/src/domain/system_actor.rs

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,12 @@
5050
//! the provisioning-reaper batch on rows stuck in retry.
5151
//! * [`for_retention_sweep`] — `deprovision_tenant` from the
5252
//! hard-delete retention sweep on rows past their retention window.
53-
//! * [`for_user_cleanup`] — RG membership cleanup on `delete_user`
54-
//! after the `IdP` `deprovision_user` succeeds.
5553
//! * [`for_user_groups_cascade`] — RG cascade-cleanup hook fired
5654
//! when a tenant is hard-deleted.
55+
//!
56+
//! `delete_user`'s RG membership cleanup used to mint a `for_user_cleanup`
57+
//! system actor here; VHP-190 moved it to run under the caller's context
58+
//! (it is a caller-initiated flow), so no system actor is needed for it.
5759
5860
use modkit_security::SecurityContext;
5961
use uuid::Uuid;
@@ -147,20 +149,6 @@ pub(crate) fn for_retention_sweep(tenant_id: Uuid) -> SecurityContext {
147149
build_inner(Some(tenant_id))
148150
}
149151

150-
/// `delete_user` — RG membership cleanup after the `IdP`
151-
/// `deprovision_user` round trip succeeds. `tenant_id` is the
152-
/// owning tenant of the user being removed.
153-
#[must_use]
154-
pub(crate) fn for_user_cleanup(tenant_id: Uuid) -> SecurityContext {
155-
tracing::info!(
156-
target: "am.system_actor",
157-
site = "user_cleanup",
158-
tenant_id = %tenant_id,
159-
"am system actor constructed",
160-
);
161-
build_inner(Some(tenant_id))
162-
}
163-
164152
/// User-groups cascade-cleanup hook — fired when a tenant is
165153
/// hard-deleted and AM must walk its user-group memberships in RG.
166154
/// `tenant_id` is the tenant being deleted.
@@ -205,7 +193,6 @@ mod tests {
205193
("bootstrap", for_bootstrap(tenant)),
206194
("provisioning_reaper", for_provisioning_reaper(tenant)),
207195
("retention_sweep", for_retention_sweep(tenant)),
208-
("user_cleanup", for_user_cleanup(tenant)),
209196
("user_groups_cascade", for_user_groups_cascade(tenant)),
210197
] {
211198
assert_eq!(

modules/system/account-management/account-management/src/domain/tenant/repo.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,25 @@ pub trait TenantRepo: Send + Sync {
367367
filter: ChildCountFilter,
368368
) -> Result<u64, DomainError>;
369369

370+
/// Count live tenants grouped by `(status, self_managed)` for the
371+
/// `am_tenants` inventory gauge. Visibility is bounded by `scope`
372+
/// (the periodic refresher passes `AccessScope::allow_all` for a
373+
/// platform-wide count). Returns one entry per
374+
/// `(TenantStatus, self_managed)` combination — including
375+
/// zero-count combos — so the emitted gauge series stay stable
376+
/// tick-to-tick.
377+
async fn count_tenants_by_status(
378+
&self,
379+
scope: &AccessScope,
380+
) -> Result<Vec<(TenantStatus, bool, u64)>, DomainError>;
381+
382+
/// Count rows in `tenant_closure` (ancestor-descendant edges) for the
383+
/// `am_tenant_closure_rows` size gauge. The closure grows
384+
/// ~O(tenants × depth); tracking its row count surfaces closure bloat
385+
/// or integrity drift early (e.g. orphaned / stale edges that the
386+
/// integrity checker hasn't swept yet).
387+
async fn count_closure_rows(&self, scope: &AccessScope) -> Result<u64, DomainError>;
388+
370389
/// Flip the tenant from its current SDK-visible state to
371390
/// `Deleted`, stamp `deleted_at = now` (which also starts the
372391
/// retention timer — eligibility for hard-delete becomes

modules/system/account-management/account-management/src/domain/tenant/service/mod.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -878,7 +878,7 @@ impl<R: TenantRepo> TenantService<R> {
878878
AM_TENANT_RETENTION,
879879
MetricKind::Counter,
880880
&[
881-
("job", "saga_compensation"),
881+
("retention_job", "saga_compensation"),
882882
("outcome", "compensate_failed"),
883883
],
884884
);
@@ -924,7 +924,7 @@ impl<R: TenantRepo> TenantService<R> {
924924
AM_TENANT_RETENTION,
925925
MetricKind::Counter,
926926
&[
927-
("job", "saga_compensation"),
927+
("retention_job", "saga_compensation"),
928928
("outcome", "compensate_failed"),
929929
],
930930
);
@@ -1276,7 +1276,7 @@ impl<R: TenantRepo> TenantService<R> {
12761276
AM_TENANT_RETENTION,
12771277
MetricKind::Counter,
12781278
&[
1279-
("job", "saga_compensation"),
1279+
("retention_job", "saga_compensation"),
12801280
("outcome", "unsupported_required"),
12811281
],
12821282
);
@@ -1303,7 +1303,10 @@ impl<R: TenantRepo> TenantService<R> {
13031303
emit_metric(
13041304
AM_TENANT_RETENTION,
13051305
MetricKind::Counter,
1306-
&[("job", "saga_compensation"), ("outcome", "idp_unconfirmed")],
1306+
&[
1307+
("retention_job", "saga_compensation"),
1308+
("outcome", "idp_unconfirmed"),
1309+
],
13071310
);
13081311
false
13091312
}
@@ -1332,7 +1335,7 @@ impl<R: TenantRepo> TenantService<R> {
13321335
AM_TENANT_RETENTION,
13331336
MetricKind::Counter,
13341337
&[
1335-
("job", "saga_compensation"),
1338+
("retention_job", "saga_compensation"),
13361339
("outcome", "compensate_failed"),
13371340
],
13381341
);

0 commit comments

Comments
 (0)