Skip to content

Commit f6d8dbe

Browse files
authored
[fm] everyone gets a (nullable) slot (#10127)
This is a smallish refactor to the database schema for the `ereport` table. Presently, we have `sp_type` and `sp_slot` columns, both of which are nullable, and a `CHECK` constraint that enforces they are non-NULL for SP ereports and NULL for host OS ereports. This was done because at the time, I figured we could just use the sled ID column to go find the location for host OS ereports. It turns out that this is somewhat more fraught than I had hoped --- especially if a sled bounces or is removed from the control plane, in which case these records don't make sense anymore --- and it's much easier to just always record location for a reporter when inserting the ereport. Thus, this branch changes the schema a bit to: 1. rename the columns a bit so they're no longer SP-specific 2. make `slot_type` (nee `sp_type`) non-NULL, since even if we don't know the sled number for a host ereport, we will always know it came from a sled, by construction 3. change the `CHECK` constraint to allow host OS ereports to say what slot the sled was in 4. change up some of the model and domain types to always have a slot number This closes #10096, which was my previous attempt to mess around with the schema for representing ereport physical locations. Unlike that branch, we leave the slot number nullable here for host OS ereports, since (as discussed [here][1]) there may be periods of time during which a sled-agent is known to the control plane, but we don't actually know the physical location of the sled that sled-agent corresponds to. We hope this won't happen very often, but we must be able to represent it rather than being forced to discard any ereports from such sleds. There's a bit of a complex migration that attempts to backfill the `slot` column for host OS ereports if we can find the sled UUID in the inventory. In practice, this will not actually end up doing anything, since...there's no code for actually collecting host OS ereports and putting them in CRDB, so there aren't actually going to be any existing records in need of backfilling outside of a couple integration tests. But it felt nicer to *try* to populate this information anyway, I guess because I'm a big dweeb or something. [1]: #10096 (comment)
1 parent 9f69562 commit f6d8dbe

25 files changed

Lines changed: 528 additions & 87 deletions

File tree

dev-tools/omdb/src/bin/omdb/db/ereport.rs

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ async fn cmd_db_ereport_info(
256256
const CLASS: &str = "class";
257257
const REPORTER: &str = "reported by";
258258
const RESTART_ID: &str = "restart ID";
259+
const SLED_ID: &str = " sled ID";
259260
const PART_NUMBER: &str = " part number";
260261
const SERIAL_NUMBER: &str = " serial number";
261262
const WIDTH: usize = const_max_len(&[
@@ -264,6 +265,7 @@ async fn cmd_db_ereport_info(
264265
TIME_DELETED,
265266
COLLECTOR_ID,
266267
REPORTER,
268+
SLED_ID,
267269
PART_NUMBER,
268270
SERIAL_NUMBER,
269271
]);
@@ -297,8 +299,15 @@ async fn cmd_db_ereport_info(
297299
" {REPORTER:>WIDTH$}: {sp_type:?} {slot} (service processor)"
298300
)
299301
}
300-
Ok(Reporter::HostOs { sled }) => {
301-
println!(" {REPORTER:>WIDTH$}: sled {sled:?} (host OS)");
302+
Ok(Reporter::HostOs { sled, slot }) => {
303+
if let Some(slot) = slot {
304+
println!(" {REPORTER:>WIDTH$}: sled {slot} (host OS)");
305+
} else {
306+
println!(
307+
" {REPORTER:>WIDTH$}: <unknown sled slot> (host OS)"
308+
);
309+
}
310+
println!(" {SLED_ID:>WIDTH$}: {sled:?}")
302311
}
303312
}
304313
println!(" {RESTART_ID:>WIDTH$}: {restart_id}");
@@ -361,8 +370,8 @@ async fn cmd_db_ereporters(
361370
dsl::restart_id,
362371
dsl::reporter,
363372
dsl::sled_id,
364-
dsl::sp_slot,
365-
dsl::sp_type,
373+
dsl::slot_type,
374+
dsl::slot,
366375
dsl::serial_number,
367376
dsl::part_number
368377
))
@@ -379,7 +388,7 @@ async fn cmd_db_ereporters(
379388
if let Some(slot) = slot {
380389
if slot_type.is_some() {
381390
query = query
382-
.filter(dsl::sp_slot.eq(db::model::SqlU16::new(slot)));
391+
.filter(dsl::slot.eq(db::model::SqlU16::new(slot)));
383392
} else {
384393
anyhow::bail!(
385394
"cannot filter reporters by slot without a value for `--type`"
@@ -389,7 +398,7 @@ async fn cmd_db_ereporters(
389398

390399
if let Some(slot_type) = slot_type {
391400
query = query
392-
.filter(dsl::sp_type.eq(slot_type));
401+
.filter(dsl::slot_type.eq(slot_type));
393402
}
394403

395404
if let Some(serial) = serial {

nexus/db-model/src/ereport.rs

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -104,31 +104,30 @@ pub struct Ereport {
104104
pub struct Reporter {
105105
pub reporter: EreporterType,
106106

107-
//
108-
// The physical location of the reporting SP.
109-
//
110-
/// SP location: the type of SP slot (sled, switch, power shelf).
111-
///
112-
/// For SP ereports (i.e. those with `reporter == EreporterType::Sp`) this
113-
/// is never NULL, which is enforced by the `reporter_identity_validity`
114-
/// CHECK constraint. This is because SPs are indexed by their physical
115-
/// location when requesting ereports through MGS.
116-
pub sp_type: Option<SpType>,
117-
/// SP location: the slot number.
118-
///
119-
/// For SP ereports (i.e. those with `reporter == EreporterType::Sp`) this
120-
/// is never NULL, which is enforced by the `reporter_identity_validity`
121-
/// CHECK constraint. This is because SPs are indexed by their physical
122-
/// location when requesting ereports through MGS.
123-
pub sp_slot: Option<SpMgsSlot>,
124-
125107
/// For host OS ereports, the sled UUID of the sled-agent from which this
126108
/// ereport was received.
127109
///
128110
/// This is never NULL for host OS ereports (i.e. those with `reporter ==
129111
/// EreporterType::Host`). This is enforced by the
130112
/// `reporter_identity_validity` CHECK constraint.
131113
pub sled_id: Option<DbTypedUuid<SledKind>>,
114+
115+
//
116+
// The physical location of the reporter
117+
//
118+
/// Reporter location: the type of the physical slot (sled, switch, power
119+
/// shelf).
120+
pub slot_type: SpType,
121+
/// Reporter location: the slot number.
122+
///
123+
/// For SP ereports (i.e. those with `reporter == EreporterType::Sp`) this
124+
/// is never NULL, which is enforced by the `reporter_identity_validity`
125+
/// CHECK constraint. This is because SPs are indexed by their physical
126+
/// location when requesting ereports through MGS. For host OS ereports,
127+
/// this may be NULL, as it is possible for a sled-agent to be part of the
128+
/// control plane before its location is included in an inventory
129+
/// collection.
130+
pub slot: Option<SpMgsSlot>,
132131
}
133132

134133
impl Ereport {
@@ -223,17 +222,17 @@ impl TryFrom<Ereport> for types::Ereport {
223222
impl From<types::Reporter> for Reporter {
224223
fn from(reporter: types::Reporter) -> Self {
225224
match reporter {
226-
types::Reporter::HostOs { sled } => Self {
225+
types::Reporter::HostOs { sled, slot } => Self {
227226
reporter: EreporterType::Host,
228227
sled_id: Some(sled.into()),
229-
sp_type: None,
230-
sp_slot: None,
228+
slot_type: SpType::Sled,
229+
slot: slot.map(SpMgsSlot::from),
231230
},
232231
types::Reporter::Sp { sp_type, slot } => Self {
233232
reporter: EreporterType::Sp,
234-
sp_type: Some(sp_type.into()),
235-
sp_slot: Some(slot.into()),
236233
sled_id: None,
234+
slot_type: sp_type.into(),
235+
slot: Some(slot.into()),
237236
},
238237
}
239238
}
@@ -245,35 +244,53 @@ impl TryFrom<Reporter> for types::Reporter {
245244
match reporter {
246245
Reporter {
247246
reporter: EreporterType::Sp,
248-
sp_type: Some(sp_type),
249-
sp_slot: Some(slot),
247+
slot_type,
248+
slot: Some(slot),
250249
..
251250
} => Ok(Self::Sp {
252-
sp_type: sp_type.into(),
251+
sp_type: slot_type.into(),
253252
slot: crate::SqlU16::from(slot).0,
254253
}),
255254
Reporter {
256-
reporter: EreporterType::Sp, sp_type, sp_slot, ..
255+
reporter: EreporterType::Sp, slot_type, slot, ..
257256
} => Err(Error::InternalError {
258257
internal_message: format!(
259258
"the 'reporter_identity_validity' CHECK constraint \
260259
should enforce that ereports with reporter='sp' have \
261-
a non-NULL SP type and slot, but this ereport has \
262-
sp_type={sp_type:?} and sp_slot={sp_slot:?}",
260+
a non-NULL `slot`, but this ereport has \
261+
slot_type={slot_type:?} and slot={slot:?}",
263262
),
264263
}),
265264
Reporter {
266265
reporter: EreporterType::Host,
267266
sled_id: Some(id),
267+
slot_type: SpType::Sled,
268+
slot,
268269
..
269-
} => Ok(Self::HostOs { sled: id.into() }),
270+
} => Ok(Self::HostOs {
271+
sled: id.into(),
272+
slot: slot.map(|slot| crate::SqlU16::from(slot).0),
273+
}),
270274
Reporter {
271-
reporter: EreporterType::Host, sled_id: None, ..
275+
reporter: EreporterType::Host,
276+
slot_type: SpType::Sled,
277+
sled_id: None,
278+
..
272279
} => Err(Error::internal_error(
273280
"the 'reporter_identity_validity' CHECK constraint \
274281
should enforce that ereports with reporter='host' \
275282
have a non-NULL sled_id, but this ereport does not",
276283
)),
284+
Reporter { reporter: EreporterType::Host, slot_type, .. } => {
285+
Err(Error::InternalError {
286+
internal_message: format!(
287+
"the 'reporter_identity_validity' CHECK constraint \
288+
should enforce that ereports with reporter='host' \
289+
have slot_type='sled', but this ereport has \
290+
slot_type={slot_type:?}",
291+
),
292+
})
293+
}
277294
}
278295
}
279296
}

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(241, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(242, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(242, "ereport-everyone-gets-a-slot"),
3132
KnownVersion::new(241, "audit-log-incomplete-timeout"),
3233
KnownVersion::new(240, "multicast-drop-mvlan"),
3334
KnownVersion::new(239, "fm-alert-request"),

nexus/db-queries/src/db/datastore/ereport.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,8 @@ impl DataStore {
197197
.group_by((
198198
dsl::restart_id,
199199
dsl::reporter,
200-
dsl::sp_slot,
201-
dsl::sp_type,
200+
dsl::slot_type,
201+
dsl::slot,
202202
dsl::sled_id,
203203
))
204204
.select((
@@ -236,7 +236,7 @@ impl DataStore {
236236
.get_result_async(conn)
237237
.await
238238
}
239-
fm::Reporter::HostOs { sled } => {
239+
fm::Reporter::HostOs { sled, .. } => {
240240
Self::host_latest_ereport_id_query(sled)
241241
.get_result_async(conn)
242242
.await
@@ -258,9 +258,9 @@ impl DataStore {
258258
) -> impl RunnableQuery<EreportIdTuple> {
259259
dsl::ereport
260260
.filter(
261-
dsl::sp_type
261+
dsl::slot_type
262262
.eq(sp_type)
263-
.and(dsl::sp_slot.eq(slot))
263+
.and(dsl::slot.eq(slot))
264264
.and(dsl::time_deleted.is_null()),
265265
)
266266
.order_by((dsl::time_collected.desc(), dsl::ena.desc()))

nexus/db-schema/src/schema.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2869,9 +2869,9 @@ table! {
28692869
report -> Jsonb,
28702870

28712871
reporter -> crate::enums::EreporterTypeEnum,
2872-
sp_type -> Nullable<crate::enums::SpTypeEnum>,
2873-
sp_slot -> Nullable<Int4>,
28742872
sled_id -> Nullable<Uuid>,
2873+
slot_type -> crate::enums::SpTypeEnum,
2874+
slot -> Nullable<Int4>,
28752875
}
28762876
}
28772877

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -587,10 +587,11 @@ mod test {
587587
async fn make_fake_ereports(datastore: &DataStore, opctx: &OpContext) {
588588
const SP_SERIAL: &str = "BRM42000069";
589589
const HOST_SERIAL: &str = "BRM66600042";
590+
const SLED_SLOT: u16 = 8;
590591
const GIMLET_PN: &str = "9130000019";
591592
// Make some SP ereports...
592593
let sp_restart_id = EreporterRestartUuid::new_v4();
593-
datastore.ereports_insert(&opctx, Reporter::Sp { sp_type: SpType::Sled, slot: 8}, vec![
594+
datastore.ereports_insert(&opctx, Reporter::Sp { sp_type: SpType::Sled, slot: SLED_SLOT}, vec![
594595
EreportData {
595596
id: EreportId { restart_id: sp_restart_id, ena: ereport_types::Ena(1) },
596597
time_collected: chrono::Utc::now(),
@@ -648,7 +649,10 @@ mod test {
648649
datastore
649650
.ereports_insert(
650651
&opctx,
651-
Reporter::HostOs { sled: SledUuid::new_v4() },
652+
Reporter::HostOs {
653+
sled: SledUuid::new_v4(),
654+
slot: Some(SLED_SLOT),
655+
},
652656
vec![
653657
EreportData {
654658
id: EreportId {
@@ -681,7 +685,7 @@ mod test {
681685
datastore
682686
.ereports_insert(
683687
&opctx,
684-
Reporter::HostOs { sled: SledUuid::new_v4() },
688+
Reporter::HostOs { sled: SledUuid::new_v4(), slot: Some(SLED_SLOT) },
685689
vec![
686690
EreportData {
687691
id: EreportId { restart_id: EreporterRestartUuid::new_v4(), ena: ereport_types::Ena(1) },

0 commit comments

Comments
 (0)