Skip to content

Commit 7c6ad41

Browse files
authored
Return sentinels from sled_insert_resource_query (#10399)
The failure mode seen in #10353 involves multiple contending sled reservations and lead to a sled reservation iterating over all possible potential allocation mappings, causing the instance-start to appear to hang when in reality it was exploring all potential permutations. The fix for that was to update the iterator's stored information so that it could recognize when the sled insert query it was about to execute could never work. This fix was insufficient as further testing lead to more instance-start sagas that appeared stuck. During contention, there are a few broad categories to how the sled reservation insert query could fail: - the target sled could no longer have the available resources - putting a VMM on the target sled would now violate affinity or anti-affinity rules - the pools affected by requested local storage mapping no longer have the required available space Updating the iterator's stored information would only let Nexus detect the last category where the instance-start saga hang seen today fell into the first. Nexus needs to know which part of the insert query failed so that it can decide what to do based on if local storage allocations are required or not, and this lead to this commit: using the "true or cast error" pattern, return sentinels from the sled insert resource query, and bail out of searching for a valid local storage allocation permutation if the sled target is no longer valid. Fixes #10393
1 parent eb2ff2d commit 7c6ad41

5 files changed

Lines changed: 391 additions & 50 deletions

File tree

nexus/db-queries/src/db/datastore/sled.rs

Lines changed: 262 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,11 @@ use crate::db::pagination::paginated;
2525
use crate::db::queries::disk::MAX_DISKS_PER_INSTANCE;
2626
use crate::db::queries::sled_reservation::LocalStorageAllocation;
2727
use crate::db::queries::sled_reservation::LocalStorageAllocationRequired;
28+
use crate::db::queries::sled_reservation::SLED_INSERT_QUERY_SENTINELS;
29+
use crate::db::queries::sled_reservation::sentinel_to_reason;
2830
use crate::db::queries::sled_reservation::sled_find_targets_query;
2931
use crate::db::queries::sled_reservation::sled_insert_resource_query;
32+
use crate::db::true_or_cast_error::matches_sentinel;
3033
use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus};
3134
use async_bb8_diesel::AsyncRunQueryDsl;
3235
use chrono::Utc;
@@ -1151,7 +1154,8 @@ impl DataStore {
11511154
//
11521155
// In the uncontended case, however, we'll only iterate through this
11531156
// loop once.
1154-
loop {
1157+
1158+
'sled_reservation: loop {
11551159
// Pick a reservation target, given the constraints we previously
11561160
// saw in the database.
11571161
let sled_target = pick_sled_reservation_target(
@@ -1187,18 +1191,45 @@ impl DataStore {
11871191
// Try to INSERT the record. If this is still a valid target,
11881192
// we'll use it. If it isn't a valid target, we'll shrink the
11891193
// set of viable sled targets and try again.
1190-
let rows_inserted = sled_insert_resource_query(
1194+
1195+
match sled_insert_resource_query(
11911196
&resource,
11921197
&LocalStorageAllocationRequired::No,
11931198
)
11941199
.execute_async(&*conn)
1195-
.await?;
1200+
.await
1201+
{
1202+
Ok(rows_inserted) => {
1203+
if rows_inserted > 0 {
1204+
info!(&log, "reservation succeeded!");
1205+
return Ok(resource);
1206+
}
1207+
}
11961208

1197-
if rows_inserted > 0 {
1198-
info!(&log, "reservation succeeded!");
1199-
return Ok(resource);
1200-
}
1201-
info!(&log, "reservation failed");
1209+
Err(e) => {
1210+
if let Some(sentinel) =
1211+
matches_sentinel(&e, &SLED_INSERT_QUERY_SENTINELS)
1212+
{
1213+
// The only part of the `insert_valid` section of
1214+
// the insertion query that can fail are the same
1215+
// places where these sentinels are cast and thrown
1216+
// as errors, as this branch does not have any
1217+
// requested local storage allocations. Ignore these
1218+
// and proceed to the next sled_target.
1219+
let reason = sentinel_to_reason(sentinel);
1220+
info!(
1221+
&log,
1222+
"reservation failed: {reason}";
1223+
"sentinel" => sentinel,
1224+
);
1225+
} else {
1226+
// The query failed, return this as an error
1227+
return Err(
1228+
SledReservationTransactionError::Diesel(e),
1229+
);
1230+
}
1231+
}
1232+
};
12021233
} else {
12031234
// If local storage allocation is required, match the requests
12041235
// with all the zpools of this sled that have available space.
@@ -1218,7 +1249,7 @@ impl DataStore {
12181249
unpreferred.remove(&sled_target);
12191250
preferred.remove(&sled_target);
12201251

1221-
continue;
1252+
continue 'sled_reservation;
12221253
};
12231254

12241255
let mut complete_allocation_lists =
@@ -1242,7 +1273,7 @@ impl DataStore {
12421273
unpreferred.remove(&sled_target);
12431274
preferred.remove(&sled_target);
12441275

1245-
continue;
1276+
continue 'sled_reservation;
12461277
}
12471278
};
12481279

@@ -1251,18 +1282,18 @@ impl DataStore {
12511282
// that particular set.
12521283
//
12531284
// If the `complate_allocation_lists` iterator returns None,
1254-
// control will pass to the end of the loop marked with 'outer,
1285+
// control will exit the `local_storage_allocation_search` loop,
12551286
// which will then try the next possible sled target. In the
12561287
// case where there were pre-existing local storage allocations
12571288
// there will _not_ be any more sleds to try and the user will
12581289
// see a capacity error.
12591290

1260-
loop {
1291+
'local_storage_allocation_search: loop {
12611292
// If other concurrent sled reservations are not taken into
12621293
// account, another sled reservation could allocate local
12631294
// storage onto the same pools that this one considers
1264-
// candidates, and then this iterator will return an
1265-
// allocation that will never work. Because the iterator
1295+
// candidates, and then this iterator will return
1296+
// allocations that will never work. Because the iterator
12661297
// searches for _every_ possible combination, this will end
12671298
// up searching for a long time. It's important to prune the
12681299
// list that we're searching from: using the _current_
@@ -1281,7 +1312,7 @@ impl DataStore {
12811312
else {
12821313
// All done searching, nothing worked. Try another
12831314
// sled!
1284-
break;
1315+
break 'local_storage_allocation_search;
12851316
};
12861317

12871318
info!(
@@ -1296,19 +1327,68 @@ impl DataStore {
12961327
// local storage allocations still fit, we'll use it. If it
12971328
// isn't a valid target, we'll shrink the set of viable sled
12981329
// targets and try again.
1299-
let rows_inserted = sled_insert_resource_query(
1330+
match sled_insert_resource_query(
13001331
&resource,
13011332
&LocalStorageAllocationRequired::Yes { allocations },
13021333
)
13031334
.execute_async(&*conn)
1304-
.await?;
1335+
.await
1336+
{
1337+
Ok(rows_inserted) => {
1338+
if rows_inserted > 0 {
1339+
info!(&log, "reservation succeeded!");
1340+
return Ok(resource);
1341+
}
1342+
}
13051343

1306-
if rows_inserted > 0 {
1307-
info!(&log, "reservation succeeded!");
1308-
return Ok(resource);
1344+
Err(e) => {
1345+
if let Some(sentinel) = matches_sentinel(
1346+
&e,
1347+
&SLED_INSERT_QUERY_SENTINELS,
1348+
) {
1349+
// Concurrent sled reservations could have
1350+
// allocated enough hardware threads, RSS RAM,
1351+
// and/or reservoir RAM that means this
1352+
// sled_target is no longer valid.
1353+
//
1354+
// Alternatively, a concurrent reservation could
1355+
// have allocated another instance to this sled
1356+
// target which makes it invalid for this
1357+
// instance we are trying to allocate due to
1358+
// affinity / anti-affinity constraints.
1359+
//
1360+
// Both of these cases are not a problem when
1361+
// _not_ performing local storage allocations:
1362+
// in that branch, when the insert query returns
1363+
// that it inserted 0 rows or returned an error
1364+
// sentinel, another sled_target will be chosen
1365+
// right away.
1366+
//
1367+
// If a sentinel is returned, the insert query
1368+
// isn't succeeding for reasons other than the
1369+
// available space for local storage. If we
1370+
// don't bail out here, the loop will keep
1371+
// searching through permutations of local
1372+
// storage allocations to try. Bail out of the
1373+
// search right away and pick another
1374+
// sled_target.
1375+
1376+
let reason = sentinel_to_reason(sentinel);
1377+
info!(
1378+
&log,
1379+
"reservation failed: {reason}";
1380+
"sentinel" => sentinel,
1381+
);
1382+
1383+
break 'local_storage_allocation_search;
1384+
} else {
1385+
// The query failed, return this as an error
1386+
return Err(
1387+
SledReservationTransactionError::Diesel(e),
1388+
);
1389+
}
1390+
}
13091391
}
1310-
1311-
info!(&log, "reservation failed");
13121392
}
13131393
}
13141394

@@ -2307,16 +2387,27 @@ pub(in crate::db::datastore) mod test {
23072387
self.resources.clone(),
23082388
);
23092389

2310-
sled_insert_resource_query(
2390+
let conn = datastore.pool_connection_for_tests().await.unwrap();
2391+
2392+
match sled_insert_resource_query(
23112393
&resource,
23122394
&LocalStorageAllocationRequired::No,
23132395
)
2314-
.execute_async(
2315-
&*datastore.pool_connection_for_tests().await.unwrap(),
2316-
)
2396+
.execute_async(&*conn)
23172397
.await
2318-
.unwrap()
2319-
> 0
2398+
{
2399+
Ok(rows_inserted) => rows_inserted > 0,
2400+
2401+
Err(e) => {
2402+
if matches_sentinel(&e, &SLED_INSERT_QUERY_SENTINELS)
2403+
.is_some()
2404+
{
2405+
false
2406+
} else {
2407+
panic!("{e}")
2408+
}
2409+
}
2410+
}
23202411
}
23212412

23222413
fn use_many_resources(mut self) -> Self {
@@ -5837,6 +5928,149 @@ pub(in crate::db::datastore) mod test {
58375928
logctx.cleanup_successful();
58385929
}
58395930

5931+
/// Ensure that four sleds can have _many_ VMMs (enough to fully use all
5932+
/// cpus) allocate local storage, where the sled reservations happen
5933+
/// concurrently in chunks.
5934+
#[tokio::test]
5935+
async fn local_storage_allocation_four_sleds_concurrent_small() {
5936+
let logctx = dev::test_setup_log(
5937+
"local_storage_allocation_four_sleds_concurrent_small",
5938+
);
5939+
let db = TestDatabase::new_with_datastore(&logctx.log).await;
5940+
let (opctx, datastore) = (db.opctx(), db.datastore());
5941+
5942+
let mut config = LocalStorageTest {
5943+
sleds: vec![],
5944+
affinity_groups: vec![],
5945+
anti_affinity_groups: vec![],
5946+
instances: vec![],
5947+
};
5948+
5949+
const MAX_U2_PER_INSTANCE: usize = 10;
5950+
5951+
// Only define a few sleds, otherwise this test takes a considerable
5952+
// amount of time adding records to cockroach.
5953+
5954+
const NUM_SLEDS: usize = 4;
5955+
5956+
for i in 0..NUM_SLEDS {
5957+
let mut u2s = vec![];
5958+
5959+
for n in 0..MAX_U2_PER_INSTANCE {
5960+
u2s.push(LocalStorageTestSledU2 {
5961+
physical_disk_id: PhysicalDiskUuid::new_v4(),
5962+
physical_disk_serial: format!("phys_{i}_{n}"),
5963+
5964+
zpool_id: ZpoolUuid::new_v4(),
5965+
control_plane_storage_buffer:
5966+
external::ByteCount::from_gibibytes_u32(250),
5967+
5968+
inventory_total_size:
5969+
external::ByteCount::from_gibibytes_u32(1024),
5970+
5971+
crucible_dataset_id: DatasetUuid::new_v4(),
5972+
crucible_dataset_addr: format!(
5973+
"[fd00:1122:3344:{i}{n:02x}::1]:12345"
5974+
)
5975+
.parse()
5976+
.unwrap(),
5977+
5978+
local_storage_unencrypted_dataset_id: DatasetUuid::new_v4(),
5979+
});
5980+
}
5981+
5982+
config.sleds.push(LocalStorageTestSled {
5983+
sled_id: SledUuid::new_v4(),
5984+
sled_serial: format!("sled_{i}"),
5985+
u2s,
5986+
});
5987+
}
5988+
5989+
// Each instance requires 2 cpus. At 128 hardware threads per sled we
5990+
// need 64 instances to fully consume all available hardware threads.
5991+
5992+
for i in 0..(64 * NUM_SLEDS) {
5993+
let mut disks = vec![];
5994+
5995+
for n in 0..MAX_U2_PER_INSTANCE {
5996+
disks.push(LocalStorageTestInstanceDisk {
5997+
id: Uuid::new_v4(),
5998+
name: external::Name::try_from(format!("local-{i}-{n}"))
5999+
.unwrap(),
6000+
6001+
// 64 instances per sled, total provisionable size is 1024 -
6002+
// 250 = 774, so roughly 12 G disks, minus overhead of about
6003+
// 780M = drop to 10G
6004+
size: external::ByteCount::from_gibibytes_u32(10),
6005+
});
6006+
}
6007+
6008+
config.instances.push(LocalStorageTestInstance {
6009+
id: InstanceUuid::new_v4(),
6010+
name: format!("inst{i}"),
6011+
affinity: None,
6012+
disks,
6013+
});
6014+
}
6015+
6016+
setup_local_storage_allocation_test(&opctx, datastore, &config).await;
6017+
6018+
let mut vmms = vec![];
6019+
6020+
// Reserve the instances concurrently in chunks - 8 at a time, because
6021+
// that's the maximum concurrent claims that qorb currently supports.
6022+
for chunk in config.instances.chunks(8) {
6023+
let mut jhs = vec![];
6024+
6025+
for (i, config_instance) in chunk.iter().enumerate() {
6026+
let instance = Instance::new_with_id(config_instance.id);
6027+
6028+
let datastore = datastore.clone();
6029+
6030+
let opctx = opctx.child(BTreeMap::from([(
6031+
String::from("task"),
6032+
format!("{i}"),
6033+
)]));
6034+
6035+
let jh = tokio::spawn(async move {
6036+
datastore
6037+
.sled_reservation_create_inner(
6038+
&opctx,
6039+
instance.id,
6040+
PropolisUuid::new_v4(),
6041+
db::model::Resources::new(
6042+
2,
6043+
ByteCount::try_from(1024).unwrap(),
6044+
ByteCount::try_from(1024).unwrap(),
6045+
),
6046+
db::model::SledReservationConstraints::none(),
6047+
)
6048+
.await
6049+
.unwrap()
6050+
});
6051+
6052+
jhs.push(jh);
6053+
}
6054+
6055+
for jh in jhs {
6056+
let vmm = jh.await.unwrap();
6057+
vmms.push(vmm);
6058+
}
6059+
}
6060+
6061+
assert_eq!(vmms.len(), (64 * NUM_SLEDS));
6062+
6063+
let sleds: HashSet<_> =
6064+
vmms.into_iter().map(|vmm| vmm.sled_id).collect();
6065+
6066+
assert_eq!(sleds.len(), NUM_SLEDS);
6067+
6068+
validate_local_storage_allocations(&datastore).await;
6069+
6070+
db.terminate().await;
6071+
logctx.cleanup_successful();
6072+
}
6073+
58406074
/// Ensure that an affinity grouping will cause instances with local storage
58416075
/// to fail allocation even if there's space left.
58426076
#[tokio::test]

0 commit comments

Comments
 (0)