@@ -25,8 +25,11 @@ use crate::db::pagination::paginated;
2525use crate :: db:: queries:: disk:: MAX_DISKS_PER_INSTANCE ;
2626use crate :: db:: queries:: sled_reservation:: LocalStorageAllocation ;
2727use crate :: db:: queries:: sled_reservation:: LocalStorageAllocationRequired ;
28+ use crate :: db:: queries:: sled_reservation:: SLED_INSERT_QUERY_SENTINELS ;
29+ use crate :: db:: queries:: sled_reservation:: sentinel_to_reason;
2830use crate :: db:: queries:: sled_reservation:: sled_find_targets_query;
2931use crate :: db:: queries:: sled_reservation:: sled_insert_resource_query;
32+ use crate :: db:: true_or_cast_error:: matches_sentinel;
3033use crate :: db:: update_and_check:: { UpdateAndCheck , UpdateStatus } ;
3134use async_bb8_diesel:: AsyncRunQueryDsl ;
3235use chrono:: Utc ;
@@ -1151,7 +1154,8 @@ impl DataStore {
11511154 //
11521155 // In the uncontended case, however, we'll only iterate through this
11531156 // loop once.
1154- loop {
1157+
1158+ ' sled_reservation: loop {
11551159 // Pick a reservation target, given the constraints we previously
11561160 // saw in the database.
11571161 let sled_target = pick_sled_reservation_target (
@@ -1187,18 +1191,45 @@ impl DataStore {
11871191 // Try to INSERT the record. If this is still a valid target,
11881192 // we'll use it. If it isn't a valid target, we'll shrink the
11891193 // set of viable sled targets and try again.
1190- let rows_inserted = sled_insert_resource_query (
1194+
1195+ match sled_insert_resource_query (
11911196 & resource,
11921197 & LocalStorageAllocationRequired :: No ,
11931198 )
11941199 . execute_async ( & * conn)
1195- . await ?;
1200+ . await
1201+ {
1202+ Ok ( rows_inserted) => {
1203+ if rows_inserted > 0 {
1204+ info ! ( & log, "reservation succeeded!" ) ;
1205+ return Ok ( resource) ;
1206+ }
1207+ }
11961208
1197- if rows_inserted > 0 {
1198- info ! ( & log, "reservation succeeded!" ) ;
1199- return Ok ( resource) ;
1200- }
1201- info ! ( & log, "reservation failed" ) ;
1209+ Err ( e) => {
1210+ if let Some ( sentinel) =
1211+ matches_sentinel ( & e, & SLED_INSERT_QUERY_SENTINELS )
1212+ {
1213+ // The only part of the `insert_valid` section of
1214+ // the insertion query that can fail are the same
1215+ // places where these sentinels are cast and thrown
1216+ // as errors, as this branch does not have any
1217+ // requested local storage allocations. Ignore these
1218+ // and proceed to the next sled_target.
1219+ let reason = sentinel_to_reason ( sentinel) ;
1220+ info ! (
1221+ & log,
1222+ "reservation failed: {reason}" ;
1223+ "sentinel" => sentinel,
1224+ ) ;
1225+ } else {
1226+ // The query failed, return this as an error
1227+ return Err (
1228+ SledReservationTransactionError :: Diesel ( e) ,
1229+ ) ;
1230+ }
1231+ }
1232+ } ;
12021233 } else {
12031234 // If local storage allocation is required, match the requests
12041235 // with all the zpools of this sled that have available space.
@@ -1218,7 +1249,7 @@ impl DataStore {
12181249 unpreferred. remove ( & sled_target) ;
12191250 preferred. remove ( & sled_target) ;
12201251
1221- continue ;
1252+ continue ' sled_reservation ;
12221253 } ;
12231254
12241255 let mut complete_allocation_lists =
@@ -1242,7 +1273,7 @@ impl DataStore {
12421273 unpreferred. remove ( & sled_target) ;
12431274 preferred. remove ( & sled_target) ;
12441275
1245- continue ;
1276+ continue ' sled_reservation ;
12461277 }
12471278 } ;
12481279
@@ -1251,18 +1282,18 @@ impl DataStore {
12511282 // that particular set.
12521283 //
12531284 // If the `complate_allocation_lists` iterator returns None,
1254- // control will pass to the end of the loop marked with 'outer ,
1285+ // control will exit the `local_storage_allocation_search` loop,
12551286 // which will then try the next possible sled target. In the
12561287 // case where there were pre-existing local storage allocations
12571288 // there will _not_ be any more sleds to try and the user will
12581289 // see a capacity error.
12591290
1260- loop {
1291+ ' local_storage_allocation_search : loop {
12611292 // If other concurrent sled reservations are not taken into
12621293 // account, another sled reservation could allocate local
12631294 // storage onto the same pools that this one considers
1264- // candidates, and then this iterator will return an
1265- // allocation that will never work. Because the iterator
1295+ // candidates, and then this iterator will return
1296+ // allocations that will never work. Because the iterator
12661297 // searches for _every_ possible combination, this will end
12671298 // up searching for a long time. It's important to prune the
12681299 // list that we're searching from: using the _current_
@@ -1281,7 +1312,7 @@ impl DataStore {
12811312 else {
12821313 // All done searching, nothing worked. Try another
12831314 // sled!
1284- break ;
1315+ break ' local_storage_allocation_search ;
12851316 } ;
12861317
12871318 info ! (
@@ -1296,19 +1327,68 @@ impl DataStore {
12961327 // local storage allocations still fit, we'll use it. If it
12971328 // isn't a valid target, we'll shrink the set of viable sled
12981329 // targets and try again.
1299- let rows_inserted = sled_insert_resource_query (
1330+ match sled_insert_resource_query (
13001331 & resource,
13011332 & LocalStorageAllocationRequired :: Yes { allocations } ,
13021333 )
13031334 . execute_async ( & * conn)
1304- . await ?;
1335+ . await
1336+ {
1337+ Ok ( rows_inserted) => {
1338+ if rows_inserted > 0 {
1339+ info ! ( & log, "reservation succeeded!" ) ;
1340+ return Ok ( resource) ;
1341+ }
1342+ }
13051343
1306- if rows_inserted > 0 {
1307- info ! ( & log, "reservation succeeded!" ) ;
1308- return Ok ( resource) ;
1344+ Err ( e) => {
1345+ if let Some ( sentinel) = matches_sentinel (
1346+ & e,
1347+ & SLED_INSERT_QUERY_SENTINELS ,
1348+ ) {
1349+ // Concurrent sled reservations could have
1350+ // allocated enough hardware threads, RSS RAM,
1351+ // and/or reservoir RAM that means this
1352+ // sled_target is no longer valid.
1353+ //
1354+ // Alternatively, a concurrent reservation could
1355+ // have allocated another instance to this sled
1356+ // target which makes it invalid for this
1357+ // instance we are trying to allocate due to
1358+ // affinity / anti-affinity constraints.
1359+ //
1360+ // Both of these cases are not a problem when
1361+ // _not_ performing local storage allocations:
1362+ // in that branch, when the insert query returns
1363+ // that it inserted 0 rows or returned an error
1364+ // sentinel, another sled_target will be chosen
1365+ // right away.
1366+ //
1367+ // If a sentinel is returned, the insert query
1368+ // isn't succeeding for reasons other than the
1369+ // available space for local storage. If we
1370+ // don't bail out here, the loop will keep
1371+ // searching through permutations of local
1372+ // storage allocations to try. Bail out of the
1373+ // search right away and pick another
1374+ // sled_target.
1375+
1376+ let reason = sentinel_to_reason ( sentinel) ;
1377+ info ! (
1378+ & log,
1379+ "reservation failed: {reason}" ;
1380+ "sentinel" => sentinel,
1381+ ) ;
1382+
1383+ break ' local_storage_allocation_search;
1384+ } else {
1385+ // The query failed, return this as an error
1386+ return Err (
1387+ SledReservationTransactionError :: Diesel ( e) ,
1388+ ) ;
1389+ }
1390+ }
13091391 }
1310-
1311- info ! ( & log, "reservation failed" ) ;
13121392 }
13131393 }
13141394
@@ -2307,16 +2387,27 @@ pub(in crate::db::datastore) mod test {
23072387 self . resources . clone ( ) ,
23082388 ) ;
23092389
2310- sled_insert_resource_query (
2390+ let conn = datastore. pool_connection_for_tests ( ) . await . unwrap ( ) ;
2391+
2392+ match sled_insert_resource_query (
23112393 & resource,
23122394 & LocalStorageAllocationRequired :: No ,
23132395 )
2314- . execute_async (
2315- & * datastore. pool_connection_for_tests ( ) . await . unwrap ( ) ,
2316- )
2396+ . execute_async ( & * conn)
23172397 . await
2318- . unwrap ( )
2319- > 0
2398+ {
2399+ Ok ( rows_inserted) => rows_inserted > 0 ,
2400+
2401+ Err ( e) => {
2402+ if matches_sentinel ( & e, & SLED_INSERT_QUERY_SENTINELS )
2403+ . is_some ( )
2404+ {
2405+ false
2406+ } else {
2407+ panic ! ( "{e}" )
2408+ }
2409+ }
2410+ }
23202411 }
23212412
23222413 fn use_many_resources ( mut self ) -> Self {
@@ -5837,6 +5928,149 @@ pub(in crate::db::datastore) mod test {
58375928 logctx. cleanup_successful ( ) ;
58385929 }
58395930
5931+ /// Ensure that four sleds can have _many_ VMMs (enough to fully use all
5932+ /// cpus) allocate local storage, where the sled reservations happen
5933+ /// concurrently in chunks.
5934+ #[ tokio:: test]
5935+ async fn local_storage_allocation_four_sleds_concurrent_small ( ) {
5936+ let logctx = dev:: test_setup_log (
5937+ "local_storage_allocation_four_sleds_concurrent_small" ,
5938+ ) ;
5939+ let db = TestDatabase :: new_with_datastore ( & logctx. log ) . await ;
5940+ let ( opctx, datastore) = ( db. opctx ( ) , db. datastore ( ) ) ;
5941+
5942+ let mut config = LocalStorageTest {
5943+ sleds : vec ! [ ] ,
5944+ affinity_groups : vec ! [ ] ,
5945+ anti_affinity_groups : vec ! [ ] ,
5946+ instances : vec ! [ ] ,
5947+ } ;
5948+
5949+ const MAX_U2_PER_INSTANCE : usize = 10 ;
5950+
5951+ // Only define a few sleds, otherwise this test takes a considerable
5952+ // amount of time adding records to cockroach.
5953+
5954+ const NUM_SLEDS : usize = 4 ;
5955+
5956+ for i in 0 ..NUM_SLEDS {
5957+ let mut u2s = vec ! [ ] ;
5958+
5959+ for n in 0 ..MAX_U2_PER_INSTANCE {
5960+ u2s. push ( LocalStorageTestSledU2 {
5961+ physical_disk_id : PhysicalDiskUuid :: new_v4 ( ) ,
5962+ physical_disk_serial : format ! ( "phys_{i}_{n}" ) ,
5963+
5964+ zpool_id : ZpoolUuid :: new_v4 ( ) ,
5965+ control_plane_storage_buffer :
5966+ external:: ByteCount :: from_gibibytes_u32 ( 250 ) ,
5967+
5968+ inventory_total_size :
5969+ external:: ByteCount :: from_gibibytes_u32 ( 1024 ) ,
5970+
5971+ crucible_dataset_id : DatasetUuid :: new_v4 ( ) ,
5972+ crucible_dataset_addr : format ! (
5973+ "[fd00:1122:3344:{i}{n:02x}::1]:12345"
5974+ )
5975+ . parse ( )
5976+ . unwrap ( ) ,
5977+
5978+ local_storage_unencrypted_dataset_id : DatasetUuid :: new_v4 ( ) ,
5979+ } ) ;
5980+ }
5981+
5982+ config. sleds . push ( LocalStorageTestSled {
5983+ sled_id : SledUuid :: new_v4 ( ) ,
5984+ sled_serial : format ! ( "sled_{i}" ) ,
5985+ u2s,
5986+ } ) ;
5987+ }
5988+
5989+ // Each instance requires 2 cpus. At 128 hardware threads per sled we
5990+ // need 64 instances to fully consume all available hardware threads.
5991+
5992+ for i in 0 ..( 64 * NUM_SLEDS ) {
5993+ let mut disks = vec ! [ ] ;
5994+
5995+ for n in 0 ..MAX_U2_PER_INSTANCE {
5996+ disks. push ( LocalStorageTestInstanceDisk {
5997+ id : Uuid :: new_v4 ( ) ,
5998+ name : external:: Name :: try_from ( format ! ( "local-{i}-{n}" ) )
5999+ . unwrap ( ) ,
6000+
6001+ // 64 instances per sled, total provisionable size is 1024 -
6002+ // 250 = 774, so roughly 12 G disks, minus overhead of about
6003+ // 780M = drop to 10G
6004+ size : external:: ByteCount :: from_gibibytes_u32 ( 10 ) ,
6005+ } ) ;
6006+ }
6007+
6008+ config. instances . push ( LocalStorageTestInstance {
6009+ id : InstanceUuid :: new_v4 ( ) ,
6010+ name : format ! ( "inst{i}" ) ,
6011+ affinity : None ,
6012+ disks,
6013+ } ) ;
6014+ }
6015+
6016+ setup_local_storage_allocation_test ( & opctx, datastore, & config) . await ;
6017+
6018+ let mut vmms = vec ! [ ] ;
6019+
6020+ // Reserve the instances concurrently in chunks - 8 at a time, because
6021+ // that's the maximum concurrent claims that qorb currently supports.
6022+ for chunk in config. instances . chunks ( 8 ) {
6023+ let mut jhs = vec ! [ ] ;
6024+
6025+ for ( i, config_instance) in chunk. iter ( ) . enumerate ( ) {
6026+ let instance = Instance :: new_with_id ( config_instance. id ) ;
6027+
6028+ let datastore = datastore. clone ( ) ;
6029+
6030+ let opctx = opctx. child ( BTreeMap :: from ( [ (
6031+ String :: from ( "task" ) ,
6032+ format ! ( "{i}" ) ,
6033+ ) ] ) ) ;
6034+
6035+ let jh = tokio:: spawn ( async move {
6036+ datastore
6037+ . sled_reservation_create_inner (
6038+ & opctx,
6039+ instance. id ,
6040+ PropolisUuid :: new_v4 ( ) ,
6041+ db:: model:: Resources :: new (
6042+ 2 ,
6043+ ByteCount :: try_from ( 1024 ) . unwrap ( ) ,
6044+ ByteCount :: try_from ( 1024 ) . unwrap ( ) ,
6045+ ) ,
6046+ db:: model:: SledReservationConstraints :: none ( ) ,
6047+ )
6048+ . await
6049+ . unwrap ( )
6050+ } ) ;
6051+
6052+ jhs. push ( jh) ;
6053+ }
6054+
6055+ for jh in jhs {
6056+ let vmm = jh. await . unwrap ( ) ;
6057+ vmms. push ( vmm) ;
6058+ }
6059+ }
6060+
6061+ assert_eq ! ( vmms. len( ) , ( 64 * NUM_SLEDS ) ) ;
6062+
6063+ let sleds: HashSet < _ > =
6064+ vmms. into_iter ( ) . map ( |vmm| vmm. sled_id ) . collect ( ) ;
6065+
6066+ assert_eq ! ( sleds. len( ) , NUM_SLEDS ) ;
6067+
6068+ validate_local_storage_allocations ( & datastore) . await ;
6069+
6070+ db. terminate ( ) . await ;
6071+ logctx. cleanup_successful ( ) ;
6072+ }
6073+
58406074 /// Ensure that an affinity grouping will cause instances with local storage
58416075 /// to fail allocation even if there's space left.
58426076 #[ tokio:: test]
0 commit comments