Skip to content

Commit 8061ac9

Browse files
authored
[3/5] User data export: fix region snapshot replacement step saga (#8557)
For user data export, read-only volume copies are attached to a Pantry so that Nexus can read from them. The region snapshot replacement machinery needs to be taught about this new location of read-only volumes: this commit represents that fix. This PR is intentionally before the others in the set that actually create the user data export objects so that this saga is fixed before that :)
1 parent 55ec7e4 commit 8061ac9

1 file changed

Lines changed: 198 additions & 52 deletions

File tree

nexus/src/app/sagas/region_snapshot_replacement_step.rs

Lines changed: 198 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ use super::{
4747
ACTION_GENERATE_ID, ActionRegistry, NexusActionContext, NexusSaga,
4848
SagaInitError,
4949
};
50+
use crate::app::db::datastore;
5051
use crate::app::db::datastore::ExistingTarget;
5152
use crate::app::db::datastore::ReplacementTarget;
5253
use crate::app::db::datastore::VolumeReplaceResult;
@@ -55,12 +56,13 @@ use crate::app::db::datastore::VolumeWithTarget;
5556
use crate::app::sagas::declare_saga_actions;
5657
use crate::app::{authn, authz, db};
5758
use nexus_db_lookup::LookupPath;
59+
use nexus_db_model::UserDataExport;
5860
use nexus_db_model::VmmState;
61+
use nexus_types::identity::Asset;
5962
use nexus_types::saga::saga_action_failed;
6063
use omicron_common::api::external::Error;
6164
use omicron_uuid_kinds::GenericUuid;
6265
use omicron_uuid_kinds::VolumeUuid;
63-
use propolis_client::types::ReplaceResult;
6466
use serde::Deserialize;
6567
use serde::Serialize;
6668
use sled_agent_client::CrucibleOpts;
@@ -395,63 +397,35 @@ async fn rsrss_replace_snapshot_in_volume_undo(
395397
Ok(())
396398
}
397399

398-
async fn rsrss_notify_upstairs(
400+
async fn notify_potential_propolis_upstairs(
399401
sagactx: NexusActionContext,
402+
disk: datastore::CrucibleDisk,
400403
) -> Result<(), ActionError> {
401404
let osagactx = sagactx.user_data();
405+
let datastore = osagactx.datastore();
402406
let params = sagactx.saga_params::<Params>()?;
403407
let log = sagactx.user_data().log();
404408

405-
// If the associated volume was deleted, then skip this notification step as
406-
// there is no Upstairs to talk to. Continue with the saga to transition the
407-
// step request to Complete, and then perform the associated clean up.
408-
409-
let volume_replace_snapshot_result = sagactx
410-
.lookup::<VolumeReplaceResult>("volume_replace_snapshot_result")?;
411-
if matches!(
412-
volume_replace_snapshot_result,
413-
VolumeReplaceResult::ExistingVolumeSoftDeleted
414-
| VolumeReplaceResult::ExistingVolumeHardDeleted
415-
) {
416-
return Ok(());
417-
}
418-
419-
// Make an effort to notify a Propolis if one was booted for this volume.
420-
// This is best effort: if there is a failure, this saga will unwind and be
421-
// triggered again for the same request. If there is no Propolis booted for
422-
// this volume, then there's nothing to be done: any future Propolis will
423-
// receive the updated Volume.
424-
//
425-
// Unlike for region replacement, there's no step required here if there
426-
// isn't an active Propolis: any Upstairs created after the snapshot_addr
427-
// is replaced will reference the cloned data.
409+
let opctx = crate::context::op_context_for_saga_action(
410+
&sagactx,
411+
&params.serialized_authn,
412+
);
428413

429-
let Some(disk) = osagactx
430-
.datastore()
431-
.disk_for_volume_id(params.request.volume_id())
432-
.await
433-
.map_err(saga_action_failed)?
434-
else {
435-
return Ok(());
436-
};
414+
// Bail out if this disk is not attached to an instance
437415

438416
let Some(instance_id) = disk.runtime().attach_instance_id else {
439417
return Ok(());
440418
};
441419

442-
let opctx = crate::context::op_context_for_saga_action(
443-
&sagactx,
444-
&params.serialized_authn,
445-
);
420+
// Bail if there is no active VMM
446421

447-
let (.., authz_instance) = LookupPath::new(&opctx, osagactx.datastore())
422+
let (.., authz_instance) = LookupPath::new(&opctx, datastore)
448423
.instance_id(instance_id)
449424
.lookup_for(authz::Action::Read)
450425
.await
451426
.map_err(saga_action_failed)?;
452427

453-
let instance_and_vmm = osagactx
454-
.datastore()
428+
let instance_and_vmm = datastore
455429
.instance_fetch_with_vmm(&opctx, &authz_instance)
456430
.await
457431
.map_err(saga_action_failed)?;
@@ -464,15 +438,17 @@ async fn rsrss_notify_upstairs(
464438

465439
info!(
466440
log,
467-
"volume associated with disk attached to instance with vmm in \
468-
state {state}";
441+
"volume associated with disk attached to instance with vmm in state \
442+
{state}";
469443
"request id" => %params.request.id,
470444
"volume id" => %params.request.volume_id(),
471445
"disk id" => ?disk.id(),
472446
"instance id" => ?instance_id,
473447
"vmm id" => ?vmm.id,
474448
);
475449

450+
// Bail if the VMM is not in a state to receive requests
451+
476452
match &state {
477453
VmmState::Running | VmmState::Rebooting => {
478454
// Propolis server is ok to receive the volume replacement request.
@@ -495,8 +471,9 @@ async fn rsrss_notify_upstairs(
495471
}
496472
}
497473

498-
let new_volume_vcr = match osagactx
499-
.datastore()
474+
// Send the new VCR via a replacement request
475+
476+
let new_volume_vcr = match datastore
500477
.volume_get(params.request.volume_id())
501478
.await
502479
.map_err(saga_action_failed)?
@@ -511,7 +488,7 @@ async fn rsrss_notify_upstairs(
511488
};
512489

513490
let instance_lookup =
514-
LookupPath::new(&opctx, osagactx.datastore()).instance_id(instance_id);
491+
LookupPath::new(&opctx, datastore).instance_id(instance_id);
515492

516493
let (vmm, client) = osagactx
517494
.nexus()
@@ -568,27 +545,27 @@ async fn rsrss_notify_upstairs(
568545
);
569546

570547
match &replace_result {
571-
ReplaceResult::Started => {
548+
propolis_client::types::ReplaceResult::Started => {
572549
// This saga's call just started the replacement
573550
}
574551

575-
ReplaceResult::StartedAlready => {
552+
propolis_client::types::ReplaceResult::StartedAlready => {
576553
// A previous run of this saga (or saga node) started the
577554
// replacement
578555
}
579556

580-
ReplaceResult::CompletedAlready => {
557+
propolis_client::types::ReplaceResult::CompletedAlready => {
581558
// It's done! We see this if the same propolis that received the
582559
// original replace request started and finished the replacement.
583560
}
584561

585-
ReplaceResult::VcrMatches => {
562+
propolis_client::types::ReplaceResult::VcrMatches => {
586563
// This propolis booted with the updated VCR
587564
}
588565

589-
ReplaceResult::Missing => {
590-
// The volume does not contain the region to be replaced. This is an
591-
// error!
566+
propolis_client::types::ReplaceResult::Missing => {
567+
// The volume does not contain the read-only target to be replaced.
568+
// This is an error!
592569
return Err(saga_action_failed(Error::internal_error(
593570
"saw ReplaceResult::Missing",
594571
)));
@@ -598,6 +575,175 @@ async fn rsrss_notify_upstairs(
598575
Ok(())
599576
}
600577

578+
async fn notify_pantry_upstairs(
579+
sagactx: NexusActionContext,
580+
pantry_address: SocketAddrV6,
581+
attachment_id: Uuid,
582+
) -> Result<(), ActionError> {
583+
let osagactx = sagactx.user_data();
584+
let datastore = osagactx.datastore();
585+
let params = sagactx.saga_params::<Params>()?;
586+
let log = sagactx.user_data().log();
587+
588+
info!(
589+
log,
590+
"volume attached to pantry {pantry_address} with id {attachment_id}";
591+
"request id" => %params.request.id,
592+
"volume id" => %params.request.volume_id(),
593+
);
594+
595+
// Grab the new volume's VCR
596+
597+
let volume_construction_request = match datastore
598+
.volume_get(params.request.volume_id())
599+
.await
600+
.map_err(saga_action_failed)?
601+
{
602+
Some(volume) => serde_json::from_str(&volume.data()).map_err(|e| {
603+
saga_action_failed(Error::internal_error(&format!(
604+
"failed to deserialize volume {} data: {e}",
605+
volume.id()
606+
)))
607+
})?,
608+
609+
None => {
610+
return Err(saga_action_failed(Error::internal_error(
611+
"new volume is gone!",
612+
)));
613+
}
614+
};
615+
616+
let endpoint = format!("http://{}", pantry_address);
617+
let client = crucible_pantry_client::Client::new(&endpoint);
618+
619+
let replace_request = crucible_pantry_client::types::ReplaceRequest {
620+
volume_construction_request,
621+
};
622+
623+
let replace_result = client
624+
.replace(&attachment_id.to_string(), &replace_request)
625+
.await
626+
.map_err(|e| {
627+
saga_action_failed(Error::internal_error(&e.to_string()))
628+
})?;
629+
630+
match replace_result.into_inner() {
631+
crucible_pantry_client::types::ReplaceResult::Started => {
632+
// This saga's call just started the replacement
633+
}
634+
635+
crucible_pantry_client::types::ReplaceResult::StartedAlready => {
636+
// A previous run of this saga (or saga node) started the
637+
// replacement
638+
}
639+
640+
crucible_pantry_client::types::ReplaceResult::CompletedAlready => {
641+
// It's done! We see this if the same pantry that received the
642+
// original replace request started and finished the replacement.
643+
}
644+
645+
crucible_pantry_client::types::ReplaceResult::VcrMatches => {
646+
// This pantry booted with the updated VCR
647+
}
648+
649+
crucible_pantry_client::types::ReplaceResult::Missing => {
650+
// The volume does not contain the read-only target to be replaced.
651+
// This is an error!
652+
return Err(saga_action_failed(Error::internal_error(
653+
String::from("saw ReplaceResult::Missing"),
654+
)));
655+
}
656+
}
657+
658+
Ok(())
659+
}
660+
661+
async fn rsrss_notify_upstairs(
662+
sagactx: NexusActionContext,
663+
) -> Result<(), ActionError> {
664+
let osagactx = sagactx.user_data();
665+
let params = sagactx.saga_params::<Params>()?;
666+
667+
let opctx = crate::context::op_context_for_saga_action(
668+
&sagactx,
669+
&params.serialized_authn,
670+
);
671+
672+
// If the associated volume was deleted, then skip this notification step as
673+
// there is no Upstairs to talk to. Continue with the saga to transition the
674+
// step request to Complete, and then perform the associated clean up.
675+
676+
let volume_replace_snapshot_result = sagactx
677+
.lookup::<VolumeReplaceResult>("volume_replace_snapshot_result")?;
678+
if matches!(
679+
volume_replace_snapshot_result,
680+
VolumeReplaceResult::ExistingVolumeSoftDeleted
681+
| VolumeReplaceResult::ExistingVolumeHardDeleted
682+
) {
683+
return Ok(());
684+
}
685+
686+
// Make an effort to notify an Upstairs if one was constructed for this
687+
// volume. This is best effort: if there is a failure, this saga will unwind
688+
// and be triggered again for the same request. If there is no Upstairs
689+
// constructed for this volume, then there's nothing to be done: any future
690+
// construction will receive the updated Volume.
691+
//
692+
// Unlike for region replacement, there's no step required here if there
693+
// isn't an active Upstairs: any Upstairs created after the snapshot_addr is
694+
// replaced will reference the cloned data.
695+
696+
let maybe_disk = osagactx
697+
.datastore()
698+
.disk_for_volume_id(params.request.volume_id())
699+
.await
700+
.map_err(saga_action_failed)?;
701+
702+
let maybe_user_data_export = osagactx
703+
.datastore()
704+
.user_data_export_lookup_by_volume_id(
705+
&opctx,
706+
params.request.volume_id(),
707+
)
708+
.await
709+
.map_err(saga_action_failed)?;
710+
711+
if let Some(disk) = maybe_disk {
712+
notify_potential_propolis_upstairs(sagactx, disk).await?;
713+
} else if let Some(record) = maybe_user_data_export {
714+
let (pantry_address, volume_id) = match record.is_live() {
715+
Err(s) => {
716+
// There was an error with a Live user data export record that
717+
// means we have to unwind here. This will likely require
718+
// support intervention, as the record is in state Live but does
719+
// not have either a Pantry address or volume id.
720+
return Err(saga_action_failed(Error::internal_error(
721+
s.to_string(),
722+
)));
723+
}
724+
725+
Ok(UserDataExport::NotLive) => {
726+
// The user data export is not Live, meaning no notification is
727+
// required.
728+
return Ok(());
729+
}
730+
731+
Ok(UserDataExport::Live { pantry_address, volume_id }) => {
732+
(pantry_address, volume_id)
733+
}
734+
};
735+
736+
notify_pantry_upstairs(
737+
sagactx,
738+
pantry_address,
739+
volume_id.into_untyped_uuid(),
740+
)
741+
.await?;
742+
}
743+
744+
Ok(())
745+
}
746+
601747
async fn rsrss_update_request_record(
602748
sagactx: NexusActionContext,
603749
) -> Result<(), ActionError> {

0 commit comments

Comments
 (0)