Skip to content

Commit 3e51e69

Browse files
authored
Remove simulated disk stuff from sim sled-agent (#9914)
As part of looking into how expunge will affect disks backed by local storage, I'm a few yaks deep and am looking at the work required to change `DiskState`. Problem is that `DiskState` is in common and used by Nexus' external and internal apis, as well as sled-agent. Wait a minute you ask, why is `DiskState` used by the sled-agent api? That type refers to Nexus' Disk type, which is an abstraction layer higher than sled-agent should be aware of. All this code is left-over from very early days of the simulated sled-agent, and it's time it goes. This commit removes all the simulated disk machinery from the simulated sled-agent. So next, what part of Nexus' internal api uses `DiskState`? It's from `cpapi_disks_put`. What calls this? The simulated disk machinery in the simulated sled-agent notifies Nexus of state changes. This is incorrect: Nexus should completely own the state of a Disk, and having the simulated sled-agent poke into Nexus this way could hide real bugs. It would be best to completely remove `cpapi_disks_put`, but until the nexus-internal API trait can have new versions we cannot. This commit instead replaces the implementation of that endpoint with a 400. No part of the actual product uses this. Happily, with all that taken out there were no failing tests!
1 parent 3cb178f commit 3e51e69

13 files changed

Lines changed: 22 additions & 645 deletions

File tree

clients/sled-agent-client/src/lib.rs

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,6 @@ pub trait TestInterfaces {
316316
id: PropolisUuid,
317317
params: SimulateMigrationSource,
318318
);
319-
async fn disk_finish_transition(&self, id: Uuid);
320319
}
321320

322321
#[async_trait]
@@ -350,18 +349,6 @@ impl TestInterfaces for Client {
350349
Ok(())
351350
}
352351

353-
async fn disk_finish_transition(&self, id: Uuid) {
354-
let baseurl = self.baseurl();
355-
let client = self.client();
356-
let url = format!("{}/disks/{}/poke", baseurl, id);
357-
client
358-
.post(url)
359-
.api_version_header(self.api_version())
360-
.send()
361-
.await
362-
.expect("disk_finish_transition() failed unexpectedly");
363-
}
364-
365352
async fn vmm_simulate_migration_source(
366353
&self,
367354
id: PropolisUuid,

nexus/src/app/disk.rs

Lines changed: 0 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ use omicron_common::api::external::LookupResult;
2727
use omicron_common::api::external::NameOrId;
2828
use omicron_common::api::external::UpdateResult;
2929
use omicron_common::api::external::http_pagination::PaginatedBy;
30-
use omicron_common::api::internal::nexus::DiskRuntimeState;
3130
use std::sync::Arc;
3231
use uuid::Uuid;
3332

@@ -359,63 +358,6 @@ impl super::Nexus {
359358
Ok(disks.into_iter().map(Into::into).collect())
360359
}
361360

362-
pub(crate) async fn notify_disk_updated(
363-
&self,
364-
opctx: &OpContext,
365-
id: Uuid,
366-
new_state: &DiskRuntimeState,
367-
) -> Result<(), Error> {
368-
let log = &self.log;
369-
let (.., authz_disk) = LookupPath::new(&opctx, &self.db_datastore)
370-
.disk_id(id)
371-
.lookup_for(authz::Action::Modify)
372-
.await?;
373-
374-
let result = self
375-
.db_datastore
376-
.disk_update_runtime(opctx, &authz_disk, &new_state.clone().into())
377-
.await;
378-
379-
// TODO-cleanup commonize with notify_instance_updated()
380-
match result {
381-
Ok(true) => {
382-
info!(log, "disk updated by sled agent";
383-
"disk_id" => %id,
384-
"new_state" => ?new_state);
385-
Ok(())
386-
}
387-
388-
Ok(false) => {
389-
info!(log, "disk update from sled agent ignored (old)";
390-
"disk_id" => %id);
391-
Ok(())
392-
}
393-
394-
// If the disk doesn't exist, swallow the error -- there's
395-
// nothing to do here.
396-
// TODO-robustness This could only be possible if we've removed a
397-
// disk from the datastore altogether. When would we do that?
398-
// We don't want to do it as soon as something's destroyed, I think,
399-
// and in that case, we'd need some async task for cleaning these
400-
// up.
401-
Err(Error::ObjectNotFound { .. }) => {
402-
warn!(log, "non-existent disk updated by sled agent";
403-
"instance_id" => %id,
404-
"new_state" => ?new_state);
405-
Ok(())
406-
}
407-
408-
// If the datastore is unavailable, propagate that to the caller.
409-
Err(error) => {
410-
warn!(log, "failed to update disk from sled agent";
411-
"disk_id" => %id,
412-
"new_state" => ?new_state,
413-
"error" => ?error);
414-
Err(error)
415-
}
416-
}
417-
}
418-
419361
pub(crate) async fn project_delete_disk(
420362
self: &Arc<Self>,
421363
opctx: &OpContext,

nexus/src/app/sagas/instance_create.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,8 +1642,8 @@ pub mod test {
16421642
== "detached"
16431643
}
16441644

1645-
async fn no_instances_or_disks_on_sled(sled_agent: &SledAgent) -> bool {
1646-
sled_agent.vmm_count().await == 0 && sled_agent.disk_count().await == 0
1645+
async fn no_instances_on_sled(sled_agent: &SledAgent) -> bool {
1646+
sled_agent.vmm_count().await == 0
16471647
}
16481648

16491649
pub(crate) async fn verify_clean_slate(
@@ -1672,7 +1672,7 @@ pub mod test {
16721672
.await
16731673
);
16741674
assert!(disk_is_detached(datastore).await);
1675-
assert!(no_instances_or_disks_on_sled(&sled_agent).await);
1675+
assert!(no_instances_on_sled(&sled_agent).await);
16761676

16771677
let v2p_mappings = &*sled_agent.v2p_mappings.lock().unwrap();
16781678
assert!(v2p_mappings.is_empty());

nexus/src/internal_api/http_entrypoints.rs

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -131,24 +131,16 @@ impl NexusInternalApi for NexusInternalApiImpl {
131131
}
132132

133133
async fn cpapi_disks_put(
134-
rqctx: RequestContext<Self::Context>,
135-
path_params: Path<DiskPathParam>,
136-
new_runtime_state: TypedBody<DiskRuntimeState>,
134+
_rqctx: RequestContext<Self::Context>,
135+
_path_params: Path<DiskPathParam>,
136+
_new_runtime_state: TypedBody<DiskRuntimeState>,
137137
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
138-
let apictx = &rqctx.context().context;
139-
let nexus = &apictx.nexus;
140-
let path = path_params.into_inner();
141-
let new_state = new_runtime_state.into_inner();
142-
let handler = async {
143-
let opctx =
144-
crate::context::op_context_for_internal_api(&rqctx).await;
145-
nexus.notify_disk_updated(&opctx, path.disk_id, &new_state).await?;
146-
Ok(HttpResponseUpdatedNoContent())
147-
};
148-
apictx
149-
.internal_latencies
150-
.instrument_dropshot_handler(&rqctx, handler)
151-
.await
138+
// TODO: remove from the nexus-internal api trait when we can make new
139+
// versions of it
140+
Err(HttpError::for_bad_request(
141+
None,
142+
String::from("removed due to incorrect abstraction"),
143+
))
152144
}
153145

154146
async fn cpapi_volume_remove_read_only_parent(

nexus/tests/integration_tests/instances.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5008,12 +5008,12 @@ async fn test_disks_detached_when_instance_destroyed(
50085008
let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id);
50095009
let apictx = &cptestctx.server.server_context();
50105010
let nexus = &apictx.nexus;
5011-
let sa = nexus
5011+
5012+
nexus
50125013
.active_instance_info(&instance_id, None)
50135014
.await
50145015
.unwrap()
5015-
.expect("instance should be on a sled while it's running")
5016-
.sled_client;
5016+
.expect("instance should be on a sled while it's running");
50175017

50185018
// Stop and delete instance
50195019
instance_post(&client, instance_name, InstanceOp::Stop).await;
@@ -5037,9 +5037,6 @@ async fn test_disks_detached_when_instance_destroyed(
50375037
assert_eq!(disks.len(), 8);
50385038
for disk in &disks {
50395039
assert_eq!(disk.state, DiskState::Detached);
5040-
5041-
// Simulate each one of the disks to move from "Detaching" to "Detached"
5042-
sa.disk_finish_transition(disk.identity.id).await;
50435040
}
50445041

50455042
// Ensure that the disks can be attached to another instance

0 commit comments

Comments
 (0)