Skip to content

Commit a9bfb1f

Browse files
authored
[nexus] Don't assumed hardcoded MGD port (#10435)
Prior to this PR, we had two different implementations that build up a `HashMap<SwitchSlot, MgdClient>` via the same process: 1. Call `switch_zone_address_mappings()` to get a `HashMap<SwitchSlot, Ipv6Addr>` via a couple different DNS lookups + a query to each switch zone's MGS 2. Look up MGD in DNS 3. Pair up the IP addresses from 1 and 2 to build a `HashMap<SwitchSlot, MgdClient>` using the ports from 2; for any IPs in 1 that we didn't find in 2, construct a client pointed to the IP from 1 with the hardcoded `MGD_PORT` I believe the history here is that we used to not have step 2 at all, and we just slapped `MGD_PORT` on all the IPs from 1 (back when MGD didn't have its own DNS entries). And all of this predates being able to ask MGD itself which switch slot it is (internally, it asks MGS over `::1` within its own switch zone). This PR makes two changes: it squishes us down to a single implementation, and changes the mechanics of that implementation to: 1. Look up MGD in DNS 2. For each entry returned, construct an MgdClient and ask it for its switch slot This potentially introduces a new failure mode: if MGD doesn't know its own switch slot but we would've been able to ask MGS at the same IP for the switch slot, the prior implementation would've worked and this PR won't. But that failure mode ought to only be possible if MGD itself is buggy: it asking MGS over localhost for the switch slot should, in general, be more successful than Nexus finding MGS in DNS then asking MGS for the switch slot, since there are a lot more moving pieces in that path. But I think the upsides are enough to make up for that risk - we streamline the process, reduce duplication, the new implementation has fewer places to fail transiently overall, and we always use port numbers from DNS (which should make this more reliable in tests - in tests we basically never want to assume the fixed `MGD_PORT`, and pairing up IP addresses also doesn't usually work since they're all `::1`). I'd like to remove `switch_zone_address_mappings()` altogether, but removing the remaining two uses of it are both blocked - lldp by #10361, and building up the scrimlet clients by #10167.
1 parent 76386e1 commit a9bfb1f

8 files changed

Lines changed: 111 additions & 100 deletions

File tree

dev-tools/omdb/tests/successes.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ task: "bfd_manager"
626626
configured period: every <REDACTED_DURATION>s
627627
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
628628
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
629-
last completion reported error: failed to resolve addresses for Dendrite services: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
629+
last completion reported error: proto error: no records found for Query { name: Name("_mgd._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
630630

631631
task: "blueprint_planner"
632632
configured period: every <REDACTED_DURATION>m
@@ -1309,7 +1309,7 @@ task: "bfd_manager"
13091309
configured period: every <REDACTED_DURATION>s
13101310
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
13111311
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1312-
last completion reported error: failed to resolve addresses for Dendrite services: proto error: no records found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
1312+
last completion reported error: proto error: no records found for Query { name: Name("_mgd._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
13131313

13141314
task: "blueprint_planner"
13151315
configured period: every <REDACTED_DURATION>m

nexus/src/app/background/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ pub(crate) use init::BackgroundTasksInternal;
142142
pub use nexus_background_task_interface::Activator;
143143
pub(crate) use tasks::blueprint_load::LoadedTargetBlueprint;
144144
pub use tasks::fm_sitrep_load::CurrentSitrep;
145+
pub(crate) use tasks::networking::resolve_mgd_clients;
145146
pub use tasks::saga_recovery::SagaRecoveryHelpers;
146147

147148
use futures::future::BoxFuture;

nexus/src/app/background/tasks/bfd.rs

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,8 @@
55
//! Background task for managing switch bidirectional forwarding detection
66
//! (BFD) sessions.
77
8-
use crate::app::{
9-
background::tasks::networking::build_mgd_clients,
10-
switch_zone_address_mappings,
11-
};
12-
138
use crate::app::background::BackgroundTask;
9+
use crate::app::background::tasks::networking::resolve_mgd_clients;
1410
use futures::FutureExt;
1511
use futures::future::BoxFuture;
1612
use internal_dns_resolver::Resolver;
@@ -120,29 +116,27 @@ impl BackgroundTask for BfdManager {
120116

121117
let mut current: HashSet<BfdSessionKey> = HashSet::new();
122118

123-
let mappings = match switch_zone_address_mappings(&self.resolver, log).await {
124-
Ok(mappings) => mappings,
125-
Err(e) => {
126-
error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);
127-
return json!({
128-
"error":
129-
format!(
130-
"failed to resolve addresses for Dendrite services: {:#}",
131-
e
132-
)
133-
});
134-
},
135-
};
136-
137-
let mgd_clients = build_mgd_clients(mappings, log, &self.resolver).await;
119+
let mgd_clients =
120+
match resolve_mgd_clients(&self.resolver, log).await {
121+
Ok(clients) => clients,
122+
Err(e) => {
123+
let e = InlineErrorChain::new(&e);
124+
error!(
125+
log,
126+
"failed to resolve addresses for MGD services";
127+
&e,
128+
);
129+
return json!({ "error": e.to_string() });
130+
}
131+
};
138132

139133
for (location, c) in &mgd_clients {
140134
let client_current = match c.get_bfd_peers().await {
141135
Ok(x) => x.into_inner(),
142136
Err(e) => {
143137
error!(&log, "failed to get bfd sessions from mgd: {}",
144138
c.baseurl();
145-
"error" => e.to_string()
139+
InlineErrorChain::new(&e),
146140
);
147141
continue;
148142
}

nexus/src/app/background/tasks/networking.rs

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,47 +6,68 @@ use db::datastore::SwitchPortSettingsCombinedResult;
66
use dpd_client::types::{
77
LinkCreate, LinkId, LinkSettings, PortFec, PortSettings, PortSpeed, TxEq,
88
};
9+
use internal_dns_resolver::ResolveError;
910
use internal_dns_types::names::ServiceName;
1011
use nexus_db_model::{SwitchLinkFec, SwitchLinkSpeed};
1112
use nexus_db_queries::db;
12-
use omicron_common::address::MGD_PORT;
1313
use sled_agent_types::early_networking::SwitchSlot;
14-
use std::{
15-
collections::HashMap,
16-
net::{Ipv6Addr, SocketAddrV6},
17-
};
14+
use slog_error_chain::InlineErrorChain;
15+
use std::collections::HashMap;
1816

19-
pub(crate) async fn build_mgd_clients(
20-
mappings: HashMap<SwitchSlot, std::net::Ipv6Addr>,
21-
log: &slog::Logger,
17+
/// Get all MGD known `SwitchSlot -> MGD client` pairs.
18+
///
19+
/// # Errors
20+
///
21+
/// Fails if we cannot resolve MGD in DNS.
22+
///
23+
/// For any MGD instance we resolve via DNS, if the MGD instance does not know
24+
/// its own switch slot, the switch slot -> client mapping for that instance
25+
/// will be omitted from the returned map. Callers must not assume an `Ok(_)`
26+
/// return value contains any client.
27+
pub(crate) async fn resolve_mgd_clients(
2228
resolver: &internal_dns_resolver::Resolver,
23-
) -> HashMap<SwitchSlot, mg_admin_client::Client> {
24-
let mut clients: Vec<(SwitchSlot, mg_admin_client::Client)> = vec![];
25-
for (switch_slot, addr) in &mappings {
26-
let port = match resolver.lookup_all_socket_v6(ServiceName::Mgd).await {
27-
Ok(addrs) => {
28-
let port_map: HashMap<Ipv6Addr, u16> = addrs
29-
.into_iter()
30-
.map(|sockaddr| (*sockaddr.ip(), sockaddr.port()))
31-
.collect();
32-
33-
*port_map.get(&addr).unwrap_or(&MGD_PORT)
34-
}
35-
Err(e) => {
36-
error!(log, "failed to addresses"; "error" => %e);
37-
MGD_PORT
38-
}
39-
};
40-
41-
let socketaddr =
42-
std::net::SocketAddr::V6(SocketAddrV6::new(*addr, port, 0, 0));
29+
log: &slog::Logger,
30+
) -> Result<HashMap<SwitchSlot, mg_admin_client::Client>, ResolveError> {
31+
let mgd_addrs = resolver.lookup_all_socket_v6(ServiceName::Mgd).await?;
32+
let mut clients = HashMap::new();
33+
for addr in mgd_addrs {
4334
let client = mg_admin_client::Client::new(
44-
format!("http://{}", socketaddr).as_str(),
35+
&format!("http://{addr}"),
4536
log.clone(),
4637
);
47-
clients.push((*switch_slot, client));
38+
let switch_slot = match client.switch_identifiers().await {
39+
Ok(response) => match response.slot {
40+
Some(0) => SwitchSlot::Switch0,
41+
Some(1) => SwitchSlot::Switch1,
42+
Some(n) => {
43+
warn!(
44+
log, "failed to determine switch slot for mgd";
45+
"addr" => %addr,
46+
"error" => format!("mgd returned unknown slot {n}"),
47+
);
48+
continue;
49+
}
50+
None => {
51+
warn!(
52+
log, "failed to determine switch slot for mgd";
53+
"addr" => %addr,
54+
"error" => "mgd does not yet know its switch slot",
55+
);
56+
continue;
57+
}
58+
},
59+
Err(err) => {
60+
warn!(
61+
log, "failed to determine switch slot for mgd";
62+
"addr" => %addr,
63+
InlineErrorChain::new(&err),
64+
);
65+
continue;
66+
}
67+
};
68+
clients.insert(switch_slot, client);
4869
}
49-
clients.into_iter().collect::<HashMap<_, _>>()
70+
Ok(clients)
5071
}
5172

5273
pub(crate) fn api_to_dpd_port_settings(

nexus/src/app/background/tasks/sync_switch_configuration.rs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use crate::app::{
99
background::{
1010
LoadedTargetBlueprint,
11-
tasks::networking::{api_to_dpd_port_settings, build_mgd_clients},
11+
tasks::networking::{api_to_dpd_port_settings, resolve_mgd_clients},
1212
},
1313
dpd_clients, switch_zone_address_mappings,
1414
};
@@ -364,7 +364,7 @@ impl BackgroundTask for SwitchPortSettingsManager {
364364
let dpd_clients = match
365365
dpd_clients(&self.resolver, &log).await
366366
{
367-
Ok(mappings) => mappings,
367+
Ok(clients) => clients,
368368
Err(e) => {
369369
error!(
370370
log,
@@ -376,7 +376,18 @@ impl BackgroundTask for SwitchPortSettingsManager {
376376

377377
// TODO https://github.com/oxidecomputer/omicron/issues/5201
378378
// build mgd clients
379-
let mgd_clients = build_mgd_clients(mappings, &log, &self.resolver).await;
379+
let mgd_clients =
380+
match resolve_mgd_clients(&self.resolver, &log).await {
381+
Ok(clients) => clients,
382+
Err(e) => {
383+
error!(
384+
log,
385+
"failed to resolve addresses for MGD";
386+
InlineErrorChain::new(&e),
387+
);
388+
continue;
389+
},
390+
};
380391

381392
let port_list = match self.switch_ports(opctx, &log).await {
382393
Ok(value) => value,

nexus/src/app/mod.rs

Lines changed: 15 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ use crate::DropshotServer;
1010
use crate::app::background::BackgroundTasksData;
1111
use crate::app::background::CurrentSitrep;
1212
use crate::app::background::SagaRecoveryHelpers;
13+
use crate::app::background::resolve_mgd_clients;
1314
use crate::app::update::UpdateStatusHandle;
1415
use crate::populate::PopulateArgs;
1516
use crate::populate::PopulateStatus;
1617
use crate::populate::populate_start;
1718
use ::oximeter::types::ProducerRegistry;
1819
use anyhow::anyhow;
20+
use internal_dns_resolver::ResolveError;
1921
use internal_dns_types::names::ServiceName;
2022
use nexus_background_task_interface::BackgroundTasks;
2123
use nexus_config::NexusConfig;
@@ -32,7 +34,6 @@ use nexus_mgs_updates::MgsUpdateDriver;
3234
use nexus_types::deployment::PendingMgsUpdates;
3335
use nexus_types::deployment::ReconfiguratorConfigParam;
3436

35-
use omicron_common::address::MGD_PORT;
3637
use omicron_common::address::MGS_PORT;
3738
use omicron_common::api::external::ByteCount;
3839
use omicron_common::api::external::Error;
@@ -44,7 +45,6 @@ use sled_agent_types::early_networking::SwitchSlot;
4445
use slog::Logger;
4546
use slog_error_chain::InlineErrorChain;
4647
use std::collections::HashMap;
47-
use std::net::SocketAddr;
4848
use std::net::SocketAddrV6;
4949
use std::net::{IpAddr, Ipv6Addr};
5050
use std::num::NonZeroU32;
@@ -1178,47 +1178,21 @@ impl Nexus {
11781178
lldpd_clients(resolver, rack_id, &self.log).await
11791179
}
11801180

1181+
/// Get all MGD known `SwitchSlot -> MGD client` pairs.
1182+
///
1183+
/// # Errors
1184+
///
1185+
/// Fails if we cannot resolve MGD in DNS.
1186+
///
1187+
/// For any MGD instance we resolve via DNS, if the MGD instance does not
1188+
/// know its own switch slot, the switch slot -> client mapping for that
1189+
/// instance will be omitted from the returned map. Callers must not
1190+
/// assume an `Ok(_)` return value contains any client.
11811191
pub(crate) async fn mg_clients(
11821192
&self,
1183-
) -> Result<HashMap<SwitchSlot, mg_admin_client::Client>, String> {
1184-
let resolver = self.resolver();
1185-
let mappings =
1186-
switch_zone_address_mappings(resolver, &self.log).await?;
1187-
let mgd_addrs = resolver
1188-
.lookup_all_socket_v6(ServiceName::Mgd)
1189-
.await
1190-
.map_err(|err| {
1191-
format!(
1192-
"failed to resolve mgd in DNS: {}",
1193-
InlineErrorChain::new(&err)
1194-
)
1195-
})?;
1196-
let mut clients = HashMap::new();
1197-
for (switch_slot, ip) in mappings {
1198-
let addr =
1199-
match mgd_addrs.iter().copied().find(|addr| *addr.ip() == ip) {
1200-
Some(addr) => SocketAddr::V6(addr),
1201-
None => {
1202-
warn!(
1203-
self.log,
1204-
"no MGD DNS entry found matching switch slot \
1205-
IP address; assuming default port";
1206-
"switch-slot" => ?switch_slot,
1207-
"switch-ip" => %ip,
1208-
"mgd-dns-entries" => ?mgd_addrs,
1209-
);
1210-
SocketAddr::V6(SocketAddrV6::new(ip, MGD_PORT, 0, 0))
1211-
}
1212-
};
1213-
clients.insert(
1214-
switch_slot,
1215-
mg_admin_client::Client::new(
1216-
&format!("http://{addr}"),
1217-
self.log.clone(),
1218-
),
1219-
);
1220-
}
1221-
Ok(clients)
1193+
) -> Result<HashMap<SwitchSlot, mg_admin_client::Client>, ResolveError>
1194+
{
1195+
resolve_mgd_clients(self.resolver(), &self.log).await
12221196
}
12231197

12241198
pub(crate) fn demo_sagas(

nexus/test-utils/src/starter.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,9 +448,13 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> {
448448
pub async fn start_mgd(&mut self, switch_slot: SwitchSlot) {
449449
let log = &self.logctx.log;
450450
debug!(log, "Starting mgd"; "switch_slot" => ?switch_slot);
451+
let mgs = self.gateway.get(&switch_slot).unwrap();
452+
let mgs_addr =
453+
SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into();
451454

452455
// Set up an instance of mgd
453-
let mgd = dev::maghemite::MgdInstance::start(0).await.unwrap();
456+
let mgd =
457+
dev::maghemite::MgdInstance::start(0, mgs_addr).await.unwrap();
454458
let port = mgd.port;
455459
self.mgd.insert(switch_slot, mgd);
456460
let address = SocketAddrV6::new(Ipv6Addr::LOCALHOST, port, 0, 0);

test-utils/src/dev/maghemite.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
//! Tools for managing Maghemite during development
66
7+
use std::net::SocketAddr;
78
use std::path::{Path, PathBuf};
89
use std::process::Stdio;
910
use std::time::Duration;
@@ -35,7 +36,10 @@ pub struct MgdInstance {
3536
}
3637

3738
impl MgdInstance {
38-
pub async fn start(mut port: u16) -> Result<Self, anyhow::Error> {
39+
pub async fn start(
40+
mut port: u16,
41+
mgs_addr: SocketAddr,
42+
) -> Result<Self, anyhow::Error> {
3943
let temp_dir = TempDir::new()?;
4044

4145
let args = vec![
@@ -51,6 +55,8 @@ impl MgdInstance {
5155
uuid::Uuid::new_v4().to_string(),
5256
"--sled-uuid".into(),
5357
uuid::Uuid::new_v4().to_string(),
58+
"--mgs-addr".into(),
59+
mgs_addr.to_string(),
5460
];
5561

5662
let child = tokio::process::Command::new("mgd")

0 commit comments

Comments
 (0)