Skip to content

Commit b8a2a82

Browse files
refactor(machine): separate Machine fields into config/status
Add `MachineConfig` and `MachineStatus` to the `forge.Machine` proto message, and add the corresponding structs to `carbide-api-model`.

Protobuf changes:
- Add `MachineConfig`
- Add `MachineStatus`
- Populate both messages in the `From<Machine>` RPC conversion while maintaining compatibility with rest-api. rest-api consumers are unaffected until a follow-up PR migrates them. See #2793
- Mark all moved fields `[deprecated = true]` with a `TODO` to reserve the field numbers once rest-api is updated.

Model changes:
- Extract `MachineConfig` (`config.rs`) and `MachineStatus` (`status.rs`) from the flat `Machine` struct; callers updated throughout the workspace.
- No deprecated copies in the Rust model — all internal callsites use the new paths (`machine.config.X`, `machine.status.X`).
1 parent 874659f commit b8a2a82

87 files changed

Lines changed: 1743 additions & 686 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

crates/admin-cli/src/main.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717

1818
// CLI enum variants can be rather large; we are OK with that.
1919
#![allow(clippy::large_enum_variant)]
20+
// The deprecated fields on `rpc::forge::Machine` must still be read here for
21+
// backwards-compat. See https://github.com/NVIDIA/infra-controller/issues/2793
22+
#![allow(deprecated)]
2023

2124
use std::fs::File;
2225
use std::io::Write;

crates/api-core/src/attestation/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ pub async fn get_ek_cert_by_machine_id(
5353

5454
// obtain an ek cert
5555
let tpm_ek_cert = machine
56+
.status
5657
.hardware_info
5758
.as_ref()
5859
.ok_or_else(|| CarbideError::internal("Hardware Info not found.".to_string()))?

crates/api-core/src/dpa/handler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ async fn handle_dpa_message(services: Arc<Api>, message: SetVni, topic: String)
201201

202202
let machine = machine.unwrap();
203203

204-
let cur_spx_status_observations = machine.spx_status_observation.unwrap_or_default();
204+
let cur_spx_status_observations = machine.status.spx_status_observation.unwrap_or_default();
205205
let mut new_spx_status_observations = MachineSpxStatusObservation::default();
206206

207207
let mut add_new_observation = true;

crates/api-core/src/ethernet_virtualization.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -332,10 +332,15 @@ pub async fn admin_network(
332332
// If we loop through the machine interfaces for the host snapshot and look for
333333
// that combo, the segment_id of that interface should be the network segment we want,
334334
// but checking against known admin segments adds a little bit of defense.
335-
let interface = snapshot.host_snapshot.interfaces.iter().find(|interface| {
336-
interface.attached_dpu_machine_id.as_ref() == Some(dpu_machine_id)
337-
&& admin_segment_ids.contains(&interface.segment_id)
338-
});
335+
let interface = snapshot
336+
.host_snapshot
337+
.status
338+
.interfaces
339+
.iter()
340+
.find(|interface| {
341+
interface.attached_dpu_machine_id.as_ref() == Some(dpu_machine_id)
342+
&& admin_segment_ids.contains(&interface.segment_id)
343+
});
339344

340345
let host_machine_id = snapshot.host_snapshot.id;
341346
let Some(interface) = interface else {
@@ -351,6 +356,7 @@ pub async fn admin_network(
351356
// still disables the admin DHCP path on non-primary DPUs via is_primary_dpu.
352357
let active_interface = snapshot
353358
.host_snapshot
359+
.status
354360
.interfaces
355361
.iter()
356362
.find(|interface| {

crates/api-core/src/handlers/attestation.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ pub(crate) async fn trigger_machine_attestation(
5858
id: format!("{}", machine_id),
5959
}));
6060
}
61-
1 => &machines[0].bmc_info,
61+
1 => &machines[0].status.bmc_info,
6262
_ => {
6363
return Err(Status::from(CarbideError::Internal {
6464
message: format!("Found more than one machine for machine id {}", machine_id),

crates/api-core/src/handlers/bmc_endpoint_explorer.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,7 @@ pub(crate) async fn admin_power_control(
594594

595595
if let Some(power_state) = snapshot
596596
.host_snapshot
597+
.status
597598
.power_options
598599
.map(|x| x.desired_power_state)
599600
&& power_state == model::power_manager::PowerState::On
@@ -1091,13 +1092,13 @@ pub(crate) async fn validate_and_complete_bmc_endpoint_request(
10911092
id: machine_id.to_string(),
10921093
})?;
10931094

1094-
let bmc_ip = machine.bmc_info.ip.as_ref().ok_or_else(|| {
1095+
let bmc_ip = machine.status.bmc_info.ip.as_ref().ok_or_else(|| {
10951096
CarbideError::internal(format!(
10961097
"Machine found for {machine_id} but BMC IP is missing"
10971098
))
10981099
})?;
10991100

1100-
let bmc_mac_address = machine.bmc_info.mac.ok_or_else(|| {
1101+
let bmc_mac_address = machine.status.bmc_info.mac.ok_or_else(|| {
11011102
CarbideError::internal(format!("BMC endpoint for {bmc_ip} ({machine_id}) found but does not have associated MAC"))
11021103
})?;
11031104

crates/api-core/src/handlers/component_manager.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ async fn group_machine_ids_by_rack(
571571
let machine = machines_by_id
572572
.get(machine_id)
573573
.ok_or_else(|| Status::not_found(format!("machine {machine_id} not found")))?;
574-
let rack_id = machine.rack_id.clone().ok_or_else(|| {
574+
let rack_id = machine.config.rack_id.clone().ok_or_else(|| {
575575
Status::failed_precondition(format!(
576576
"machine {machine_id} is not associated with a rack"
577577
))
@@ -585,6 +585,7 @@ async fn group_machine_ids_by_rack(
585585
/// Returns whether the machine is a rack-scale server (today just GB200, but will later include other SKUs)
586586
fn is_rack_scale_server(machine: &Machine) -> bool {
587587
machine
588+
.status
588589
.hardware_info
589590
.as_ref()
590591
.is_some_and(|hw| hw.is_gbx00())
@@ -1126,15 +1127,15 @@ async fn resolve_compute_tray_endpoints(
11261127
continue;
11271128
};
11281129

1129-
let Some(bmc_mac) = machine.bmc_info.mac else {
1130+
let Some(bmc_mac) = machine.status.bmc_info.mac else {
11301131
unresolved.push(UnresolvedDevice {
11311132
id: machine_id,
11321133
reason: "BMC MAC not available".into(),
11331134
});
11341135
continue;
11351136
};
11361137

1137-
let Some(bmc_ip) = machine.bmc_info.ip else {
1138+
let Some(bmc_ip) = machine.status.bmc_info.ip else {
11381139
unresolved.push(UnresolvedDevice {
11391140
id: machine_id,
11401141
reason: "BMC IP not configured".into(),

crates/api-core/src/handlers/dpf.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ pub(crate) async fn modify_dpf_state(
4949
id: machine_id.to_string(),
5050
})?;
5151

52-
if !request.dpf_enabled && machine_snapshot.host_snapshot.dpf.used_for_ingestion {
52+
if !request.dpf_enabled && machine_snapshot.host_snapshot.config.dpf.used_for_ingestion {
5353
return Err(CarbideError::FailedPrecondition(format!(
5454
"Cannot disable DPF for host {}: machine was ingested via DPF.",
5555
machine_id

crates/api-core/src/handlers/dpu.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ pub(crate) async fn get_managed_host_network_config_inner(
104104

105105
let primary_dpu_snapshot = snapshot
106106
.host_snapshot
107+
.status
107108
.interfaces
108109
.iter()
109110
.find(|x| x.primary_interface)
@@ -190,6 +191,7 @@ pub(crate) async fn get_managed_host_network_config_inner(
190191

191192
let booturl_override = if snapshot
192193
.host_snapshot
194+
.status
193195
.hardware_info
194196
.as_ref()
195197
.map(|h| h.machine_type)
@@ -927,7 +929,7 @@ pub(crate) async fn record_dpu_network_status(
927929
id: dpu_machine_id.to_string(),
928930
})?;
929931

930-
if snapshot.host_snapshot.dpf.used_for_ingestion {
932+
if snapshot.host_snapshot.config.dpf.used_for_ingestion {
931933
// DPF-managed DPUs don't use this upgrade path. Clear any stale flag so the DPU
932934
// doesn't keep receiving upgrade signals after the host was switched to DPF.
933935
if dpu_machine.needs_agent_upgrade() {

crates/api-core/src/handlers/instance.rs

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -893,16 +893,16 @@ pub(crate) async fn invoke_power(
893893
log_tenant_organization_id(instance.config.tenant.tenant_organization_id.as_str());
894894
}
895895

896-
let bmc_ip =
897-
snapshot
898-
.host_snapshot
899-
.bmc_info
900-
.ip
901-
.as_ref()
902-
.ok_or_else(|| CarbideError::NotFoundError {
903-
kind: "bmc_ip",
904-
id: machine_id.to_string(),
905-
})?;
896+
let bmc_ip = snapshot
897+
.host_snapshot
898+
.status
899+
.bmc_info
900+
.ip
901+
.as_ref()
902+
.ok_or_else(|| CarbideError::NotFoundError {
903+
kind: "bmc_ip",
904+
id: machine_id.to_string(),
905+
})?;
906906

907907
let run_provisioning_instructions_on_every_boot = snapshot
908908
.instance
@@ -1030,6 +1030,7 @@ pub(crate) async fn invoke_power(
10301030
let bmc_mac_address =
10311031
snapshot
10321032
.host_snapshot
1033+
.status
10331034
.bmc_info
10341035
.mac
10351036
.ok_or_else(|| CarbideError::NotFoundError {
@@ -1045,7 +1046,7 @@ pub(crate) async fn invoke_power(
10451046
.redfish_pool
10461047
.create_client(
10471048
&bmc_ip,
1048-
snapshot.host_snapshot.bmc_info.port,
1049+
snapshot.host_snapshot.status.bmc_info.port,
10491050
RedfishAuth::Key(CredentialKey::BmcCredentials {
10501051
credential_type: BmcCredentialType::BmcRoot { bmc_mac_address },
10511052
}),

0 commit comments

Comments
 (0)