Skip to content

Commit 6d04f46

Browse files
committed
[bug/5614743] fix: scout repeatedly fails machine discovery with 'AttestKeyInfo is not populated' error
1 parent 09809c2 commit 6d04f46

5 files changed

Lines changed: 136 additions & 24 deletions

File tree

crates/scout/src/deprovision/scrabbing.rs

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,12 @@ use carbide_uuid::machine::MachineId;
2323
use regex::Regex;
2424
use scout::CarbideClientError;
2525
use serde::Deserialize;
26-
use smbioslib::SMBiosSystemInformation;
2726
use tracing::Instrument;
2827

2928
use crate::cfg::Options;
3029
use crate::client::create_forge_client;
3130
use crate::deprovision::cmdrun;
32-
use crate::{CarbideClientResult, IN_QEMU_VM};
31+
use crate::{CarbideClientResult, IN_QEMU_VM, platform};
3332

3433
fn check_memory_overwrite_efi_var() -> Result<(), CarbideClientError> {
3534
let name = match efivar::efi::Variable::from_str(
@@ -1093,22 +1092,9 @@ async fn do_cleanup(machine_id: &MachineId) -> CarbideClientResult<rpc::MachineC
10931092
Ok(cleanup_result)
10941093
}
10951094

1096-
fn is_host() -> bool {
1097-
match smbioslib::table_load_from_device() {
1098-
Ok(data) => data.any(|sys_info: SMBiosSystemInformation| {
1099-
!sys_info
1100-
.product_name()
1101-
.to_string()
1102-
.to_lowercase()
1103-
.contains("bluefield")
1104-
}),
1105-
Err(_err) => true,
1106-
}
1107-
}
1108-
11091095
pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClientResult<()> {
11101096
tracing::info!("full deprovision starts.");
1111-
if !is_host() {
1097+
if !platform::is_host() {
11121098
tracing::info!("full deprovision skipped, we are not running on a host.");
11131099
// do not send API cleanup_machine_completed
11141100
return Ok(());
@@ -1122,7 +1108,7 @@ pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClie
11221108
}
11231109

11241110
pub async fn run_no_api(tpm_path: &str) -> Result<(), CarbideClientError> {
1125-
if !is_host() {
1111+
if !platform::is_host() {
11261112
tracing::info!("No cleanup needed on DPU.");
11271113
return Ok(());
11281114
}

crates/scout/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ mod discovery;
5555
mod firmware_upgrade;
5656
mod machine_validation;
5757
mod mlx_device;
58+
mod platform;
5859
mod register;
5960
mod stream;
6061
mod tpm;

crates/scout/src/platform.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
use smbioslib::{SMBiosSystemInformation, table_load_from_device};
19+
20+
/// Returns `true` when scout is running on a managed host (as opposed to a DPU).
21+
pub(crate) fn is_host() -> bool {
22+
match table_load_from_device() {
23+
Ok(data) => data.any(|sys_info: SMBiosSystemInformation| {
24+
!sys_info
25+
.product_name()
26+
.to_string()
27+
.to_lowercase()
28+
.contains("bluefield")
29+
}),
30+
Err(_err) => true,
31+
}
32+
}
33+
34+
#[cfg(test)]
mod tests {
    use super::is_host;

    /// Smoke test: the result depends on the machine running the test, so the
    /// only thing checked here is that the call completes and yields a `bool`.
    #[test]
    fn is_host_returns_bool_without_panicking() {
        let _detected: bool = is_host();
    }
}

crates/scout/src/register.rs

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use tracing::info;
2424
use tss_esapi::Context;
2525
use tss_esapi::handles::KeyHandle;
2626

27-
use crate::{CarbideClientError, attestation as attest};
27+
use crate::{CarbideClientError, attestation as attest, platform, tpm};
2828

2929
pub async fn run(
3030
forge_api: &str,
@@ -36,7 +36,9 @@ pub async fn run(
3636
let mut hardware_info = enumerate_hardware()?;
3737
info!("Successfully enumerated hardware");
3838

39-
let is_dpu = hardware_info.tpm_ek_certificate.is_none();
39+
// Missing TPM EK material must not be treated as DPU detection. DPUs are
40+
// identified from platform SMBIOS data, not from TPM availability.
41+
let is_dpu = !platform::is_host();
4042

4143
if machine_interface_id.is_none() && !is_dpu {
4244
return Err(CarbideClientError::GenericError(
@@ -55,15 +57,31 @@ pub async fn run(
5557
crate::tpm::set_tpm_max_auth_fail()?;
5658

5759
// create tss context
58-
let mut tss_ctx = attest::create_context_from_path(tpm_path)
59-
.map_err(|e| CarbideClientError::TpmError(format!("Could not create context: {e}")))?;
60+
let mut tss_ctx = match attest::create_context_from_path(tpm_path) {
61+
Ok(ctx) => ctx,
62+
Err(e) => {
63+
let err = CarbideClientError::TpmError(format!("Could not create context: {e}"));
64+
if tpm::is_recoverable_tpm_client_error(&err) {
65+
tpm::recover_tpm_and_reboot(tpm_path)?;
66+
}
67+
return Err(err);
68+
}
69+
};
6070

6171
// CHANGETO - supply context externally
6272
hardware_info.tpm_description = attest::get_tpm_description(&mut tss_ctx);
6373

64-
let result = attest::create_attest_key_info(&mut tss_ctx).map_err(|e| {
65-
CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}"))
66-
})?;
74+
let result = match attest::create_attest_key_info(&mut tss_ctx) {
75+
Ok(result) => result,
76+
Err(e) => {
77+
let err =
78+
CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}"));
79+
if tpm::is_recoverable_tpm_client_error(&err) {
80+
tpm::recover_tpm_and_reboot(tpm_path)?;
81+
}
82+
return Err(err);
83+
}
84+
};
6785

6886
hardware_info.attest_key_info = Some(result.0);
6987
endorsement_key_handle_opt = Some(result.1);

crates/scout/src/tpm.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,18 @@
1515
* limitations under the License.
1616
*/
1717

18+
use std::fs::File;
19+
use std::io::Write;
20+
use std::path::Path;
1821
use std::process::Command;
1922

2023
use tss_esapi::handles::AuthHandle;
2124
use tss_esapi::interface_types::session_handles::AuthSession;
2225

2326
use crate::{CarbideClientError, attestation as attest};
2427

28+
pub(crate) const TPM_RECOVERY_ATTEMPTED_PATH: &str = "/tmp/tpm_recovery_reboot_attempted";
29+
2530
// From https://superuser.com/questions/1404738/tpm-2-0-hardware-error-da-lockout-mode
2631
pub(crate) fn set_tpm_max_auth_fail() -> Result<(), CarbideClientError> {
2732
let output = Command::new("tpm2_dictionarylockout")
@@ -81,3 +86,63 @@ pub(crate) fn clear_tpm(tpm_path: &str) -> Result<(), CarbideClientError> {
8186
tracing::info!("TPM lockout hierarchy clear completed");
8287
Ok(())
8388
}
89+
90+
pub(crate) fn is_recoverable_tpm_client_error(error: &CarbideClientError) -> bool {
91+
match error {
92+
CarbideClientError::TpmError(message) => {
93+
message.contains("Could not create AttestKeyInfo")
94+
|| message.contains("Could not create context")
95+
|| message.contains("TPM2_Clear")
96+
}
97+
_ => false,
98+
}
99+
}
100+
101+
/// Clears the TPM and reboots the host once per boot cycle to recover from missing TPM material.
102+
pub(crate) fn recover_tpm_and_reboot(tpm_path: &str) -> Result<(), CarbideClientError> {
103+
if Path::new(TPM_RECOVERY_ATTEMPTED_PATH).exists() {
104+
return Err(CarbideClientError::TpmError(
105+
"TPM recovery was already attempted this boot cycle; refusing to loop".to_string(),
106+
));
107+
}
108+
109+
tracing::warn!("Attempting automated TPM clear and reboot to recover attestation state");
110+
clear_tpm(tpm_path)?;
111+
112+
let mut marker =
113+
File::create(TPM_RECOVERY_ATTEMPTED_PATH).map_err(CarbideClientError::StdIo)?;
114+
marker
115+
.write_all(b"tpm recovery reboot requested\n")
116+
.map_err(CarbideClientError::StdIo)?;
117+
118+
let output = Command::new("systemctl")
119+
.arg("reboot")
120+
.output()
121+
.map_err(CarbideClientError::StdIo)?;
122+
if !output.status.success() {
123+
return Err(CarbideClientError::GenericError(format!(
124+
"systemctl reboot failed with status {:?}: {}",
125+
output.status.code(),
126+
String::from_utf8_lossy(&output.stderr)
127+
)));
128+
}
129+
130+
Ok(())
131+
}
132+
133+
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `TpmError` carrying `message`.
    fn tpm_error(message: &str) -> CarbideClientError {
        CarbideClientError::TpmError(message.to_string())
    }

    #[test]
    fn recoverable_tpm_errors_include_attest_key_info_failures() {
        let err = tpm_error("Could not create AttestKeyInfo: test");
        assert!(is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn recoverable_tpm_errors_include_context_and_clear_failures() {
        // The remaining two fragments the classifier recognises.
        assert!(is_recoverable_tpm_client_error(&tpm_error(
            "Could not create context: test"
        )));
        assert!(is_recoverable_tpm_client_error(&tpm_error(
            "TPM2_Clear failed"
        )));
    }

    #[test]
    fn unrelated_tpm_errors_are_not_recoverable() {
        // Being a `TpmError` is not enough; the message must match a known fragment.
        let err = tpm_error("some other TPM failure");
        assert!(!is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn non_tpm_client_errors_are_not_recoverable() {
        let err = CarbideClientError::GenericError("transport failed".to_string());
        assert!(!is_recoverable_tpm_client_error(&err));
    }
}

0 commit comments

Comments
 (0)