Skip to content

Commit d180d48

Browse files
committed
Add omdb support-bundle collect subcommand
Wires a new subcommand on omdb that calls into the `support-bundle-collection` crate to gather a bundle locally. Unlike the Nexus background task, this path does not register a row in the `support_bundle` table, does not transfer the bundle to a sled agent, and does not require Nexus to be up — it only needs CRDB, internal DNS, MGS, and sled-agents reachable on the underlay. This is intended for incident response: when Nexus is down (the most important time to gather a bundle), an operator can still produce one locally.
1 parent 863c24b commit d180d48

6 files changed

Lines changed: 279 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/omdb/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ base64.workspace = true
1919
bootstrap-agent-lockstep-client.workspace = true
2020
bytes.workspace = true
2121
camino.workspace = true
22+
camino-tempfile.workspace = true
2223
chrono.workspace = true
2324
clap.workspace = true
2425
clickhouse-admin-single-client.workspace = true
@@ -54,6 +55,7 @@ nexus-db-queries.workspace = true
5455
nexus-db-schema.workspace = true
5556
nexus-inventory.workspace = true
5657
nexus-lockstep-client.workspace = true
58+
nexus-networking.workspace = true
5759
nexus-reconfigurator-preparation.workspace = true
5860
nexus-saga-recovery.workspace = true
5961
nexus-types.workspace = true
@@ -83,6 +85,7 @@ slog.workspace = true
8385
slog-error-chain.workspace = true
8486
steno.workspace = true
8587
strum.workspace = true
88+
support-bundle-collection.workspace = true
8689
support-bundle-viewer.workspace = true
8790
supports-color.workspace = true
8891
tabled.workspace = true
@@ -104,6 +107,7 @@ nexus-test-utils-macros.workspace = true
104107
omicron-nexus.workspace = true
105108
omicron-test-utils.workspace = true
106109
subprocess.workspace = true
110+
zip.workspace = true
107111

108112
# Disable doc builds by default for our binaries to work around issue
109113
# rust-lang/cargo#8373. These docs would not be very useful anyway.

dev-tools/omdb/src/bin/omdb/main.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ mod oxql;
5858
mod reconfigurator;
5959
mod sled_agent;
6060
mod support_bundle;
61+
mod support_bundle_collect;
6162

6263
fn main() -> Result<(), anyhow::Error> {
6364
sigpipe::reset();
@@ -83,6 +84,7 @@ async fn main_impl() -> Result<(), anyhow::Error> {
8384
reconfig.run_cmd(&args, &log).await
8485
}
8586
OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await,
87+
OmdbCommands::SupportBundle(sb) => sb.run_cmd(&args, &log).await,
8688
OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await,
8789
OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await,
8890
OmdbCommands::ClickhouseAdmin(ch) => ch.run_cmd(&args, &log).await,
@@ -297,6 +299,8 @@ enum OmdbCommands {
297299
Reconfigurator(reconfigurator::ReconfiguratorArgs),
298300
/// Debug a specific Sled
299301
SledAgent(sled_agent::SledAgentArgs),
302+
/// Collect or inspect a support bundle
303+
SupportBundle(support_bundle_collect::SupportBundleArgs),
300304
}
301305

302306
fn parse_dropshot_log_level(
dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! `omdb support-bundle collect` — collect a support bundle locally,
6+
//! without going through Nexus.
7+
//!
8+
//! Unlike the Nexus background task, this path:
9+
//!
10+
//! - Does not register a row in the `support_bundle` table.
11+
//! - Does not transfer the resulting bundle to a sled-agent for durable
12+
//! storage. The zip is written to a local file path.
13+
//! - Does not require Nexus to be up. It only needs CRDB, internal
14+
//! DNS, MGS, and the rack's sled-agents reachable on the underlay.
15+
//!
16+
//! This is intended for incident response, where the operator may need
17+
//! to collect a bundle precisely because Nexus is unhealthy.
18+
19+
use crate::Omdb;
20+
use crate::db::DbUrlOptions;
21+
use anyhow::Context;
22+
use camino::Utf8PathBuf;
23+
use camino_tempfile::tempdir_in;
24+
use clap::Args;
25+
use clap::Subcommand;
26+
use clap::ValueEnum;
27+
use nexus_db_queries::context::OpContext;
28+
use nexus_db_queries::db::DataStore;
29+
use nexus_types::fm::ereport::EreportFilters;
30+
use nexus_types::support_bundle::BundleDataSelection;
31+
use omicron_uuid_kinds::SupportBundleUuid;
32+
use std::io::Seek;
33+
use std::io::SeekFrom;
34+
use std::sync::Arc;
35+
use support_bundle_collection::BundleCollection;
36+
use support_bundle_collection::BundleInfo;
37+
use support_bundle_collection::zip::bundle_to_zipfile;
38+
39+
/// Categories of data the bundle collector knows how to gather.
40+
///
41+
/// Mirrors `nexus_types::support_bundle::BundleDataCategory`, but is
42+
/// declared here so it can derive `clap::ValueEnum` without making
43+
/// `nexus-types` depend on clap.
44+
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, ValueEnum)]
45+
enum BundleCategory {
46+
Reconfigurator,
47+
HostInfo,
48+
SledCubbyInfo,
49+
SpDumps,
50+
Ereports,
51+
}
52+
53+
/// Arguments to the "omdb support-bundle" subcommand
54+
#[derive(Debug, Args)]
55+
pub struct SupportBundleArgs {
56+
#[command(subcommand)]
57+
command: SupportBundleCommands,
58+
}
59+
60+
#[derive(Debug, Subcommand)]
61+
enum SupportBundleCommands {
62+
/// Collect a support bundle without involving Nexus.
63+
///
64+
/// Connects directly to CockroachDB, internal DNS, MGS, and the
65+
/// rack's sled-agents — none of which depend on Nexus being up.
66+
/// The bundle is written to a local zip file. No row is created
67+
/// in the `support_bundle` table.
68+
Collect(CollectArgs),
69+
}
70+
71+
#[derive(Debug, Args)]
72+
struct CollectArgs {
73+
#[command(flatten)]
74+
db_url_opts: DbUrlOptions,
75+
76+
/// Path where the resulting bundle zip will be written.
77+
#[clap(long, short = 'o')]
78+
output: Utf8PathBuf,
79+
80+
/// Reason recorded inside the bundle's metadata.
81+
#[clap(long, default_value = "collected via omdb")]
82+
reason: String,
83+
84+
/// Directory to use for staging the bundle contents before zipping.
85+
#[clap(long, default_value = "/var/tmp")]
86+
tempdir: Utf8PathBuf,
87+
88+
/// Categories of data to collect. May be supplied multiple times.
89+
/// Defaults to all categories.
90+
#[clap(long, value_enum)]
91+
include: Vec<BundleCategory>,
92+
}
93+
94+
impl CollectArgs {
95+
fn data_selection(&self) -> BundleDataSelection {
96+
let categories: &[BundleCategory] = if self.include.is_empty() {
97+
BundleCategory::value_variants()
98+
} else {
99+
self.include.as_slice()
100+
};
101+
102+
let mut sel = BundleDataSelection::new();
103+
for category in categories {
104+
sel = match category {
105+
BundleCategory::Reconfigurator => sel.with_reconfigurator(),
106+
BundleCategory::HostInfo => sel.with_all_sleds(),
107+
BundleCategory::SledCubbyInfo => sel.with_sled_cubby_info(),
108+
BundleCategory::SpDumps => sel.with_sp_dumps(),
109+
BundleCategory::Ereports => sel.with_ereports(
110+
EreportFilters::new()
111+
.with_start_time(
112+
omicron_common::now_db_precision()
113+
- chrono::Days::new(7),
114+
)
115+
.expect("no end time set, cannot fail"),
116+
),
117+
};
118+
}
119+
sel
120+
}
121+
}
122+
123+
impl SupportBundleArgs {
124+
pub async fn run_cmd(
125+
&self,
126+
omdb: &Omdb,
127+
log: &slog::Logger,
128+
) -> anyhow::Result<()> {
129+
match &self.command {
130+
SupportBundleCommands::Collect(args) => args.run(omdb, log).await,
131+
}
132+
}
133+
}
134+
135+
impl CollectArgs {
136+
async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> {
137+
self.db_url_opts
138+
.with_datastore(omdb, log, async |opctx, datastore| {
139+
self.collect(omdb, log, opctx, datastore).await
140+
})
141+
.await
142+
}
143+
144+
async fn collect(
145+
&self,
146+
omdb: &Omdb,
147+
log: &slog::Logger,
148+
opctx: OpContext,
149+
datastore: Arc<DataStore>,
150+
) -> anyhow::Result<()> {
151+
let resolver = omdb.dns_resolver(log.clone()).await?;
152+
153+
let bundle = BundleInfo {
154+
id: SupportBundleUuid::new_v4(),
155+
reason_for_creation: self.reason.clone(),
156+
};
157+
let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string()));
158+
eprintln!("Collecting support bundle {}", bundle.id);
159+
160+
let collection = Arc::new(BundleCollection::new(
161+
datastore,
162+
resolver,
163+
bundle_log,
164+
opctx,
165+
self.data_selection(),
166+
bundle,
167+
));
168+
169+
// Wire Ctrl-C to cancel the in-flight collection.
170+
let cancel_handle = tokio::spawn({
171+
let token = collection.cancellation_token().clone();
172+
async move {
173+
let _ = tokio::signal::ctrl_c().await;
174+
eprintln!("\nCtrl-C received — cancelling bundle collection.");
175+
token.cancel();
176+
}
177+
});
178+
179+
let dir = tempdir_in(&self.tempdir).with_context(|| {
180+
format!("creating temp dir under {}", self.tempdir)
181+
})?;
182+
let collect_result = collection.collect_bundle_locally(&dir).await;
183+
cancel_handle.abort();
184+
let _ = cancel_handle.await;
185+
let report = collect_result?;
186+
187+
let zip_tempdir = self.tempdir.clone();
188+
let output = self.output.clone();
189+
tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
190+
let mut tempfile = bundle_to_zipfile(&dir, &zip_tempdir)?;
191+
tempfile.seek(SeekFrom::Start(0))?;
192+
let mut out = std::fs::File::create(&output)
193+
.with_context(|| format!("creating {output}"))?;
194+
std::io::copy(&mut tempfile, &mut out)?;
195+
Ok(())
196+
})
197+
.await
198+
.context("zip task panicked")??;
199+
200+
eprintln!("Wrote bundle to {}", self.output);
201+
eprintln!("{} steps executed:", report.steps.len());
202+
for step in &report.steps {
203+
let dur = step.end - step.start;
204+
eprintln!(
205+
" {:>9}ms {:?} {}",
206+
dur.num_milliseconds(),
207+
step.status,
208+
step.name,
209+
);
210+
}
211+
if let Some(ereports) = &report.ereports {
212+
eprintln!(
213+
"ereports: {} found, {} collected, {} errors",
214+
ereports.n_found,
215+
ereports.n_collected,
216+
ereports.errors.len(),
217+
);
218+
}
219+
Ok(())
220+
}
221+
}

dev-tools/omdb/tests/test_all_output.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,51 @@ async fn test_omdb_success_cases() {
436436
);
437437
assert!(!parsed.collections.is_empty());
438438

439+
// Exercise `omdb support-bundle collect` end-to-end. We don't add this
440+
// to the `successes.out` snapshot because the output includes a
441+
// randomly-generated bundle UUID, timing-dependent step durations,
442+
// and per-sled step names that would all need redaction. Instead we
443+
// run the command and verify the resulting zip is well-formed and
444+
// contains the expected metadata files.
445+
let bundle_path = tmpdir.path().join("bundle.zip");
446+
let bundle_args: &[&str] = &[
447+
"support-bundle",
448+
"collect",
449+
"--output",
450+
bundle_path.as_str(),
451+
"--tempdir",
452+
tmpdir.path().as_str(),
453+
"--reason",
454+
"integration test",
455+
];
456+
let mut bundle_output = String::new();
457+
let p = postgres_url.clone();
458+
let dns = cptestctx.internal_dns.dns_server.local_address().to_string();
459+
do_run_no_redactions(
460+
&mut bundle_output,
461+
move |exec| exec.env("OMDB_DB_URL", &p).env("OMDB_DNS_SERVER", &dns),
462+
&cmd_path,
463+
bundle_args,
464+
)
465+
.await;
466+
let zip_file = std::fs::File::open(&bundle_path).unwrap_or_else(|err| {
467+
panic!(
468+
"bundle zip not produced at {bundle_path}: {}\n\
469+
omdb output was:\n{bundle_output}",
470+
InlineErrorChain::new(&err),
471+
)
472+
});
473+
let mut archive =
474+
zip::ZipArchive::new(zip_file).expect("bundle is a valid zip archive");
475+
for required in
476+
["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"]
477+
{
478+
assert!(
479+
archive.by_name(required).is_ok(),
480+
"bundle zip is missing expected entry {required}",
481+
);
482+
}
483+
439484
let ox_invocation = &["oximeter", "list-producers"];
440485
let mut ox_output = String::new();
441486
let ox = ox_url.clone();

dev-tools/omdb/tests/usage_errors.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Commands:
1919
oxql Enter the Oximeter Query Language shell for interactive querying
2020
reconfigurator Interact with the Reconfigurator system
2121
sled-agent Debug a specific Sled
22+
support-bundle Collect or inspect a support bundle
2223
help Print this message or the help of the given subcommand(s)
2324

2425
Options:
@@ -54,6 +55,7 @@ Commands:
5455
oxql Enter the Oximeter Query Language shell for interactive querying
5556
reconfigurator Interact with the Reconfigurator system
5657
sled-agent Debug a specific Sled
58+
support-bundle Collect or inspect a support bundle
5759
help Print this message or the help of the given subcommand(s)
5860

5961
Options:

0 commit comments

Comments
 (0)