Skip to content

Commit 5eefdfd

Browse files
committed
Add omdb support-bundle collect subcommand
Wires a new subcommand on omdb that calls into the `support-bundle-collection` crate to gather a bundle locally. Unlike the Nexus background task, this path does not register a row in the `support_bundle` table, does not transfer the bundle to a sled agent, and does not require Nexus to be up — it only needs CRDB, internal DNS, MGS, and sled-agents reachable on the underlay. This is intended for incident response: when Nexus is down (the most important time to gather a bundle), an operator can still produce one locally.
1 parent c8cb3dd commit 5eefdfd

8 files changed

Lines changed: 192 additions & 18 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/omdb/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ base64.workspace = true
1919
bootstrap-agent-lockstep-client.workspace = true
2020
bytes.workspace = true
2121
camino.workspace = true
22+
camino-tempfile.workspace = true
2223
chrono.workspace = true
2324
clap.workspace = true
2425
clickhouse-admin-single-client.workspace = true
@@ -54,6 +55,7 @@ nexus-db-queries.workspace = true
5455
nexus-db-schema.workspace = true
5556
nexus-inventory.workspace = true
5657
nexus-lockstep-client.workspace = true
58+
nexus-networking.workspace = true
5759
nexus-reconfigurator-preparation.workspace = true
5860
nexus-saga-recovery.workspace = true
5961
nexus-types.workspace = true
@@ -83,6 +85,7 @@ slog.workspace = true
8385
slog-error-chain.workspace = true
8486
steno.workspace = true
8587
strum.workspace = true
88+
support-bundle-collection.workspace = true
8689
support-bundle-viewer.workspace = true
8790
supports-color.workspace = true
8891
tabled.workspace = true

dev-tools/omdb/src/bin/omdb/main.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ mod oxql;
5858
mod reconfigurator;
5959
mod sled_agent;
6060
mod support_bundle;
61+
mod support_bundle_collect;
6162

6263
fn main() -> Result<(), anyhow::Error> {
6364
sigpipe::reset();
@@ -83,6 +84,7 @@ async fn main_impl() -> Result<(), anyhow::Error> {
8384
reconfig.run_cmd(&args, &log).await
8485
}
8586
OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await,
87+
OmdbCommands::SupportBundle(sb) => sb.run_cmd(&args, &log).await,
8688
OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await,
8789
OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await,
8890
OmdbCommands::ClickhouseAdmin(ch) => ch.run_cmd(&args, &log).await,
@@ -297,6 +299,8 @@ enum OmdbCommands {
297299
Reconfigurator(reconfigurator::ReconfiguratorArgs),
298300
/// Debug a specific Sled
299301
SledAgent(sled_agent::SledAgentArgs),
302+
/// Collect or inspect a support bundle
303+
SupportBundle(support_bundle_collect::SupportBundleArgs),
300304
}
301305

302306
fn parse_dropshot_log_level(
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! `omdb support-bundle collect` — collect a support bundle locally,
6+
//! without going through Nexus.
7+
//!
8+
//! Unlike the Nexus background task, this path:
9+
//!
10+
//! - Does not register a row in the `support_bundle` table.
11+
//! - Does not transfer the resulting bundle to a sled-agent for durable
12+
//! storage. The zip is written to a local file path.
13+
//! - Does not require Nexus to be up. It only needs CRDB, internal
14+
//! DNS, MGS, and the rack's sled-agents reachable on the underlay.
15+
//!
16+
//! This is intended for incident response, where the operator may need
17+
//! to collect a bundle precisely because Nexus is unhealthy.
18+
19+
use crate::Omdb;
20+
use crate::db::DbUrlOptions;
21+
use anyhow::Context;
22+
use camino::Utf8PathBuf;
23+
use camino_tempfile::tempdir_in;
24+
use clap::Args;
25+
use clap::Subcommand;
26+
use nexus_db_queries::context::OpContext;
27+
use nexus_db_queries::db::DataStore;
28+
use nexus_types::support_bundle::BundleDataSelection;
29+
use omicron_uuid_kinds::SupportBundleUuid;
30+
use std::io::Seek;
31+
use std::io::SeekFrom;
32+
use std::sync::Arc;
33+
use support_bundle_collection::BundleCollection;
34+
use support_bundle_collection::BundleInfo;
35+
use support_bundle_collection::zip::bundle_to_zipfile;
36+
37+
/// Arguments to the "omdb support-bundle" subcommand
38+
#[derive(Debug, Args)]
39+
pub struct SupportBundleArgs {
40+
#[command(subcommand)]
41+
command: SupportBundleCommands,
42+
}
43+
44+
#[derive(Debug, Subcommand)]
45+
enum SupportBundleCommands {
46+
/// Collect a support bundle without involving Nexus.
47+
///
48+
/// Connects directly to CockroachDB, internal DNS, MGS, and the
49+
/// rack's sled-agents — none of which depend on Nexus being up.
50+
/// The bundle is written to a local zip file. No row is created
51+
/// in the `support_bundle` table.
52+
Collect(CollectArgs),
53+
}
54+
55+
#[derive(Debug, Args)]
56+
struct CollectArgs {
57+
#[command(flatten)]
58+
db_url_opts: DbUrlOptions,
59+
60+
/// Path where the resulting bundle zip will be written.
61+
#[clap(long, short = 'o')]
62+
output: Utf8PathBuf,
63+
64+
/// Reason recorded inside the bundle's metadata.
65+
#[clap(long, default_value = "collected via omdb")]
66+
reason: String,
67+
68+
/// Directory to use for staging the bundle contents before zipping.
69+
#[clap(long, default_value = "/var/tmp")]
70+
tempdir: Utf8PathBuf,
71+
}
72+
73+
impl SupportBundleArgs {
74+
pub async fn run_cmd(
75+
&self,
76+
omdb: &Omdb,
77+
log: &slog::Logger,
78+
) -> anyhow::Result<()> {
79+
match &self.command {
80+
SupportBundleCommands::Collect(args) => args.run(omdb, log).await,
81+
}
82+
}
83+
}
84+
85+
impl CollectArgs {
86+
async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> {
87+
self.db_url_opts
88+
.with_datastore(omdb, log, async |opctx, datastore| {
89+
self.collect(omdb, log, opctx, datastore).await
90+
})
91+
.await
92+
}
93+
94+
async fn collect(
95+
&self,
96+
omdb: &Omdb,
97+
log: &slog::Logger,
98+
opctx: OpContext,
99+
datastore: Arc<DataStore>,
100+
) -> anyhow::Result<()> {
101+
let resolver = omdb.dns_resolver(log.clone()).await?;
102+
103+
let bundle = BundleInfo {
104+
id: SupportBundleUuid::new_v4(),
105+
reason_for_creation: self.reason.clone(),
106+
};
107+
let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string()));
108+
eprintln!("Collecting support bundle {}", bundle.id);
109+
110+
let collection = Arc::new(BundleCollection::new(
111+
datastore,
112+
resolver,
113+
bundle_log,
114+
opctx,
115+
BundleDataSelection::all(),
116+
bundle,
117+
));
118+
119+
// Wire Ctrl-C to cancel the in-flight collection.
120+
let cancel_handle = tokio::spawn({
121+
let token = collection.cancellation_token().clone();
122+
async move {
123+
let _ = tokio::signal::ctrl_c().await;
124+
eprintln!("\nCtrl-C received — cancelling bundle collection.");
125+
token.cancel();
126+
}
127+
});
128+
129+
let dir = tempdir_in(&self.tempdir).with_context(|| {
130+
format!("creating temp dir under {}", self.tempdir)
131+
})?;
132+
let collect_result = collection.collect_bundle_locally(&dir).await;
133+
cancel_handle.abort();
134+
let _ = cancel_handle.await;
135+
let report = collect_result?;
136+
137+
let zip_tempdir = self.tempdir.clone();
138+
let output = self.output.clone();
139+
tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
140+
let mut tempfile = bundle_to_zipfile(&dir, &zip_tempdir)?;
141+
tempfile.seek(SeekFrom::Start(0))?;
142+
let mut out = std::fs::File::create(&output)
143+
.with_context(|| format!("creating {output}"))?;
144+
std::io::copy(&mut tempfile, &mut out)?;
145+
Ok(())
146+
})
147+
.await
148+
.context("zip task panicked")??;
149+
150+
eprintln!("Wrote bundle to {}", self.output);
151+
eprintln!("{} steps executed:", report.steps.len());
152+
for step in &report.steps {
153+
let dur = step.end - step.start;
154+
eprintln!(
155+
" {:>9}ms {:?} {}",
156+
dur.num_milliseconds(),
157+
step.status,
158+
step.name,
159+
);
160+
}
161+
if let Some(ereports) = &report.ereports {
162+
eprintln!(
163+
"ereports: {} found, {} collected, {} errors",
164+
ereports.n_found,
165+
ereports.n_collected,
166+
ereports.errors.len(),
167+
);
168+
}
169+
Ok(())
170+
}
171+
}

dev-tools/omdb/tests/usage_errors.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Commands:
1919
oxql Enter the Oximeter Query Language shell for interactive querying
2020
reconfigurator Interact with the Reconfigurator system
2121
sled-agent Debug a specific Sled
22+
support-bundle Collect or inspect a support bundle
2223
help Print this message or the help of the given subcommand(s)
2324

2425
Options:
@@ -54,6 +55,7 @@ Commands:
5455
oxql Enter the Oximeter Query Language shell for interactive querying
5556
reconfigurator Interact with the Reconfigurator system
5657
sled-agent Debug a specific Sled
58+
support-bundle Collect or inspect a support bundle
5759
help Print this message or the help of the given subcommand(s)
5860

5961
Options:

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -408,8 +408,7 @@ impl SupportBundleCollector {
408408
// to drain in-flight work and return.
409409
let cancel_task = tokio::spawn({
410410
let datastore = self.datastore.clone();
411-
let cancel_opctx =
412-
opctx.child(std::collections::BTreeMap::new());
411+
let cancel_opctx = opctx.child(std::collections::BTreeMap::new());
413412
let token = collection.cancellation_token().clone();
414413
let log = bundle_log.clone();
415414
let bundle_id = bundle.id;
@@ -495,13 +494,9 @@ impl SupportBundleCollector {
495494
.datastore
496495
.zpool_get_sled_if_in_service(opctx, bundle.zpool_id.into())
497496
.await?;
498-
let sled_client = nexus_networking::sled_client(
499-
&self.datastore,
500-
opctx,
501-
sled_id,
502-
log,
503-
)
504-
.await?;
497+
let sled_client =
498+
nexus_networking::sled_client(&self.datastore, opctx, sled_id, log)
499+
.await?;
505500

506501
let zpool = ZpoolUuid::from(bundle.zpool_id);
507502
let dataset = DatasetUuid::from(bundle.dataset_id);
@@ -610,8 +605,7 @@ async fn check_for_cancellation(
610605

611606
match datastore.support_bundle_get(&opctx, bundle_id).await {
612607
Ok(SupportBundle {
613-
state: SupportBundleState::Collecting,
614-
..
608+
state: SupportBundleState::Collecting, ..
615609
}) => {
616610
// Bundle still collecting; continue...
617611
continue;
@@ -700,7 +694,6 @@ impl BackgroundTask for SupportBundleCollector {
700694
mod test {
701695
use super::*;
702696

703-
use support_bundle_collection::perfetto;
704697
use crate::app::support_bundles::SupportBundleQueryType;
705698
use http_body_util::BodyExt;
706699
use nexus_db_model::PhysicalDisk;
@@ -731,6 +724,7 @@ mod test {
731724
};
732725
use sled_agent_types::inventory::ZpoolHealth;
733726
use std::num::NonZeroU64;
727+
use support_bundle_collection::perfetto;
734728
use uuid::Uuid;
735729

736730
type ControlPlaneTestContext =

support-bundle-collection/src/steps/ereports.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,8 @@ async fn save_ereports(
7676
dir: Utf8PathBuf,
7777
status: &mut SupportBundleEreportStatus,
7878
) -> anyhow::Result<()> {
79-
let mut paginator = Paginator::new(
80-
datastore::SQL_BATCH_SIZE,
81-
PaginationOrder::Ascending,
82-
);
79+
let mut paginator =
80+
Paginator::new(datastore::SQL_BATCH_SIZE, PaginationOrder::Ascending);
8381
while let Some(p) = paginator.next() {
8482
let pagparams = p.current_pagparams();
8583
let ereports = tokio::select! {

support-bundle-collection/src/zip.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@
88
//! the directory of collected data — both Nexus (for storing on a sled
99
//! agent) and omdb (for writing to local storage).
1010
11+
use ::zip::ZipWriter;
12+
use ::zip::write::FullFileOptions;
1113
use anyhow::Result;
1214
use camino::Utf8DirEntry;
1315
use camino::Utf8Path;
1416
use camino_tempfile::Utf8TempDir;
1517
use camino_tempfile::tempfile_in;
16-
use ::zip::ZipWriter;
17-
use ::zip::write::FullFileOptions;
1818

1919
/// Takes the contents of `dir`, and zips them into a single zipfile
2020
/// stored as a tempfile under `tempdir`.

0 commit comments

Comments
 (0)