Skip to content

Commit e3994e0

Browse files
authored
cockroach-admin: Allow partial success parsing cockroach node status --all rows (#10072)
The `cockroach-admin` server runs `cockroach node status --all` both to find its local node ID and to service the `/node/status` endpoint. We have no production callers of the latter, but inventory relies on the former. We've previously seen that we can't get reasonable status from decommissioned nodes (in particular, we rely on socket addresses being present, and decommissioned nodes have `NULL` socket addresses) and so already skip rows from decommissioned nodes. However, in #10068 we saw similar invalid rows from not-yet-decommissioned nodes, which broke inventory collection. This change relaxes our parsing: instead of bailing on any failure parsing a single row, we accumulate separate vecs of successfully parsed rows and failures parsing rows and return both. `cockroach-admin` logs the errors but otherwise ignores them. I don't love this but I _think_ it should be okay; a failure mode here is "we fail to parse any rows at all", but in practice the effect of having an empty vec of successfully-parsed rows is the same as failing the call entirely: we won't be able to find our local node ID or report any useful node status. I think we'd be better off jettisoning this parsing entirely, if there's a way we could get at all this information from another source (either the prometheus endpoint we're already scraping, or maaaaaybe poking at files on disk, e.g., if there's a file that contains the local node ID?). But either of those would be larger changes, and this should unblock testing that bumped into the parsing issue. Closes #10068.
1 parent f828af7 commit e3994e0

7 files changed

Lines changed: 273 additions & 29 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cockroach-admin/src/cockroach_cli.rs

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use cockroach_admin_types::node::{NodeDecommission, NodeStatus, ParseError};
77
use dropshot::HttpError;
88
use illumos_utils::ExecutionError;
99
use illumos_utils::output_to_exec_error;
10+
use slog::Logger;
11+
use slog::warn;
1012
use slog_error_chain::InlineErrorChain;
1113
use slog_error_chain::SlogInlineError;
1214
use std::io;
@@ -159,20 +161,40 @@ impl CockroachCli {
159161

160162
pub async fn node_status(
161163
&self,
164+
log: &Logger,
162165
) -> Result<Vec<NodeStatus>, CockroachCliError> {
163-
self.invoke_cli_with_format_csv(
164-
["node", "status", "--all"],
165-
NodeStatus::parse_from_csv,
166-
"node status",
167-
)
168-
.await
166+
let (statuses, errs) = self
167+
.invoke_cli_with_format_csv(
168+
["node", "status", "--all"],
169+
NodeStatus::parse_from_csv,
170+
"node status",
171+
)
172+
.await?;
173+
if !errs.is_empty() {
174+
warn!(
175+
log,
176+
"partial failure parsing `node status --all` output: \
177+
successfully parsed {} of {} rows",
178+
statuses.len(),
179+
statuses.len() + errs.len(),
180+
);
181+
for err in errs {
182+
warn!(
183+
log,
184+
"skipped row from `node status --all` due to parse error";
185+
InlineErrorChain::new(&err),
186+
);
187+
}
188+
}
189+
Ok(statuses)
169190
}
170191

171192
pub async fn node_decommission(
172193
&self,
173194
node_id: &str,
195+
log: &Logger,
174196
) -> Result<NodeDecommission, CockroachCliError> {
175-
let statuses = self.node_status().await?;
197+
let statuses = self.node_status(log).await?;
176198
self.validate_node_decommissionable(node_id, statuses)?;
177199
self.invoke_node_decommission(node_id).await
178200
}
@@ -423,7 +445,8 @@ mod tests {
423445
#[tokio::test]
424446
async fn test_node_status_compatibility() {
425447
let logctx = dev::test_setup_log("test_node_status_compatibility");
426-
let db = TestDatabase::new_populate_nothing(&logctx.log).await;
448+
let log = &logctx.log;
449+
let db = TestDatabase::new_populate_nothing(log).await;
427450
let db_url = db.crdb().listen_url().to_string();
428451

429452
let expected_headers = "id,address,sql_address,build,started_at,updated_at,locality,is_available,is_live,replicas_leaders,replicas_leaseholders,ranges,ranges_unavailable,ranges_underreplicated,live_bytes,key_bytes,value_bytes,intent_bytes,system_bytes,gossiped_replicas,is_decommissioning,membership,is_draining";
@@ -463,7 +486,7 @@ mod tests {
463486
cockroach_address,
464487
SocketAddr::V6(cockroach_address),
465488
);
466-
let status = cli.node_status().await.expect("got node status");
489+
let status = cli.node_status(log).await.expect("got node status");
467490

468491
// We can't check all the fields exactly, but some we know based on the
469492
// fact that our test database is a single node.

cockroach-admin/src/context.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ impl ServerContext {
113113
// crdb instance.
114114
let node_statuses = self
115115
.cockroach_cli()
116-
.node_status()
116+
.node_status(&self.log)
117117
.await
118118
.context("failed to get node status")?;
119119

cockroach-admin/src/http_entrypoints.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,11 @@ impl CockroachAdminApi for CockroachAdminImpl {
4848
rqctx: RequestContext<Self::Context>,
4949
) -> Result<HttpResponseOk<ClusterNodeStatus>, HttpError> {
5050
let ctx = rqctx.context();
51-
let all_nodes =
52-
ctx.cockroach_cli().node_status().await.map_err(HttpError::from)?;
51+
let all_nodes = ctx
52+
.cockroach_cli()
53+
.node_status(ctx.log())
54+
.await
55+
.map_err(HttpError::from)?;
5356
Ok(HttpResponseOk(ClusterNodeStatus { all_nodes }))
5457
}
5558

cockroach-admin/types/versions/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@ thiserror.workspace = true
1818

1919
[dev-dependencies]
2020
proptest.workspace = true
21+
slog-error-chain.workspace = true
2122
test-strategy.workspace = true

0 commit comments

Comments
 (0)