Skip to content

Commit b4b2622

Browse files
authored
feat: Add Site Explorer run status to admin UI (#2591)
## Description

Surfaces the latest Site Explorer iteration status in the admin Site Explorer UI so operators do not have to inspect `carbide-api` pod logs to understand why a run failed.

This PR adds a persisted singleton last-run record for Site Explorer and threads it through the existing site exploration report path. The admin explored-endpoint pages now show:

- Last run status: `Success`, `Failed`, or `No Run`
- Start and finish timestamps
- Endpoint exploration counts: attempted, successful, and errored
- The whole-run failure string when Site Explorer fails before per-endpoint errors can be written, such as missing global credentials

## Type of Change

- [x] **Add** - New feature or capability
- [ ] **Change** - Changes in existing functionality
- [ ] **Fix** - Bug fixes
- [ ] **Remove** - Removed features or deprecated functionality
- [ ] **Internal** - Internal changes (refactoring, tests, docs, etc.)

## Related Issues

Fixes #2046

## Breaking Changes

- [ ] This PR contains breaking changes

No breaking API behavior is expected. The RPC report gains an optional `last_run` field, and the database migration adds a new singleton table used only for the latest Site Explorer run metadata.

## Implementation Notes

- Adds `SiteExplorerLastRun` to the API model and Site Explorer report RPC.
- Adds `site_explorer_run_status` storage with upsert/fetch helpers.
- Records run metadata at the end of each `run_single_iteration`, including failures that occur before endpoint exploration starts.
- Keeps failure-to-record status non-fatal so a reporting write problem does not mask the original Site Explorer result.
- Adds a reusable admin template for the last-run status block and includes it on all explored-endpoint list views.

## Testing

- [ ] Unit tests added/updated
- [x] Integration tests added/updated
- [ ] Manual testing performed
- [ ] No testing required (docs, internal refactor, etc.)

Validated with:

```bash
cargo fmt --all
cargo fmt --all -- --check
cargo make check-format-nightly
git diff --check
cargo test -p carbide-site-explorer test_site_explorer_records_last_run -- --nocapture
cargo test -p carbide-api-web test_site_explorer_run_status_banner -- --nocapture
```

The database-backed tests were run against a disposable Postgres 14.5 container with `DATABASE_URL` set for the SQLx test harness.

---------

Signed-off-by: Hasan Khan <hasank@nvidia.com>
1 parent c4c4d09 commit b4b2622

29 files changed

Lines changed: 1003 additions & 32 deletions

crates/admin-cli/src/rpc.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,7 @@ impl ApiClient {
664664
&self,
665665
page_size: usize,
666666
) -> CarbideCliResult<::rpc::site_explorer::SiteExplorationReport> {
667+
let last_run = self.get_site_explorer_last_run().await?;
667668
// grab endpoints
668669
let endpoint_ids = match self.0.find_explored_endpoint_ids().await {
669670
Ok(endpoint_ids) => endpoint_ids,
@@ -698,9 +699,20 @@ impl ApiClient {
698699
Ok(::rpc::site_explorer::SiteExplorationReport {
699700
endpoints: all_endpoints.endpoints,
700701
managed_hosts: all_hosts,
702+
last_run,
701703
})
702704
}
703705

706+
/// Fetches the most recent Site Explorer run record from the API.
///
/// Returns the `last_run` field of the RPC response, which may be `None`.
///
/// A failure whose status is accepted by `maybe_unimplemented` is not treated as an
/// error: `Ok(None)` is returned instead (presumably so the CLI keeps working against
/// a server that does not implement this RPC — confirm against `maybe_unimplemented`).
///
/// # Errors
/// Any other failure status is converted with `into()` and returned.
pub async fn get_site_explorer_last_run(
707+
&self,
708+
) -> CarbideCliResult<Option<::rpc::site_explorer::SiteExplorerLastRun>> {
709+
match self.0.get_site_explorer_last_run().await {
710+
Ok(response) => Ok(response.last_run),
711+
Err(status) if maybe_unimplemented(&status) => Ok(None),
712+
Err(status) => Err(status.into()),
713+
}
714+
}
715+
704716
pub async fn get_explored_endpoints_by_ids(
705717
&self,
706718
endpoint_ids: &[String],

crates/admin-cli/src/site_explorer/get_report/cmd.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ async fn get_exploration_report_for_bmc_address(
184184
Ok(::rpc::site_explorer::SiteExplorationReport {
185185
endpoints: endpoints.endpoints,
186186
managed_hosts: managed_host,
187+
last_run: None,
187188
})
188189
}
189190

crates/api-core/src/api.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,13 @@ impl Forge for Api {
10641064
crate::handlers::site_explorer::get_site_exploration_report(self, request).await
10651065
}
10661066

1067+
// Service entry point for the Site Explorer last-run RPC. It does no work of its own:
// the request is passed unchanged to the handler in `crate::handlers::site_explorer`
// and that handler's result is returned as-is.
async fn get_site_explorer_last_run(
1068+
&self,
1069+
request: Request<()>,
1070+
) -> Result<Response<::rpc::site_explorer::SiteExplorerLastRunResponse>, Status> {
1071+
crate::handlers::site_explorer::get_site_explorer_last_run(self, request).await
1072+
}
1073+
10671074
async fn find_explored_endpoint_ids(
10681075
&self,
10691076
request: Request<::rpc::site_explorer::ExploredEndpointSearchFilter>,

crates/api-core/src/auth/internal_rbac_rules.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ impl InternalRBACRules {
270270
"GetSiteExplorationReport",
271271
vec![ForgeAdminCLI, Machineatron],
272272
);
273+
x.perm("GetSiteExplorerLastRun", vec![ForgeAdminCLI, Machineatron]);
273274
x.perm("ClearSiteExplorationError", vec![ForgeAdminCLI]);
274275
x.perm("IsBmcInManagedHost", vec![ForgeAdminCLI]);
275276
x.perm("Explore", vec![ForgeAdminCLI, Flow]);

crates/api-core/src/handlers/site_explorer.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,21 @@ pub(crate) async fn get_site_exploration_report(
150150
Ok(tonic::Response::new(report.into()))
151151
}
152152

153+
/// Handles the Site Explorer last-run RPC.
///
/// Logs the request, reads the stored last-run record through the API's database
/// reader, and returns it converted into its RPC representation. The response's
/// `last_run` is `None` when `fetch` finds no record.
///
/// # Errors
/// A failure from `db::site_explorer_run_status::fetch` is propagated with `?`.
pub(crate) async fn get_site_explorer_last_run(
154+
api: &Api,
155+
request: tonic::Request<()>,
156+
) -> Result<Response<::rpc::site_explorer::SiteExplorerLastRunResponse>, Status> {
157+
log_request_data(&request);
158+
159+
let last_run = db::site_explorer_run_status::fetch(&mut api.db_reader()).await?;
160+
161+
Ok(tonic::Response::new(
162+
::rpc::site_explorer::SiteExplorerLastRunResponse {
163+
last_run: last_run.map(Into::into),
164+
},
165+
))
166+
}
167+
153168
pub(crate) async fn find_explored_mlx_device_host_ids(
154169
api: &Api,
155170
request: Request<::rpc::site_explorer::ExploredMlxDeviceHostSearchFilter>,

crates/api-core/src/tests/common/api_fixtures/mod.rs

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1647,14 +1647,7 @@ pub async fn create_test_env_with_overrides(
16471647
.build_for_manual_iterations(cancel_token.clone())
16481648
.expect("Unable to build RackStateController");
16491649

1650-
let fake_endpoint_explorer = MockEndpointExplorer {
1651-
reports: Arc::new(std::sync::Mutex::new(Default::default())),
1652-
power_states: Arc::new(std::sync::Mutex::new(Default::default())),
1653-
redfish_power_control_calls: Arc::new(std::sync::Mutex::new(Default::default())),
1654-
power_control_failures: Arc::new(std::sync::Mutex::new(Default::default())),
1655-
set_nic_mode_calls: Arc::new(std::sync::Mutex::new(Default::default())),
1656-
explore_endpoint_calls: Arc::new(std::sync::Mutex::new(Default::default())),
1657-
};
1650+
let fake_endpoint_explorer = MockEndpointExplorer::default();
16581651

16591652
// The API server is launched with a disabled site-explorer config so that it doesn't launch one
16601653
// on its own. TestEnv's site_explorer is a separate instance talking to the same database that
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
-- Store the operator-facing result of the most recent Site Explorer iteration.
2+
-- Endpoint rows already hold per-endpoint exploration errors; this singleton
3+
-- captures whole-run failures such as missing global credentials or database
4+
-- setup issues that otherwise only appear in nico-api logs.
5+
CREATE TABLE site_explorer_run_status (
6+
-- Singleton key: the CHECK pins id to 1, so the table can hold at most one row.
id smallint PRIMARY KEY DEFAULT 1 CHECK (id = 1),
7+
started_at timestamptz NOT NULL,
8+
finished_at timestamptz NOT NULL,
9+
success boolean NOT NULL,
10+
error text,
11+
failure_category text,
12+
endpoint_explorations bigint NOT NULL,
13+
endpoint_explorations_success bigint NOT NULL,
14+
endpoint_explorations_failed bigint NOT NULL,
15+
last_successful_finished_at timestamptz,
16+
last_failed_finished_at timestamptz,
17+
CONSTRAINT site_explorer_run_status_finished_after_started
18+
CHECK (finished_at >= started_at),
19+
CONSTRAINT site_explorer_run_status_endpoint_explorations_non_negative
20+
CHECK (endpoint_explorations >= 0),
21+
CONSTRAINT site_explorer_run_status_endpoint_explorations_success_non_negative
22+
CHECK (endpoint_explorations_success >= 0),
23+
CONSTRAINT site_explorer_run_status_endpoint_explorations_failed_non_negative
24+
CHECK (endpoint_explorations_failed >= 0),
25+
CONSTRAINT site_explorer_run_status_endpoint_explorations_within_total
26+
-- The comparison is <=, not =: success + failed may be smaller than the total.
CHECK (endpoint_explorations_success + endpoint_explorations_failed <= endpoint_explorations),
27+
CONSTRAINT site_explorer_run_status_success_has_no_failure_category
28+
-- Only failure_category is tied to success here; the error column is not constrained.
CHECK (NOT success OR failure_category IS NULL)
29+
);

crates/api-db/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ pub mod retained_boot_interface;
8585
pub mod route_servers;
8686
pub mod secrets;
8787
pub mod site_exploration_report;
88+
pub mod site_explorer_run_status;
8889
pub mod sku;
8990
pub mod spx_partition;
9091
pub mod state_history;

crates/api-db/src/site_exploration_report.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ where
2626
for<'db> &'db mut DB: DbReader<'db>,
2727
{
2828
let endpoints = crate::explored_endpoints::find_all(&mut *db).await?;
29-
let managed_hosts = crate::explored_managed_host::find_all(db).await?;
29+
let managed_hosts = crate::explored_managed_host::find_all(&mut *db).await?;
30+
let last_run = crate::site_explorer_run_status::fetch(&mut *db).await?;
3031
Ok(SiteExplorationReport {
32+
last_run,
3133
endpoints,
3234
managed_hosts,
3335
})
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
use chrono::{DateTime, Utc};
19+
use model::site_explorer::SiteExplorerLastRun;
20+
use sqlx::{FromRow, PgConnection};
21+
22+
use crate::db_read::DbReader;
23+
use crate::{DatabaseError, DatabaseResult};
24+
25+
const LAST_RUN_ID: i16 = 1;
26+
27+
/// Row shape decoded from `site_explorer_run_status` via `FromRow`.
///
/// The field names match the column names selected by `fetch`. The `id` column is
/// not part of this struct. This type is private to the module; callers receive the
/// model type `SiteExplorerLastRun` instead.
#[derive(Debug, FromRow)]
28+
struct DbSiteExplorerLastRun {
29+
started_at: DateTime<Utc>,
30+
finished_at: DateTime<Utc>,
31+
success: bool,
32+
error: Option<String>,
33+
failure_category: Option<String>,
34+
endpoint_explorations: i64,
35+
endpoint_explorations_success: i64,
36+
endpoint_explorations_failed: i64,
37+
last_successful_finished_at: Option<DateTime<Utc>>,
38+
last_failed_finished_at: Option<DateTime<Utc>>,
39+
}
40+
41+
// Converts the database row type into the model type. Every field is moved across
// under the same name; no value is transformed or defaulted.
impl From<DbSiteExplorerLastRun> for SiteExplorerLastRun {
42+
fn from(run: DbSiteExplorerLastRun) -> Self {
43+
Self {
44+
started_at: run.started_at,
45+
finished_at: run.finished_at,
46+
success: run.success,
47+
error: run.error,
48+
failure_category: run.failure_category,
49+
endpoint_explorations: run.endpoint_explorations,
50+
endpoint_explorations_success: run.endpoint_explorations_success,
51+
endpoint_explorations_failed: run.endpoint_explorations_failed,
52+
last_successful_finished_at: run.last_successful_finished_at,
53+
last_failed_finished_at: run.last_failed_finished_at,
54+
}
55+
}
56+
}
57+
58+
/// Fetches metadata for the latest site explorer run.
///
/// Selects the row of `site_explorer_run_status` whose `id` equals `LAST_RUN_ID`.
///
/// Returns `Ok(None)` when no such row exists (the query uses `fetch_optional`);
/// otherwise the row converted into a `SiteExplorerLastRun`.
///
/// # Errors
/// A failed query is wrapped with `DatabaseError::query`, which is given the SQL text.
59+
pub async fn fetch(db: impl DbReader<'_>) -> DatabaseResult<Option<SiteExplorerLastRun>> {
60+
let query = "SELECT started_at, finished_at, success, error, failure_category, endpoint_explorations, endpoint_explorations_success, endpoint_explorations_failed, last_successful_finished_at, last_failed_finished_at
61+
FROM site_explorer_run_status
62+
WHERE id = $1";
63+
64+
sqlx::query_as::<_, DbSiteExplorerLastRun>(query)
65+
.bind(LAST_RUN_ID)
66+
.fetch_optional(db)
67+
.await
68+
.map(|run| run.map(Into::into))
69+
.map_err(|e| DatabaseError::query(query, e))
70+
}
71+
72+
/// Replaces metadata for the latest site explorer run.
///
/// Writes the single row keyed by `LAST_RUN_ID`. The row is inserted if absent;
/// otherwise (`ON CONFLICT (id) DO UPDATE`) the run timestamps, `success`, `error`,
/// `failure_category` and the three endpoint counters are overwritten with the
/// values from `last_run`.
///
/// The two "last ... finished at" columns are derived in SQL rather than copied:
/// - `last_successful_finished_at` is set to `finished_at` when `last_run.success`
///   is true. Otherwise the value supplied in `last_run` is used, and on update a
///   `NULL` there falls back to the value already stored in the row.
/// - `last_failed_finished_at` follows the same rule with the condition inverted.
///
/// # Errors
/// A failed statement is wrapped with `DatabaseError::query`, which is given the SQL text.
73+
pub async fn upsert(txn: &mut PgConnection, last_run: &SiteExplorerLastRun) -> DatabaseResult<()> {
74+
let query = "INSERT INTO site_explorer_run_status (
75+
id,
76+
started_at,
77+
finished_at,
78+
success,
79+
error,
80+
failure_category,
81+
endpoint_explorations,
82+
endpoint_explorations_success,
83+
endpoint_explorations_failed,
84+
last_successful_finished_at,
85+
last_failed_finished_at
86+
)
87+
VALUES (
88+
$1, $2, $3, $4, $5, $6, $7, $8, $9,
89+
CASE WHEN $4 THEN $3 ELSE $10 END,
90+
CASE WHEN NOT $4 THEN $3 ELSE $11 END
91+
)
92+
ON CONFLICT (id) DO UPDATE SET
93+
started_at = EXCLUDED.started_at,
94+
finished_at = EXCLUDED.finished_at,
95+
success = EXCLUDED.success,
96+
error = EXCLUDED.error,
97+
failure_category = EXCLUDED.failure_category,
98+
endpoint_explorations = EXCLUDED.endpoint_explorations,
99+
endpoint_explorations_success = EXCLUDED.endpoint_explorations_success,
100+
endpoint_explorations_failed = EXCLUDED.endpoint_explorations_failed,
101+
last_successful_finished_at = CASE
102+
WHEN EXCLUDED.success THEN EXCLUDED.finished_at
103+
ELSE COALESCE(
104+
EXCLUDED.last_successful_finished_at,
105+
site_explorer_run_status.last_successful_finished_at
106+
)
107+
END,
108+
last_failed_finished_at = CASE
109+
WHEN NOT EXCLUDED.success THEN EXCLUDED.finished_at
110+
ELSE COALESCE(
111+
EXCLUDED.last_failed_finished_at,
112+
site_explorer_run_status.last_failed_finished_at
113+
)
114+
END";
115+
116+
// The binds below are positional: their order corresponds to $1..$11 in the statement.
sqlx::query(query)
117+
.bind(LAST_RUN_ID)
118+
.bind(last_run.started_at)
119+
.bind(last_run.finished_at)
120+
.bind(last_run.success)
121+
.bind(&last_run.error)
122+
.bind(&last_run.failure_category)
123+
.bind(last_run.endpoint_explorations)
124+
.bind(last_run.endpoint_explorations_success)
125+
.bind(last_run.endpoint_explorations_failed)
126+
.bind(last_run.last_successful_finished_at)
127+
.bind(last_run.last_failed_finished_at)
128+
.execute(txn)
129+
.await
130+
.map_err(|e| DatabaseError::query(query, e))?;
131+
132+
Ok(())
133+
}

0 commit comments

Comments
 (0)