Skip to content

Commit 2650ef8

Browse files
authored
fix: disable endpoint refresh functionality + remove worklock dependency (#2890)
### Problem PR #1635 (adhoc endpoint exploration) added a per-endpoint `WorkLock` that the periodic site-explorer loop acquired before probing each BMC. This doesn't scale on large sites. ### Change Back out the work-lock usage introduced for the adhoc exploration feature; it shouldn't touch the work-lock manager at all: - **Periodic loop:** removed the per-endpoint work-lock acquire/skip. - **Adhoc refresh RPC (`RefreshEndpointReport`):** short-circuited to return gRPC `Unavailable` ("Endpoint refresh is temporarily unavailable") for now. - Removed the now-dead `endpoint_exploration_work_key` helper and the lock-coordination tests; replaced the refresh tests with one asserting the disabled behavior. ### Notes - Any orphaned `endpoint_exploration::*` lock rows from the prior version are never read by the new code and age out via keepalive timeout. - Proto is unchanged; the refresh CLI/UI surface the `Unavailable` error cleanly (no panic, no DB write). The UI Refresh button still appears and will error until the feature is re-enabled. ## Related issues <!-- Refer to existing GitHub issues here --> ## Type of Change <!-- Check one that best describes this PR --> - [ ] **Add** - New feature or capability - [ ] **Change** - Changes in existing functionality - [x] **Fix** - Bug fixes - [ ] **Remove** - Removed features or deprecated functionality - [ ] **Internal** - Internal changes (refactoring, tests, docs, etc.) ## Breaking Changes <!-- If checked, describe the breaking changes and migration steps --> <!-- Breaking changes are not generally permitted, please discuss on a GitHub discussion or with the development team if you believe you need to break a backward compatibility guarantee --> - [ ] **This PR contains breaking changes** ## Testing <!-- How was this tested? Check all that apply --> - [x] Unit tests added/updated - [ ] Integration tests added/updated - [ ] Manual testing performed - [ ] No testing required (docs, internal refactor, etc.) ## Additional Notes <!-- Any additional context, deployment notes, or reviewer guidance --> Signed-off-by: Krish Dandiwala <kdandiwala@nvidia.com>
1 parent 898ccf3 commit 2650ef8

3 files changed

Lines changed: 17 additions & 267 deletions

File tree

crates/api-core/src/handlers/site_explorer.rs

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,8 @@ use std::net::{IpAddr, SocketAddr};
1919
use std::str::FromStr;
2020

2121
use ::rpc::forge::{self as rpc, IsBmcInManagedHostResponse};
22-
use carbide_site_explorer::{endpoint_exploration_work_key, enrich_endpoint_exploration_report};
22+
use carbide_site_explorer::enrich_endpoint_exploration_report;
2323
use config_version::ConfigVersion;
24-
use db::work_lock_manager::AcquireLockError;
2524
use model::expected_entity::ExpectedEntity;
2625
use tokio::net::lookup_host;
2726
use tonic::{Request, Response, Status};
@@ -305,11 +304,20 @@ pub(crate) async fn re_explore_endpoint(
305304
Ok(Response::new(()))
306305
}
307306

307+
// Short-circuited: adhoc endpoint refresh is temporarily disabled. The probe
308+
// path below is retained but unreachable until the feature is re-enabled.
309+
#[allow(unreachable_code, unused_variables)]
308310
pub(crate) async fn refresh_endpoint_report(
309311
api: &Api,
310312
request: Request<rpc::RefreshEndpointReportRequest>,
311313
) -> Result<Response<::rpc::site_explorer::ExploredEndpoint>, tonic::Status> {
312314
log_request_data(&request);
315+
316+
return Err(CarbideError::UnavailableError(
317+
"Endpoint refresh is temporarily unavailable".to_string(),
318+
)
319+
.into());
320+
313321
let req = request.into_inner();
314322

315323
let bmc_ip = IpAddr::from_str(&req.ip_address).map_err(CarbideError::from)?;
@@ -352,39 +360,14 @@ pub(crate) async fn refresh_endpoint_report(
352360
.map(ExpectedEntity::PowerShelf)
353361
};
354362

355-
// Acquire the per-endpoint work lock before probing. If the periodic site-explorer
356-
// loop (or another concurrent refresh) is already probing this endpoint, return an
357-
// error immediately rather than running a redundant Redfish call.
358-
let work_lock = match api
359-
.work_lock_manager_handle
360-
.try_acquire_lock(endpoint_exploration_work_key(bmc_ip))
361-
.await
362-
{
363-
Ok(lock) => lock,
364-
Err(AcquireLockError::WorkAlreadyLocked(_)) => {
365-
return Err(CarbideError::AlreadyInProgress(format!(
366-
"Endpoint refresh already in progress for {bmc_ip}"
367-
))
368-
.into());
369-
}
370-
Err(e) => {
371-
return Err(CarbideError::internal(format!(
372-
"Failed to acquire endpoint work lock for {bmc_ip}: {e}"
373-
))
374-
.into());
375-
}
376-
};
377-
378-
// Run the probe + persist on a detached tokio task that owns the work lock.
379-
// Awaiting the JoinHandle preserves the synchronous UX. Even if the caller navigates
380-
// away mid-fetch, the probe will still run to completion.
363+
// Run the probe + persist on a detached tokio task. Awaiting the JoinHandle
364+
// preserves the synchronous UX. Even if the caller navigates away mid-fetch,
365+
// the probe will still run to completion.
381366
let endpoint_explorer = api.endpoint_explorer.clone();
382367
let database_connection = api.database_connection.clone();
383368
let runtime_config = api.runtime_config.clone();
384369

385370
let join_handle = tokio::spawn(async move {
386-
let _work_lock = work_lock;
387-
388371
let start = std::time::Instant::now();
389372
let result = endpoint_explorer
390373
.explore_endpoint(

crates/api-core/src/tests/site_explorer.rs

Lines changed: 3 additions & 207 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,13 @@ use std::str::FromStr;
2020
use std::sync::Arc;
2121

2222
use carbide_site_explorer::config::SiteExplorerConfig;
23-
use carbide_site_explorer::endpoint_exploration_work_key;
2423
use common::api_fixtures::TestEnv;
2524
use db::{self, ObjectColumnFilter};
2625
use ipnetwork::IpNetwork;
2726
use mac_address::MacAddress;
2827
use model::hardware_info::HardwareInfo;
2928
use model::machine::ManagedHostStateSnapshot;
30-
use model::site_explorer::{
31-
Chassis, EndpointExplorationError, EndpointExplorationReport, ExploredEndpoint,
32-
};
29+
use model::site_explorer::{Chassis, EndpointExplorationReport};
3330
use model::test_support::{DpuConfig, ManagedHostConfig};
3431
use rpc::forge::forge_server::Forge;
3532
use rpc::{DiscoveryData, DiscoveryInfo, MachineDiscoveryInfo};
@@ -728,87 +725,13 @@ async fn host_bmc_ip(
728725
Ok(bmc_ip)
729726
}
730727

731-
async fn explored_endpoint(
732-
env: &TestEnv,
733-
bmc_ip: IpAddr,
734-
) -> Result<ExploredEndpoint, Box<dyn std::error::Error>> {
735-
let mut txn = env.pool.begin().await?;
736-
let endpoint = db::explored_endpoints::find_by_ips(txn.as_mut(), vec![bmc_ip])
737-
.await?
738-
.into_iter()
739-
.next()
740-
.unwrap();
741-
txn.commit().await?;
742-
Ok(endpoint)
743-
}
744-
745-
fn endpoint_explore_call_count(env: &TestEnv, bmc_ip: IpAddr) -> usize {
746-
env.endpoint_explorer
747-
.explore_endpoint_calls
748-
.lock()
749-
.unwrap()
750-
.iter()
751-
.filter(|ip| **ip == bmc_ip)
752-
.count()
753-
}
754-
755-
#[sqlx_test]
756-
async fn test_refresh_endpoint_report_bumps_report_version(
757-
pool: PgPool,
758-
) -> Result<(), Box<dyn std::error::Error>> {
759-
let env = common::api_fixtures::create_test_env(pool.clone()).await;
760-
let mh = common::api_fixtures::create_managed_host(&env).await;
761-
let bmc_ip = host_bmc_ip(&env, &mh).await?;
762-
let initial_version = explored_endpoint(&env, bmc_ip).await?.report_version;
763-
764-
env.api
765-
.refresh_endpoint_report(Request::new(rpc::forge::RefreshEndpointReportRequest {
766-
ip_address: bmc_ip.to_string(),
767-
}))
768-
.await?;
769-
770-
let refreshed = explored_endpoint(&env, bmc_ip).await?;
771-
assert!(
772-
refreshed.report_version.version_nr() > initial_version.version_nr(),
773-
"refresh should bump report version from {} to a newer version, got {}",
774-
initial_version.version_nr(),
775-
refreshed.report_version.version_nr()
776-
);
777-
778-
Ok(())
779-
}
780-
781-
#[sqlx_test]
782-
async fn test_refresh_endpoint_report_rejects_nonexistent_endpoint(
783-
pool: PgPool,
784-
) -> Result<(), Box<dyn std::error::Error>> {
785-
let env = common::api_fixtures::create_test_env(pool.clone()).await;
786-
787-
let err = env
788-
.api
789-
.refresh_endpoint_report(Request::new(rpc::forge::RefreshEndpointReportRequest {
790-
ip_address: "99.99.99.99".to_string(),
791-
}))
792-
.await
793-
.unwrap_err();
794-
795-
assert_eq!(err.code(), tonic::Code::NotFound);
796-
797-
Ok(())
798-
}
799-
800728
#[sqlx_test]
801-
async fn test_refresh_endpoint_report_rejects_duplicate_refresh(
729+
async fn test_refresh_endpoint_report_is_unavailable(
802730
pool: PgPool,
803731
) -> Result<(), Box<dyn std::error::Error>> {
804732
let env = common::api_fixtures::create_test_env(pool.clone()).await;
805733
let mh = common::api_fixtures::create_managed_host(&env).await;
806734
let bmc_ip = host_bmc_ip(&env, &mh).await?;
807-
let _endpoint_lock = env
808-
.api
809-
.work_lock_manager_handle
810-
.try_acquire_lock(endpoint_exploration_work_key(bmc_ip))
811-
.await?;
812735

813736
let err = env
814737
.api
@@ -818,134 +741,7 @@ async fn test_refresh_endpoint_report_rejects_duplicate_refresh(
818741
.await
819742
.unwrap_err();
820743

821-
assert_eq!(err.code(), tonic::Code::AlreadyExists);
822-
823-
Ok(())
824-
}
825-
826-
#[sqlx_test]
827-
async fn test_refresh_endpoint_report_lock_blocks_periodic_probe(
828-
pool: PgPool,
829-
) -> Result<(), Box<dyn std::error::Error>> {
830-
let env = common::api_fixtures::create_test_env(pool.clone()).await;
831-
let mh = common::api_fixtures::create_managed_host(&env).await;
832-
let bmc_ip = host_bmc_ip(&env, &mh).await?;
833-
834-
env.api
835-
.re_explore_endpoint(Request::new(rpc::forge::ReExploreEndpointRequest {
836-
ip_address: bmc_ip.to_string(),
837-
if_version_match: None,
838-
}))
839-
.await?;
840-
841-
let calls_before = endpoint_explore_call_count(&env, bmc_ip);
842-
let _endpoint_lock = env
843-
.api
844-
.work_lock_manager_handle
845-
.try_acquire_lock(endpoint_exploration_work_key(bmc_ip))
846-
.await?;
847-
848-
env.run_site_explorer_iteration().await;
849-
850-
assert_eq!(
851-
endpoint_explore_call_count(&env, bmc_ip),
852-
calls_before,
853-
"periodic site explorer probe should be skipped while refresh lock is held"
854-
);
855-
856-
Ok(())
857-
}
858-
859-
#[sqlx_test]
860-
async fn test_refresh_endpoint_report_failure_persists_error_and_bumps_version(
861-
pool: PgPool,
862-
) -> Result<(), Box<dyn std::error::Error>> {
863-
let env = common::api_fixtures::create_test_env(pool.clone()).await;
864-
let mh = common::api_fixtures::create_managed_host(&env).await;
865-
let bmc_ip = host_bmc_ip(&env, &mh).await?;
866-
let initial_version = explored_endpoint(&env, bmc_ip).await?.report_version;
867-
env.endpoint_explorer.insert_endpoint_result(
868-
bmc_ip,
869-
Err(EndpointExplorationError::Unreachable {
870-
details: Some("refresh failure".to_string()),
871-
}),
872-
);
873-
env.api
874-
.refresh_endpoint_report(Request::new(rpc::forge::RefreshEndpointReportRequest {
875-
ip_address: bmc_ip.to_string(),
876-
}))
877-
.await?;
878-
879-
let refreshed = explored_endpoint(&env, bmc_ip).await?;
880-
assert!(
881-
refreshed.report_version.version_nr() > initial_version.version_nr(),
882-
"failed refresh should still bump report version"
883-
);
884-
assert!(
885-
refreshed.report.last_exploration_error.is_some(),
886-
"failed refresh should persist the exploration error"
887-
);
888-
889-
Ok(())
890-
}
891-
892-
#[sqlx_test]
893-
async fn test_refresh_endpoint_report_clears_pending_requested_exploration(
894-
pool: PgPool,
895-
) -> Result<(), Box<dyn std::error::Error>> {
896-
let env = common::api_fixtures::create_test_env(pool.clone()).await;
897-
let mh = common::api_fixtures::create_managed_host(&env).await;
898-
let bmc_ip = host_bmc_ip(&env, &mh).await?;
899-
900-
env.api
901-
.re_explore_endpoint(Request::new(rpc::forge::ReExploreEndpointRequest {
902-
ip_address: bmc_ip.to_string(),
903-
if_version_match: None,
904-
}))
905-
.await?;
906-
assert!(explored_endpoint(&env, bmc_ip).await?.exploration_requested);
907-
908-
env.api
909-
.refresh_endpoint_report(Request::new(rpc::forge::RefreshEndpointReportRequest {
910-
ip_address: bmc_ip.to_string(),
911-
}))
912-
.await?;
913-
914-
assert!(
915-
!explored_endpoint(&env, bmc_ip).await?.exploration_requested,
916-
"refresh should clear the pending requested exploration so the endpoint is not immediately probed again as priority work"
917-
);
918-
919-
Ok(())
920-
}
921-
922-
#[sqlx_test]
923-
async fn test_refresh_endpoint_report_lock_is_per_endpoint(
924-
pool: PgPool,
925-
) -> Result<(), Box<dyn std::error::Error>> {
926-
let env = common::api_fixtures::create_test_env(pool.clone()).await;
927-
let mh_a = common::api_fixtures::create_managed_host(&env).await;
928-
let mh_b = common::api_fixtures::create_managed_host(&env).await;
929-
let bmc_ip_a = host_bmc_ip(&env, &mh_a).await?;
930-
let bmc_ip_b = host_bmc_ip(&env, &mh_b).await?;
931-
let initial_version_b = explored_endpoint(&env, bmc_ip_b).await?.report_version;
932-
let _endpoint_lock = env
933-
.api
934-
.work_lock_manager_handle
935-
.try_acquire_lock(endpoint_exploration_work_key(bmc_ip_a))
936-
.await?;
937-
938-
env.api
939-
.refresh_endpoint_report(Request::new(rpc::forge::RefreshEndpointReportRequest {
940-
ip_address: bmc_ip_b.to_string(),
941-
}))
942-
.await?;
943-
944-
let refreshed_b = explored_endpoint(&env, bmc_ip_b).await?;
945-
assert!(
946-
refreshed_b.report_version.version_nr() > initial_version_b.version_nr(),
947-
"lock for endpoint {bmc_ip_a} should not block refresh for endpoint {bmc_ip_b}"
948-
);
744+
assert_eq!(err.code(), tonic::Code::Unavailable);
949745

950746
Ok(())
951747
}

crates/site-explorer/src/lib.rs

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ pub use machine_creator::MachineCreator;
7676
pub mod explored_endpoint_index;
7777
mod managed_host;
7878
use db::ObjectColumnFilter;
79-
use db::work_lock_manager::{AcquireLockError, WorkLockManagerHandle};
79+
use db::work_lock_manager::WorkLockManagerHandle;
8080
pub use managed_host::is_endpoint_in_managed_host;
8181
use model::DpuModel;
8282
use model::expected_machine::DpuMode;
@@ -253,14 +253,6 @@ impl<'a> Endpoint<'a> {
253253

254254
pub type SiteIdentifiedHosts = Vec<(ExploredManagedHost, EndpointExplorationReport)>;
255255

256-
/// Work-lock key for a single endpoint exploration.
257-
///
258-
/// Both the site-explorer loop (`update_explored_endpoints`) and the adhoc
259-
/// `RefreshEndpointReport` handler acquire this key before probing Redfish.
260-
pub fn endpoint_exploration_work_key(bmc_ip: IpAddr) -> String {
261-
format!("SiteExplorer::endpoint_exploration::{bmc_ip}")
262-
}
263-
264256
/// The SiteExplorer periodically runs [modules](machine_update_module::MachineUpdateModule) to initiate upgrades of machine components.
265257
/// On each iteration the SiteExplorer will:
266258
/// 1. collect the number of outstanding updates from all modules.
@@ -1994,7 +1986,6 @@ impl SiteExplorer {
19941986
let bmc_target_addr = SocketAddr::new(endpoint.address, bmc_target_port);
19951987
let fw_config_snapshot = fw_config_snapshot.clone();
19961988
let database_connection = self.database_connection.clone();
1997-
let work_lock_manager_handle = self.work_lock_manager_handle.clone();
19981989

19991990
task_set.push(
20001991
async move {
@@ -2010,26 +2001,6 @@ impl SiteExplorer {
20102001
.await
20112002
.expect("Semaphore can't be closed");
20122003

2013-
// If the endpoint is locked, we skip exploration.
2014-
let work_key = endpoint_exploration_work_key(endpoint.address);
2015-
let _work_lock = match work_lock_manager_handle.try_acquire_lock(work_key).await
2016-
{
2017-
Ok(work_lock) => work_lock,
2018-
Err(AcquireLockError::WorkAlreadyLocked(_)) => {
2019-
tracing::info!(
2020-
address = %endpoint.address,
2021-
"Skipping periodic endpoint exploration; adhoc refresh already in progress"
2022-
);
2023-
return Ok(None);
2024-
}
2025-
Err(e) => {
2026-
return Err(SiteExplorerError::internal(format!(
2027-
"Failed to acquire per-endpoint work lock for {}: {e}",
2028-
endpoint.address
2029-
)));
2030-
}
2031-
};
2032-
20332004
let mut result = endpoint_explorer
20342005
.explore_endpoint(
20352006
bmc_target_addr,

0 commit comments

Comments
 (0)