Skip to content

Commit 4f6c1c2

Browse files
authored
TQ: Shrink the ack requirements for partial commit (#9852)
Currently we wait for N-Z acked commits to allow a new configuration to be installed in the `Committing` state. For a 32 node system this means that 29 nodes have to ack. If more than 3 nodes are offline than a customer cannot add or remove any sleds. This is an unnecessarily harsh restriction. This PR changes it so that only K+Z nodes must ack commits to go from `Preparing` to `CommittedPartially`, similar to the prepare phase of the protocol that takes us from `Preparing` to `Committing`. We bump Z a little bit for larger cluster to balance things out. Now, only 21 nodes must ack commits for new configurations to be accepted. This allows up to 11 nodes to be offline and eliminates the need for an escape hatch for oxide support if 4 or 5 nodes rather than 3 are offline. If 11 nodes are offline, something major is going on and really, oxide support should be involved. We will not allow automatic triage via omdb in this situation. This PR also makes acking commits idempotent in the datastore layer, which fixes a potential bug in the case of multiple nexuses trying to commit the same nodes simultaneously. Fixes #9826
1 parent 3ee0bcb commit 4f6c1c2

2 files changed

Lines changed: 239 additions & 11 deletions

File tree

nexus/db-queries/src/db/datastore/trust_quorum.rs

Lines changed: 235 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -584,9 +584,9 @@ impl DataStore {
584584

585585
if last_committed_config.state.is_committing() {
586586
let acked = last_committed_config.acked_commits();
587-
// N - Z
588-
let min = last_committed_config.members.len()
589-
- last_committed_config.commit_crash_tolerance as usize;
587+
// K + Z
588+
let min = last_committed_config.threshold.0 as usize
589+
+ last_committed_config.commit_crash_tolerance as usize;
590590
bail_unless!(
591591
acked >= min,
592592
"Trust quorum reconfiguration in progress: not enough \
@@ -1158,10 +1158,6 @@ impl DataStore {
11581158
.filter(dsl::epoch.eq(epoch))
11591159
.filter(dsl::hw_baseboard_id.eq_any(hw_baseboard_ids))
11601160
.filter(dsl::share_digest.is_not_null())
1161-
.filter(dsl::state.eq_any(vec![
1162-
DbTrustQuorumMemberState::Unacked,
1163-
DbTrustQuorumMemberState::Prepared,
1164-
]))
11651161
.set((
11661162
dsl::state.eq(DbTrustQuorumMemberState::Committed),
11671163
dsl::time_committed.eq(Some(Utc::now())),
@@ -1619,7 +1615,7 @@ mod tests {
16191615

16201616
#[tokio::test]
16211617
async fn test_tq_insert_initial_lrtq_upgrade() {
1622-
let logctx = test_setup_log("test_tq_update_prepare_and_commit");
1618+
let logctx = test_setup_log("test_tq_insert_initial_lrtq_upgrade");
16231619
let db = TestDatabase::new_with_datastore(&logctx.log).await;
16241620
let (opctx, datastore) = (db.opctx(), db.datastore());
16251621

@@ -1915,6 +1911,237 @@ mod tests {
19151911
logctx.cleanup_successful();
19161912
}
19171913

1914+
#[tokio::test]
1915+
async fn test_tq_update_prepare_and_commit_partially() {
1916+
let logctx =
1917+
test_setup_log("test_tq_update_prepare_and_commit_partially");
1918+
let db = TestDatabase::new_with_datastore(&logctx.log).await;
1919+
let (opctx, datastore) = (db.opctx(), db.datastore());
1920+
1921+
let hw_ids = insert_hw_baseboard_ids(&db).await;
1922+
let rack_id = RackUuid::new_v4();
1923+
let members: BTreeSet<_> =
1924+
hw_ids.iter().cloned().map(BaseboardId::from).collect();
1925+
1926+
let conn = datastore.pool_connection_for_tests().await.unwrap();
1927+
let coordinator = members.first().unwrap().clone();
1928+
1929+
// Insert an initial config
1930+
DataStore::tq_insert_rss_config_after_handoff(
1931+
opctx,
1932+
&conn,
1933+
rack_id,
1934+
members.clone(),
1935+
coordinator,
1936+
)
1937+
.await
1938+
.unwrap();
1939+
1940+
// Propse a second configuration and successfully insert it
1941+
let config = ProposedTrustQuorumConfig {
1942+
rack_id,
1943+
epoch: Epoch(2),
1944+
is_lrtq_upgrade: IsLrtqUpgrade::No {
1945+
last_committed_epoch: Epoch(1),
1946+
},
1947+
members: members.clone(),
1948+
};
1949+
datastore.tq_insert_latest_config(opctx, config.clone()).await.unwrap();
1950+
1951+
// A configuration returned from a coordinator is part of the trust
1952+
// quorum protocol
1953+
let coordinator_config =
1954+
trust_quorum_types::configuration::Configuration {
1955+
rack_id: config.rack_id,
1956+
epoch: config.epoch,
1957+
coordinator: hw_ids.first().unwrap().clone().into(),
1958+
members: members
1959+
.clone()
1960+
.into_iter()
1961+
.map(|id| (id, Sha3_256Digest([0u8; 32])))
1962+
.collect(),
1963+
threshold: TrustQuorumConfig::threshold(members.len() as u8),
1964+
encrypted_rack_secrets: None,
1965+
};
1966+
1967+
let read_config = datastore
1968+
.tq_get_latest_config(opctx, rack_id)
1969+
.await
1970+
.expect("no error")
1971+
.expect("returned config");
1972+
1973+
// Ack enough nodes to go to the committing state
1974+
let min_acks = coordinator_config.threshold.0 as usize
1975+
+ read_config.commit_crash_tolerance as usize;
1976+
1977+
datastore
1978+
.tq_update_prepare_status(
1979+
opctx,
1980+
coordinator_config.clone(),
1981+
coordinator_config
1982+
.members
1983+
.keys()
1984+
.take(min_acks)
1985+
.cloned()
1986+
.collect(),
1987+
)
1988+
.await
1989+
.unwrap();
1990+
1991+
let read_config = datastore
1992+
.tq_get_latest_config(opctx, rack_id)
1993+
.await
1994+
.expect("no error")
1995+
.expect("returned config");
1996+
1997+
// We've acked enough nodes and should have written our status to the DB
1998+
assert_eq!(read_config.epoch, config.epoch);
1999+
assert_eq!(read_config.state, TrustQuorumConfigState::Committing);
2000+
assert!(read_config.encrypted_rack_secrets.is_none());
2001+
assert_eq!(
2002+
min_acks,
2003+
read_config
2004+
.members
2005+
.iter()
2006+
.filter(
2007+
|(_, info)| info.state == TrustQuorumMemberState::Prepared
2008+
)
2009+
.count()
2010+
);
2011+
2012+
// Try to start a new configuration with not enough nodes committed
2013+
//
2014+
// Trying to insert a new config should fail since we haven't committed
2015+
// enough nodes.
2016+
let next_config = ProposedTrustQuorumConfig {
2017+
rack_id,
2018+
epoch: Epoch(3),
2019+
is_lrtq_upgrade: IsLrtqUpgrade::No {
2020+
last_committed_epoch: Epoch(2),
2021+
},
2022+
members: members.clone(),
2023+
};
2024+
let e = datastore
2025+
.tq_insert_latest_config(opctx, next_config.clone())
2026+
.await
2027+
.unwrap_err();
2028+
println!("{e}");
2029+
2030+
// Commit at K + Z - 1 nodes
2031+
datastore
2032+
.tq_update_commit_status(
2033+
opctx,
2034+
rack_id,
2035+
config.epoch,
2036+
coordinator_config
2037+
.members
2038+
.keys()
2039+
.take(min_acks - 1)
2040+
.cloned()
2041+
.collect(),
2042+
)
2043+
.await
2044+
.unwrap();
2045+
2046+
let read_config = datastore
2047+
.tq_get_latest_config(opctx, rack_id)
2048+
.await
2049+
.expect("no error")
2050+
.expect("returned config");
2051+
2052+
assert_eq!(read_config.epoch, config.epoch);
2053+
assert_eq!(read_config.state, TrustQuorumConfigState::Committing);
2054+
assert!(read_config.encrypted_rack_secrets.is_none());
2055+
assert_eq!(
2056+
min_acks - 1,
2057+
read_config
2058+
.members
2059+
.iter()
2060+
.filter(
2061+
|(_, info)| info.state == TrustQuorumMemberState::Committed
2062+
)
2063+
.count()
2064+
);
2065+
2066+
// Insert should fail since again, there is not enough nodes:
2067+
//
2068+
// We have acked K+Z - 1 and not K+Z
2069+
let e = datastore
2070+
.tq_insert_latest_config(opctx, next_config.clone())
2071+
.await
2072+
.unwrap_err();
2073+
println!("{e}");
2074+
2075+
// Commit at K + Z
2076+
datastore
2077+
.tq_update_commit_status(
2078+
opctx,
2079+
rack_id,
2080+
config.epoch,
2081+
coordinator_config
2082+
.members
2083+
.keys()
2084+
.take(min_acks)
2085+
.cloned()
2086+
.collect(),
2087+
)
2088+
.await
2089+
.unwrap();
2090+
2091+
let read_config = datastore
2092+
.tq_get_latest_config(opctx, rack_id)
2093+
.await
2094+
.expect("no error")
2095+
.expect("returned config");
2096+
2097+
assert_eq!(read_config.epoch, config.epoch);
2098+
// Even though K+Z nodes have committed, we should still see the state
2099+
// at `Committing`, since a new config has not yet been proposed.
2100+
assert_eq!(read_config.state, TrustQuorumConfigState::Committing);
2101+
assert!(read_config.encrypted_rack_secrets.is_none());
2102+
assert_eq!(
2103+
min_acks,
2104+
read_config
2105+
.members
2106+
.iter()
2107+
.filter(
2108+
|(_, info)| info.state == TrustQuorumMemberState::Committed
2109+
)
2110+
.count()
2111+
);
2112+
2113+
// Now create a new config. This should succeed and we should
2114+
// see the state of the old config move from `Committing` to
2115+
// `CommittedPartially`.
2116+
datastore.tq_insert_latest_config(opctx, next_config).await.unwrap();
2117+
2118+
let read_config = datastore
2119+
.tq_get_config(opctx, rack_id, Epoch(2))
2120+
.await
2121+
.expect("no error")
2122+
.expect("returned config");
2123+
2124+
assert_eq!(read_config.epoch, config.epoch);
2125+
assert_eq!(
2126+
read_config.state,
2127+
TrustQuorumConfigState::CommittedPartially
2128+
);
2129+
assert!(read_config.encrypted_rack_secrets.is_none());
2130+
assert_eq!(
2131+
min_acks,
2132+
read_config
2133+
.members
2134+
.iter()
2135+
.filter(
2136+
|(_, info)| info.state == TrustQuorumMemberState::Committed
2137+
)
2138+
.count()
2139+
);
2140+
2141+
db.terminate().await;
2142+
logctx.cleanup_successful();
2143+
}
2144+
19182145
#[tokio::test]
19192146
async fn test_tq_abort() {
19202147
let logctx = test_setup_log("test_tq_abort");

nexus/types/src/trust_quorum.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,9 +234,10 @@ impl TrustQuorumConfig {
234234
pub fn commit_crash_tolerance(num_members: u8) -> u8 {
235235
match num_members {
236236
0..=3 => 0,
237-
4..=8 => 1,
238-
9..=16 => 2,
239-
_ => 3,
237+
4..=7 => 1,
238+
8..=15 => 2,
239+
16..=23 => 3,
240+
_ => 4,
240241
}
241242
}
242243
}

0 commit comments

Comments
 (0)