Skip to content

Commit ea86e66

Browse files
pierugo-dfinityIDX GitHub Automation
andauthored
refactor(ic-admin): [CON-1637] halt a subnet without necessarily provisioning SSH keys (#9780)
This PR changes the arguments `ssh_readonly_access` and `ssh_node_state_write_access` in `ic-admin`'s `ProposeToTakeSubnetOfflineForRepairsCmd` command to be optional. This allows to halt the subnet without provisioning SSH keys as [suggested](#7361 (comment)) in the PR introducing the command. This allows to replace all halting commands in`ic-recovery` with this one. This PR also addresses Leo's other comments in the linked PR. Behavioural changes/remarks: - It was not and still is not possible to halt the subnet while provisioning an empty list of SSH keys, i.e. clearing the current list. You can either overwrite the list with a non-empty list or (thanks to this PR) do no change. I cannot think of any realistic use-case for overwriting with an empty list so this should be fine (note that this is a limitation from `ic-admin`, this is still possible through a direct call to the governance canister's `SetSubnetOpereationalLevel`). - Unhalting will now always clear the list of SSH keys. In particular: - App subnet recoveries' `Unhalt` step will clear the list instead of overwriting it with a singleton empty string (this is actually an interesting artifact that you can deduce that a subnet was previously recovered by checking whether its `SubnetRecord::ssh_readonly_access` field is `[""]` instead of `[]`). - When unhalting the source subnet in subnet splitting, the list will be cleared instead of doing no change (see change in `subnet_splitting.rs`. This is actually probably the intended effect, which ensures that the SSH readonly key that was deployed to the subnet is cleared. - As a (maybe unintended) side-effect, this will also clear the list in the destination subnet, which could have been non-empty when creating the subnet, but again I doubt this corresponds to any realistic scenario. --------- Co-authored-by: IDX GitHub Automation <infra+github-automation@dfinity.org>
1 parent d62aac1 commit ea86e66

7 files changed

Lines changed: 106 additions & 111 deletions

File tree

rs/recovery/src/admin_helper.rs

Lines changed: 71 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -81,35 +81,6 @@ impl AdminHelper {
8181
}
8282
}
8383

84-
pub fn get_halt_subnet_command(
85-
&self,
86-
subnet_id: SubnetId,
87-
is_halted: bool,
88-
keys: &[String],
89-
) -> IcAdmin {
90-
let mut ic_admin = self.get_ic_admin_cmd_base();
91-
92-
ic_admin
93-
.add_positional_argument("propose-to-update-subnet")
94-
.add_argument("subnet", subnet_id)
95-
.add_argument("is-halted", is_halted);
96-
if !keys.is_empty() {
97-
ic_admin.add_arguments(SSH_READONLY_ACCESS_ARG, keys.iter().map(quote));
98-
}
99-
ic_admin.add_argument(
100-
SUMMARY_ARG,
101-
quote(format!(
102-
"{} subnet {}, for recovery and update ssh readonly access",
103-
if is_halted { "Halt" } else { "Unhalt" },
104-
subnet_id,
105-
)),
106-
);
107-
108-
self.add_proposer_args(&mut ic_admin);
109-
110-
ic_admin
111-
}
112-
11384
pub fn get_propose_to_take_subnet_offline_for_repairs_command(
11485
&self,
11586
subnet_id: SubnetId,
@@ -423,24 +394,89 @@ mod tests {
423394
const FAKE_REPLICA_VERSION: &str = "fake_replica_version";
424395

425396
#[test]
426-
fn get_halt_subnet_command_test() {
397+
fn get_propose_to_take_subnet_offline_for_repairs_without_keys_command_test() {
427398
let result = fake_admin_helper()
428-
.get_halt_subnet_command(
399+
.get_propose_to_take_subnet_offline_for_repairs_command(
400+
subnet_id_from_str(FAKE_SUBNET_ID_1),
401+
&[],
402+
&BTreeMap::new(),
403+
)
404+
.join(" ");
405+
406+
assert_eq!(
407+
result,
408+
"/fake/ic/admin/dir/ic-admin \
409+
--nns-url \"https://fake_nns_url.com:8080/\" \
410+
propose-to-take-subnet-offline-for-repairs \
411+
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
412+
--summary \"Take subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe offline for recovery, updating readonly and write access ssh keys\" \
413+
--test-neuron-proposer"
414+
);
415+
}
416+
417+
#[test]
418+
fn get_propose_to_take_subnet_offline_for_repairs_with_readonly_key_command_test() {
419+
let result = fake_admin_helper()
420+
.get_propose_to_take_subnet_offline_for_repairs_command(
421+
subnet_id_from_str(FAKE_SUBNET_ID_1),
422+
&["fake public key".to_string()],
423+
&BTreeMap::new(),
424+
)
425+
.join(" ");
426+
427+
assert_eq!(
428+
result,
429+
"/fake/ic/admin/dir/ic-admin \
430+
--nns-url \"https://fake_nns_url.com:8080/\" \
431+
propose-to-take-subnet-offline-for-repairs \
432+
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
433+
--ssh-readonly-access \"fake public key\" \
434+
--summary \"Take subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe offline for recovery, updating readonly and write access ssh keys\" \
435+
--test-neuron-proposer"
436+
);
437+
}
438+
439+
#[test]
440+
fn get_propose_to_take_subnet_offline_for_repairs_with_readonly_and_write_keys_command_test() {
441+
let result = fake_admin_helper()
442+
.get_propose_to_take_subnet_offline_for_repairs_command(
429443
subnet_id_from_str(FAKE_SUBNET_ID_1),
430-
/*is_halted=*/ true,
431444
&["fake public key".to_string()],
445+
&BTreeMap::from([(
446+
node_id_from_str(FAKE_NODE_ID),
447+
vec!["fake write access public key".to_string()],
448+
)]),
432449
)
433450
.join(" ");
434451

435452
assert_eq!(
436453
result,
437454
"/fake/ic/admin/dir/ic-admin \
438455
--nns-url \"https://fake_nns_url.com:8080/\" \
439-
propose-to-update-subnet \
456+
propose-to-take-subnet-offline-for-repairs \
440457
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
441-
--is-halted true \
442458
--ssh-readonly-access \"fake public key\" \
443-
--summary \"Halt subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe, for recovery and update ssh readonly access\" \
459+
--ssh-node-state-write-access \"nqpqw-cp42a-rmdsx-fpui3-ncne5-kzq6o-m67an-w25cx-zu636-lcf2v-fqe:fake write access public key\" \
460+
--summary \"Take subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe offline for recovery, updating readonly and write access ssh keys\" \
461+
--test-neuron-proposer"
462+
);
463+
}
464+
465+
#[test]
466+
fn get_propose_to_bring_subnet_back_online_after_repairs_command_test() {
467+
let result = fake_admin_helper()
468+
.get_propose_to_bring_subnet_back_online_after_repairs_command(subnet_id_from_str(
469+
FAKE_SUBNET_ID_1,
470+
))
471+
.join(" ");
472+
473+
assert_eq!(
474+
result,
475+
"/fake/ic/admin/dir/ic-admin \
476+
--nns-url \"https://fake_nns_url.com:8080/\" \
477+
propose-to-bring-subnet-back-online-after-repairs \
478+
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
479+
--summary \"Bring subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe back online after repairs\" \
444480
--test-neuron-proposer"
445481
);
446482
}

rs/recovery/src/app_subnet_recovery.rs

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -401,20 +401,18 @@ impl RecoveryIterator<StepType, StepTypeIter> for AppSubnetRecovery {
401401
vec![]
402402
};
403403

404-
if let Some((node_id, pub_key)) = self.params.write_node_id_and_pub_key.clone() {
405-
Ok(Box::new(self.recovery.take_subnet_offline_for_repairs(
404+
Ok(Box::new(
405+
self.recovery.take_subnet_offline_for_repairs(
406406
self.params.subnet_id,
407407
&subnet_readonly_keys,
408-
&BTreeMap::from([(node_id, vec![pub_key])]),
409-
)))
410-
} else {
411-
// TODO (CON-1637): Remove this branch and only use `take_subnet_offline_for_repairs`
412-
Ok(Box::new(self.recovery.halt_subnet(
413-
self.params.subnet_id,
414-
true,
415-
&subnet_readonly_keys,
416-
)))
417-
}
408+
&self
409+
.params
410+
.write_node_id_and_pub_key
411+
.clone()
412+
.map(|(node_id, pub_key)| BTreeMap::from([(node_id, vec![pub_key])]))
413+
.unwrap_or_default(),
414+
),
415+
))
418416
}
419417

420418
StepType::DownloadCertifications => {
@@ -580,22 +578,10 @@ impl RecoveryIterator<StepType, StepTypeIter> for AppSubnetRecovery {
580578
}
581579
}
582580

583-
StepType::Unhalt => {
584-
if self.params.write_node_id_and_pub_key.is_some() {
585-
Ok(Box::new(
586-
self.recovery
587-
.bring_subnet_back_online_after_repairs(self.params.subnet_id),
588-
))
589-
} else {
590-
// TODO (CON-1637): Remove this branch and only use
591-
// `bring_subnet_back_online_after_repairs`
592-
Ok(Box::new(self.recovery.halt_subnet(
593-
self.params.subnet_id,
594-
false,
595-
&["".to_string()],
596-
)))
597-
}
598-
}
581+
StepType::Unhalt => Ok(Box::new(
582+
self.recovery
583+
.bring_subnet_back_online_after_repairs(self.params.subnet_id),
584+
)),
599585

600586
StepType::Cleanup => Ok(Box::new(self.recovery.get_cleanup_step())),
601587
}

rs/recovery/src/lib.rs

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -255,21 +255,6 @@ impl Recovery {
255255
Ok(max_height)
256256
}
257257

258-
/// Return a recovery [AdminStep] to halt or unhalt the given subnet
259-
pub fn halt_subnet(
260-
&self,
261-
subnet_id: SubnetId,
262-
is_halted: bool,
263-
keys: &[String],
264-
) -> impl Step + use<> {
265-
AdminStep {
266-
logger: self.logger.clone(),
267-
ic_admin_cmd: self
268-
.admin_helper
269-
.get_halt_subnet_command(subnet_id, is_halted, keys),
270-
}
271-
}
272-
273258
/// Return a recovery [AdminStep] to take the given subnet offline for repairs
274259
pub fn take_subnet_offline_for_repairs(
275260
&self,

rs/recovery/subnet_splitting/src/subnet_splitting.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -307,11 +307,8 @@ impl SubnetSplitting {
307307
}
308308

309309
fn unhalt(&self, target_subnet: TargetSubnet) -> impl Step + use<> {
310-
self.recovery.halt_subnet(
311-
self.subnet_id(target_subnet),
312-
/*is_halted=*/ false,
313-
/*keys=*/ &[],
314-
)
310+
self.recovery
311+
.bring_subnet_back_online_after_repairs(self.subnet_id(target_subnet))
315312
}
316313

317314
fn propose_cup(&self, target_subnet: TargetSubnet) -> RecoveryResult<impl Step + use<>> {

rs/registry/admin/bin/main.rs

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ enum SubCommand {
509509
///
510510
/// This is the first step of subnet recovery. Previously the first step was
511511
/// done using propose-to-update-subnet. However, that does not support
512-
/// changing ssn_node_state_write_access, which is needed when a subnet has
512+
/// changing ssh_node_state_write_access, which is needed when a subnet has
513513
/// SEV enabled everywhere (or the subnet has no DFINITY node).
514514
///
515515
/// At the end of subnet recovery, run propose-to-bring-subnet-back-online.
@@ -839,7 +839,7 @@ struct ProposeToTakeSubnetOfflineForRepairsCmd {
839839
/// This usually isn't a problem, because the list is typically empty when
840840
/// this proposal is used as part of subnet recovery.
841841
#[clap(long, num_args(1..))]
842-
pub ssh_readonly_access: Vec<String>,
842+
pub ssh_readonly_access: Option<Vec<String>>,
843843

844844
/// Similar to the --ssh-readonly-access flag, but there are some important
845845
/// differences:
@@ -862,7 +862,7 @@ struct ProposeToTakeSubnetOfflineForRepairsCmd {
862862
/// clobber. However, in practice, it would probably be empty to begin with,
863863
/// so most likely, this won't be an issue.
864864
#[clap(long, value_parser, num_args(1..))]
865-
pub ssh_node_state_write_access: Vec<NodeSshAccessFlagValue>,
865+
pub ssh_node_state_write_access: Option<Vec<NodeSshAccessFlagValue>>,
866866

867867
/// List of replica version IDs to recall. These versions will be marked as
868868
/// recalled for this subnet, preventing them from being upgraded to them. If the
@@ -899,15 +899,13 @@ impl ProposalPayload<SetSubnetOperationalLevelPayload> for ProposeToTakeSubnetOf
899899
let ssh_node_state_write_access = self
900900
.ssh_node_state_write_access
901901
.clone()
902-
.into_iter()
903-
.map(NodeSshAccess::from)
904-
.collect();
902+
.map(|keys| keys.into_iter().map(NodeSshAccess::from).collect());
905903

906904
SetSubnetOperationalLevelPayload {
907905
subnet_id: Some(SubnetId::from(self.subnet)),
908906
operational_level: Some(operational_level::DOWN_FOR_REPAIRS),
909-
ssh_readonly_access: Some(self.ssh_readonly_access.clone()),
910-
ssh_node_state_write_access: Some(ssh_node_state_write_access),
907+
ssh_readonly_access: self.ssh_readonly_access.clone(),
908+
ssh_node_state_write_access,
911909
recalled_replica_version_ids,
912910
}
913911
}
@@ -928,14 +926,14 @@ impl FromStr for NodeSshAccessFlagValue {
928926
let node_id = parts.next().ok_or("Missing node ID.")?;
929927
let public_keys = parts
930928
.next()
931-
.ok_or("Missing semicolon separating node ID and SSH public key.")?
929+
.ok_or("Missing semicolon separating node ID and SSH public keys.")?
932930
.to_string();
933931

934932
// Parse node_id.
935933
let node_id = PrincipalId::from_str(node_id).map_err(|err| format!("{err}"))?;
936934
let node_id = NodeId::from(node_id);
937935

938-
// Parse public_keys, by simply splitting on ','.<
936+
// Parse public_keys, by simply splitting on ','.
939937
let public_keys = public_keys
940938
.split(',')
941939
.map(|public_key| public_key.to_string())

rs/tests/consensus/subnet_recovery/common.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -560,13 +560,7 @@ fn app_subnet_recovery_test(env: TestEnv, cfg: TestConfig) {
560560
let f = (cfg.subnet_size - 1) / 3;
561561
break_nodes(&app_nodes.take(f + 1).collect::<Vec<_>>(), &logger);
562562
} else {
563-
halt_subnet(
564-
&admin_helper,
565-
&download_state_node,
566-
app_subnet_id,
567-
&[],
568-
&logger,
569-
)
563+
halt_subnet(&admin_helper, &download_state_node, app_subnet_id, &logger);
570564
}
571565
assert_subnet_is_broken(
572566
&download_state_node.get_public_url(),
@@ -1010,7 +1004,7 @@ fn corrupt_latest_cup(
10101004
"Did not find log entry that CUP is corrupted"
10111005
);
10121006

1013-
unhalt_subnet(admin_helper, subnet.subnet_id, &[], logger);
1007+
unhalt_subnet(admin_helper, subnet.subnet_id, logger);
10141008
}
10151009

10161010
/// Print ID and IP of the source subnet, the first app subnet found that is not the source, and all

rs/tests/consensus/subnet_recovery/utils.rs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use ic_system_test_driver::{
1818
};
1919
use ic_types::SubnetId;
2020
use slog::{Logger, info};
21-
use std::{fmt::Debug, path::PathBuf};
21+
use std::{collections::BTreeMap, fmt::Debug, path::PathBuf};
2222
use url::Url;
2323

2424
pub const READONLY_USERNAME: &str = "readonly";
@@ -86,7 +86,6 @@ pub fn halt_subnet(
8686
admin_helper: &AdminHelper,
8787
subnet_node: &IcNodeSnapshot,
8888
subnet_id: SubnetId,
89-
keys: &[String],
9089
logger: &Logger,
9190
) {
9291
info!(logger, "Halting subnet {subnet_id}...");
@@ -96,7 +95,11 @@ pub fn halt_subnet(
9695

9796
AdminStep {
9897
logger: logger.clone(),
99-
ic_admin_cmd: admin_helper.get_halt_subnet_command(subnet_id, true, keys),
98+
ic_admin_cmd: admin_helper.get_propose_to_take_subnet_offline_for_repairs_command(
99+
subnet_id,
100+
&[],
101+
&BTreeMap::new(),
102+
),
100103
}
101104
.exec()
102105
.expect("Failed to halt subnet");
@@ -115,16 +118,12 @@ pub fn halt_subnet(
115118

116119
/// Unhalt the given subnet by executing the corresponding ic-admin command through the given NNS
117120
/// URL.
118-
pub fn unhalt_subnet(
119-
admin_helper: &AdminHelper,
120-
subnet_id: SubnetId,
121-
keys: &[String],
122-
logger: &Logger,
123-
) {
121+
pub fn unhalt_subnet(admin_helper: &AdminHelper, subnet_id: SubnetId, logger: &Logger) {
124122
info!(logger, "Unhalting subnet {subnet_id}...");
125123
AdminStep {
126124
logger: logger.clone(),
127-
ic_admin_cmd: admin_helper.get_halt_subnet_command(subnet_id, false, keys),
125+
ic_admin_cmd: admin_helper
126+
.get_propose_to_bring_subnet_back_online_after_repairs_command(subnet_id),
128127
}
129128
.exec()
130129
.expect("Failed to unhalt subnet");

0 commit comments

Comments
 (0)