Skip to content

Commit 3dc8f45

Browse files
authored
fix(ssh): use port 2222 for inter-pod SSH connections
The generated inter-pod SSH config was missing the Port directive, causing ssh to connect on the default port 22 while okdev-sshd listens on port 2222. Also install openssh-client in the E2E test container and wait for the ssh client to become available before attempting inter-pod SSH.
1 parent 9d6ff40 commit 3dc8f45

3 files changed

Lines changed: 15 additions & 1 deletion

File tree

internal/cli/ssh.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,7 @@ func buildInterPodSSHConfig(user string, endpoints []interPodSSHEndpoint) string
460460
for _, endpoint := range endpoints {
461461
fmt.Fprintf(&b, "Host %s\n", endpoint.PodName)
462462
fmt.Fprintf(&b, " HostName %s\n", endpoint.PodIP)
463+
fmt.Fprintf(&b, " Port 2222\n")
463464
fmt.Fprintf(&b, " User %s\n", user)
464465
fmt.Fprintln(&b, " IdentityFile ~/.ssh/okdev_interpod_ed25519")
465466
fmt.Fprintln(&b, " IdentitiesOnly yes")

internal/cli/ssh_interpod_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ func TestBuildInterPodSSHConfigIncludesSessionPods(t *testing.T) {
3636
for _, want := range []string{
3737
"Host trainer-master-0",
3838
"HostName 10.0.0.11",
39+
"Port 2222",
3940
"Host trainer-worker-0",
4041
"HostName 10.0.0.12",
4142
"IdentityFile ~/.ssh/okdev_interpod_ed25519",

scripts/e2e_kind_pytorchjob_interpod_ssh.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ cd "$WORKDIR"
6464
MANIFEST_PATH="$WORKDIR/.okdev/pytorchjob.yaml"
6565
replace_all_in_file "$MANIFEST_PATH" 'name: dev' 'name: pytorch'
6666
replace_all_in_file "$MANIFEST_PATH" 'image: # TODO: replace with your image' 'image: ubuntu:22.04'
67-
replace_all_in_file "$MANIFEST_PATH" 'command: ["sleep", "infinity"]' 'command: ["sh", "-lc", "trap : TERM INT; while true; do sleep 3600; done"]'
67+
replace_all_in_file "$MANIFEST_PATH" 'command: ["sleep", "infinity"]' 'command: ["sh", "-lc", "apt-get update -qq && apt-get install -y -qq openssh-client >/dev/null 2>&1; trap : TERM INT; while true; do sleep 3600; done"]'
6868
replace_all_in_file "$CFG_PATH" 'container: dev' 'container: pytorch'
6969

7070
python3 - <<'PY' "$CFG_PATH"
@@ -116,6 +116,18 @@ if [[ "$WORKER_CONTAINERS" != *"okdev-sidecar"* ]]; then
116116
fi
117117
echo "worker sidecar override verified"
118118

119+
echo "Waiting for ssh client to be available in master pod"
120+
for i in $(seq 1 30); do
121+
if "$OKDEV_BIN" --config "$CFG_PATH" --session "$SESSION_NAME" ssh --cmd 'which ssh' >/dev/null 2>&1; then
122+
break
123+
fi
124+
if [[ "$i" -eq 30 ]]; then
125+
echo "ERROR: ssh client not available in master pod after 60s" >&2
126+
exit 1
127+
fi
128+
sleep 2
129+
done
130+
119131
echo "Verifying inter-pod SSH from master to worker"
120132
"$OKDEV_BIN" --config "$CFG_PATH" --session "$SESSION_NAME" ssh --setup-key --cmd 'echo pytorchjob-ssh-ok' >/dev/null
121133
INTERPOD_SSH_OUTPUT=$("$OKDEV_BIN" --config "$CFG_PATH" --session "$SESSION_NAME" ssh --cmd "ssh -o BatchMode=yes $FIRST_WORKER_POD 'echo interpod-ssh-ok'")

0 commit comments

Comments
 (0)