From a9252856d2890284c4682cbb262858e8ef7204d8 Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:50:03 +0800 Subject: [PATCH 1/7] Update train_node1.sh In cloud servers, the environment variable names may differ from those used here (for example, in Tencent Cloud's DDP environment, NNODES is actually named WORLD_SIZE). This can cause torch.distributed.run to fail to recognize nnodes, preventing the master node and worker nodes from discovering each other. --- examples/train/multi-node/swift/train_node1.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/train/multi-node/swift/train_node1.sh b/examples/train/multi-node/swift/train_node1.sh index 76d5dc9581..f69f16fc08 100644 --- a/examples/train/multi-node/swift/train_node1.sh +++ b/examples/train/multi-node/swift/train_node1.sh @@ -1,12 +1,12 @@ nnodes=2 nproc_per_node=4 -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -NNODES=$nnodes \ -NODE_RANK=0 \ -MASTER_ADDR=127.0.0.1 \ -MASTER_PORT=29500 \ -NPROC_PER_NODE=$nproc_per_node \ +export CUDA_VISIBLE_DEVICES=0,1,2,3 \ +export NNODES=$nnodes \ +export NODE_RANK=0 \ +export MASTER_ADDR=127.0.0.1 \ +export MASTER_PORT=29500 \ +export NPROC_PER_NODE=$nproc_per_node \ swift sft \ --model Qwen/Qwen2.5-7B-Instruct \ --tuner_type full \ From c3b18ae74323e0934f7818520f793387e5156aed Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:50:38 +0800 Subject: [PATCH 2/7] Update train_node2.sh In cloud servers, the environment variable names may differ from those used here (for example, in Tencent Cloud's DDP environment, NNODES is actually named WORLD_SIZE). This can cause torch.distributed.run to fail to recognize nnodes, preventing the master node and worker nodes from discovering each other. --- examples/train/multi-node/swift/train_node2.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/train/multi-node/swift/train_node2.sh b/examples/train/multi-node/swift/train_node2.sh index 4a64b23bdf..5a92b2aa97 100644 --- a/examples/train/multi-node/swift/train_node2.sh +++ b/examples/train/multi-node/swift/train_node2.sh @@ -1,12 +1,12 @@ nnodes=2 nproc_per_node=4 -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -NNODES=$nnodes \ -NODE_RANK=1 \ -MASTER_ADDR=xxx.xxx.xxx.xxx \ -MASTER_PORT=29500 \ -NPROC_PER_NODE=$nproc_per_node \ +export CUDA_VISIBLE_DEVICES=0,1,2,3 \ +export NNODES=$nnodes \ +export NODE_RANK=1 \ +export MASTER_ADDR=xxx.xxx.xxx.xxx \ +export MASTER_PORT=29500 \ +export NPROC_PER_NODE=$nproc_per_node \ swift sft \ --model Qwen/Qwen2.5-7B-Instruct \ --tuner_type full \ From 1869abefc38475a2a80798fe2845071149e6ab00 Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:59:48 +0800 Subject: [PATCH 3/7] Update train_node1.sh --- examples/train/multi-node/swift/train_node1.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/train/multi-node/swift/train_node1.sh b/examples/train/multi-node/swift/train_node1.sh index f69f16fc08..175cf58d65 100644 --- a/examples/train/multi-node/swift/train_node1.sh +++ b/examples/train/multi-node/swift/train_node1.sh @@ -1,12 +1,12 @@ nnodes=2 nproc_per_node=4 -export CUDA_VISIBLE_DEVICES=0,1,2,3 \ -export NNODES=$nnodes \ -export NODE_RANK=0 \ -export MASTER_ADDR=127.0.0.1 \ -export MASTER_PORT=29500 \ -export NPROC_PER_NODE=$nproc_per_node \ +export CUDA_VISIBLE_DEVICES=0,1,2,3 +export NNODES=$nnodes +export NODE_RANK=0 +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=29500 +export NPROC_PER_NODE=$nproc_per_node swift sft \ --model Qwen/Qwen2.5-7B-Instruct \ --tuner_type full \ From 428c427673690d1ee25be34912ee357ebd1637a6 Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:00:03 +0800 Subject: [PATCH 4/7] Update train_node2.sh --- examples/train/multi-node/swift/train_node2.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/train/multi-node/swift/train_node2.sh b/examples/train/multi-node/swift/train_node2.sh index 5a92b2aa97..0e0d677603 100644 --- a/examples/train/multi-node/swift/train_node2.sh +++ b/examples/train/multi-node/swift/train_node2.sh @@ -1,12 +1,12 @@ nnodes=2 nproc_per_node=4 -export CUDA_VISIBLE_DEVICES=0,1,2,3 \ -export NNODES=$nnodes \ -export NODE_RANK=1 \ -export MASTER_ADDR=xxx.xxx.xxx.xxx \ -export MASTER_PORT=29500 \ -export NPROC_PER_NODE=$nproc_per_node \ +export CUDA_VISIBLE_DEVICES=0,1,2,3 +export NNODES=$nnodes +export NODE_RANK=1 +export MASTER_ADDR=xxx.xxx.xxx.xxx +export MASTER_PORT=29500 +export NPROC_PER_NODE=$nproc_per_node swift sft \ --model Qwen/Qwen2.5-7B-Instruct \ --tuner_type full \ From 3996ec015d48fd53ec5cc0f7d3f25491b404b5e9 Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:03:48 +0800 Subject: [PATCH 5/7] Update examples/train/multi-node/swift/train_node2.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- examples/train/multi-node/swift/train_node2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train/multi-node/swift/train_node2.sh b/examples/train/multi-node/swift/train_node2.sh index 0e0d677603..b93c7e8ed2 100644 --- a/examples/train/multi-node/swift/train_node2.sh +++ b/examples/train/multi-node/swift/train_node2.sh @@ -4,7 +4,7 @@ nproc_per_node=4 export CUDA_VISIBLE_DEVICES=0,1,2,3 export NNODES=$nnodes export NODE_RANK=1 -export MASTER_ADDR=xxx.xxx.xxx.xxx +export MASTER_ADDR=xxx.xxx.xxx.xxx # FIXME: Replace with the IP address of the master node (node 1) export MASTER_PORT=29500 export NPROC_PER_NODE=$nproc_per_node swift sft \ From aa69b184699757d5052ccb164bda939b3a8ad2d5 Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:48:19 +0000 Subject: [PATCH 6/7] Update train_node1.sh --- examples/train/multi-node/swift/train_node1.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/train/multi-node/swift/train_node1.sh b/examples/train/multi-node/swift/train_node1.sh index 175cf58d65..1d652ad907 100644 --- a/examples/train/multi-node/swift/train_node1.sh +++ b/examples/train/multi-node/swift/train_node1.sh @@ -1,12 +1,12 @@ nnodes=2 nproc_per_node=4 -export CUDA_VISIBLE_DEVICES=0,1,2,3 -export NNODES=$nnodes -export NODE_RANK=0 -export MASTER_ADDR=127.0.0.1 -export MASTER_PORT=29500 -export NPROC_PER_NODE=$nproc_per_node +export CUDA_VISIBLE_DEVICES=0,1,2,3 +export NNODES=$nnodes +export NODE_RANK=0 +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=29500 +export NPROC_PER_NODE=$nproc_per_node swift sft \ --model Qwen/Qwen2.5-7B-Instruct \ --tuner_type full \ From 714b6174a78591d7d075bcd1f3c2af89ff9930b3 Mon Sep 17 00:00:00 2001 From: Huangfu Yuanxiang <57837413+huangfu170@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:49:02 +0000 Subject: [PATCH 7/7] Update train_node2.sh --- examples/train/multi-node/swift/train_node2.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/train/multi-node/swift/train_node2.sh b/examples/train/multi-node/swift/train_node2.sh index b93c7e8ed2..b2a9a80230 100644 --- a/examples/train/multi-node/swift/train_node2.sh +++ b/examples/train/multi-node/swift/train_node2.sh @@ -1,11 +1,11 @@ nnodes=2 nproc_per_node=4 -export CUDA_VISIBLE_DEVICES=0,1,2,3 -export NNODES=$nnodes -export NODE_RANK=1 +export CUDA_VISIBLE_DEVICES=0,1,2,3 +export NNODES=$nnodes +export NODE_RANK=1 export MASTER_ADDR=xxx.xxx.xxx.xxx # FIXME: Replace with the IP address of the master node (node 1) -export MASTER_PORT=29500 +export MASTER_PORT=29500 export NPROC_PER_NODE=$nproc_per_node swift sft \ --model Qwen/Qwen2.5-7B-Instruct \