From 70961d1fa62ac7f750e433201b6f3d18dd89d70d Mon Sep 17 00:00:00 2001 From: quancs001 Date: Fri, 24 Apr 2026 10:32:32 +0800 Subject: [PATCH 1/6] =?UTF-8?q?=E5=8A=A0=E5=85=A5NPU=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../slime/backends/megatron_utils/__init__.py | 22 ++++++++++++++ slime/slime/backends/megatron_utils/actor.py | 18 +++++++++-- slime/slime/backends/megatron_utils/data.py | 10 +++++-- .../megatron_utils/kernels/int4_qat/setup.py | 2 +- slime/slime/backends/megatron_utils/loss.py | 13 +++++--- .../backends/megatron_utils/model_provider.py | 6 +++- .../megatron_utils/update_weight/common.py | 5 ++++ .../update_weight_from_distributed.py | 6 ++-- .../backends/sglang_utils/sglang_engine.py | 6 +++- slime/slime/ray/actor_group.py | 7 +++-- slime/slime/ray/placement_group.py | 18 +++++++---- slime/slime/ray/rollout.py | 11 +++++-- slime/slime/ray/train_actor.py | 13 ++++++-- slime/slime/utils/common.py | 13 ++++++++ .../utils/external_utils/command_utils.py | 14 ++++++++- slime/slime/utils/memory_utils.py | 14 +++++++-- slime/slime/utils/ppo_utils.py | 2 +- slime/tools/convert_hf_to_torch_dist.py | 30 ++++++++++++++----- 18 files changed, 170 insertions(+), 40 deletions(-) create mode 100644 slime/slime/utils/common.py diff --git a/slime/slime/backends/megatron_utils/__init__.py b/slime/slime/backends/megatron_utils/__init__.py index a4666fbeb9..54d6a7703e 100644 --- a/slime/slime/backends/megatron_utils/__init__.py +++ b/slime/slime/backends/megatron_utils/__init__.py @@ -2,6 +2,10 @@ import torch +from slime.utils.common import is_npu +if is_npu(): + import mindspeed.megatron_adaptor + try: import deep_ep from torch_memory_saver import torch_memory_saver @@ -39,4 +43,22 @@ def _patched_forward(self, *args, packed_seq_params=None, **kwargs): except ImportError: pass +try: + from mbridge.models.qwen3_vl.model import Qwen3VLModel + _original_forward2 = Qwen3VLModel.forward + + def 
_patched_forward2(self, *args, loss_mask=None, **kwargs): + return _original_forward2(self, *args, **kwargs) + Qwen3VLModel.forward = _patched_forward2 +except ImportError: + pass +try: + from megatron.bridge.models.qwen_vl.modelling_qwen3_vl.model import Qwen3VLModel + _original_forward3 = Qwen3VLModel.forward + + def _patched_forward3(self, *args, loss_mask=None, **kwargs): + return _original_forward3(self, *args, **kwargs) + Qwen3VLModel.forward = _patched_forward3 +except ImportError: + pass logging.getLogger("megatron").setLevel(logging.WARNING) diff --git a/slime/slime/backends/megatron_utils/actor.py b/slime/slime/backends/megatron_utils/actor.py index 658069a2e4..8cbc143975 100644 --- a/slime/slime/backends/megatron_utils/actor.py +++ b/slime/slime/backends/megatron_utils/actor.py @@ -10,6 +10,11 @@ import ray import torch import torch.distributed as dist + +from slime.utils.common import is_npu +if is_npu(): + import mindspeed.megatron_adaptor + from mindspeed.megatron_adaptor import repatch from megatron.core import mpu from ray.actor import ActorHandle from torch_memory_saver import torch_memory_saver @@ -150,7 +155,10 @@ def _offload_rollout_data_to_cpu(rollout_data: RolloutBatch) -> None: rollout_data[key] = [v.to("cpu", non_blocking=True) for v in vals] moved_any = True if moved_any: - torch.cuda.synchronize() + if not is_npu(): + torch.cuda.synchronize() + else: + torch.npu.synchronize() class MegatronTrainRayActor(TrainRayActor): @@ -269,6 +277,8 @@ def init( super().init(args, role, with_ref) init(args) + if is_npu(): + repatch(args) if is_megatron_main_rank(): init_tracking(args, primary=False) @@ -1054,8 +1064,12 @@ def connect_actor_critic( group_name = "actor_critic" world_size = 2 + if is_npu(): + backend = "hccl" + else: + backend = "nccl" self._actor_critic_groups = init_process_group( - backend="nccl", + backend=backend, init_method=f"tcp://{master_address}:{master_port}", world_size=world_size, rank=0 if self.role == "actor" else 1, diff 
--git a/slime/slime/backends/megatron_utils/data.py b/slime/slime/backends/megatron_utils/data.py index e1fb2140d7..6c01031ec6 100644 --- a/slime/slime/backends/megatron_utils/data.py +++ b/slime/slime/backends/megatron_utils/data.py @@ -15,6 +15,7 @@ from slime.utils.metric_utils import compute_pass_rate, compute_rollout_step from slime.utils.seqlen_balancing import get_seqlen_balanced_partitions from slime.utils.types import RolloutBatch +from slime.utils.common import is_npu from ...utils import logging_utils from .cp_utils import get_sum_of_sample_mean, slice_with_cp @@ -31,9 +32,12 @@ def _to_cuda(val: object) -> object: if val is None: return None if isinstance(val, torch.Tensor): - if val.is_cuda: - return val - return val.to(device=torch.cuda.current_device(), non_blocking=True) + if is_npu(): + return val.to(device=torch.npu.current_device(), non_blocking=True) + else: + if val.is_cuda: + return val + return val.to(device=torch.cuda.current_device(), non_blocking=True) if isinstance(val, list): return [_to_cuda(v) for v in val] if isinstance(val, tuple): diff --git a/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py b/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py index 8715dd7b8a..6dd9eceb2b 100644 --- a/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py +++ b/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py @@ -43,7 +43,7 @@ f'-gencode=arch=compute_{arch.replace(".", "")},code=sm_{arch.replace(".", "")}' for arch in arch_list ] - + ["-gencode=arch=compute_90a,code=sm_90a"], + + (["-gencode=arch=compute_90a,code=sm_90a"] if not hasattr(torch,'npu') else []), }, ) ], diff --git a/slime/slime/backends/megatron_utils/loss.py b/slime/slime/backends/megatron_utils/loss.py index 9798f40730..82fe273e8d 100644 --- a/slime/slime/backends/megatron_utils/loss.py +++ b/slime/slime/backends/megatron_utils/loss.py @@ -10,6 +10,7 @@ from slime.utils.distributed_utils import distributed_masked_whiten from 
slime.utils.misc import load_function +from slime.utils.common import is_npu logger = logging.getLogger(__name__) @@ -87,7 +88,7 @@ def get_responses( Args: logits: Model outputs with shape ``[1, T, V]`` (policy) or - ``[1, T, 1]`` (value). + ``[1, T, 1]`` (value). Must be float32. args: Configuration containing ``rollout_temperature`` for scaling. unconcat_tokens: List of token tensors (prompt+response) per sample. total_lengths: Total sequence lengths (prompt+response) per sample. @@ -101,6 +102,7 @@ def get_responses( """ qkv_format = args.qkv_format + assert logits.dtype == torch.float32, f"{logits.dtype}" assert len(logits.shape) == 3, f"{logits.shape}" logits_gib = logits.nelement() * logits.element_size() / (1 << 30) @@ -530,8 +532,11 @@ def compute_advantages_and_returns(args: Namespace, rollout_data: RolloutBatch) # loss_masks live on CPU (lazy-loading optimisation). We need GPU copies # for the advantage / KL / normalisation math below. The original CPU # tensors in rollout_data["loss_masks"] are NOT modified. 
- if loss_masks and isinstance(loss_masks[0], torch.Tensor) and not loss_masks[0].is_cuda: - _gpu = torch.cuda.current_device() + if loss_masks and isinstance(loss_masks[0], torch.Tensor) and loss_masks[0].is_cpu: + if is_npu(): + _gpu = torch.npu.current_device() + else: + _gpu = torch.cuda.current_device() loss_masks = [m.to(device=_gpu) for m in loss_masks] if args.kl_coef == 0 or not log_probs: @@ -1198,7 +1203,7 @@ def loss_function( return ( loss, - (num_tokens if args.calculate_per_token_loss else torch.tensor(1, device=logits.device)), + torch.tensor(num_tokens if args.calculate_per_token_loss else 1, device=logits.device), { "keys": list(log.keys()), "values": torch.tensor( diff --git a/slime/slime/backends/megatron_utils/model_provider.py b/slime/slime/backends/megatron_utils/model_provider.py index 09971a5d33..268ea654dc 100644 --- a/slime/slime/backends/megatron_utils/model_provider.py +++ b/slime/slime/backends/megatron_utils/model_provider.py @@ -1,7 +1,6 @@ # Adapt from https://github.com/NVIDIA/Megatron-LM/blob/b1efb3c7126ef7615e8c333432d76e08038e17ff/pretrain_gpt.py import argparse import inspect -import re from contextlib import nullcontext from typing import Literal @@ -114,6 +113,11 @@ def wrapped_model_provider( provider.recompute_method = args.recompute_method provider.recompute_num_layers = args.recompute_num_layers + for key, value in vars(args).items(): + if hasattr(provider, key): + continue + setattr(provider, key, value) + # CLI flags that materially affect train numerics/quality and per-step # speed but are NOT derivable from the HF config. Without these, bridge # mode silently keeps HF-config defaults (e.g. 
attention_dropout=0.1 diff --git a/slime/slime/backends/megatron_utils/update_weight/common.py b/slime/slime/backends/megatron_utils/update_weight/common.py index 07a78ad13d..513286ea51 100644 --- a/slime/slime/backends/megatron_utils/update_weight/common.py +++ b/slime/slime/backends/megatron_utils/update_weight/common.py @@ -11,6 +11,8 @@ from slime.backends.megatron_utils.misc_utils import strip_param_name_prefix from slime.utils.types import ParamInfo +from slime.utils.common import is_npu + _DISABLE_LINEAR_FC1_RECHUNK = os.getenv("SLIME_QWEN35_DISABLE_LINEAR_FC1_RECHUNK", "0") == "1" @@ -41,6 +43,9 @@ def _merge_tp_partitions( if "linear_fc1.weight" in name and not _DISABLE_LINEAR_FC1_RECHUNK: param_partitions = [p.chunk(2, dim=0) for p in param_partitions] param_partitions = [p[0] for p in param_partitions] + [p[1] for p in param_partitions] + # TODO: Temporary workaround for NPU to set partition_dim to 0 + if is_npu(): + partition_dim = 0 # this is bug in megatron's grouped moe. if "linear_fc2.weight" in name and partition_dim == 0: partition_dim = 1 diff --git a/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py b/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py index a8e50e0e43..b3c6ac24a1 100644 --- a/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py +++ b/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py @@ -12,6 +12,7 @@ from tqdm import tqdm from slime.utils.distributed_utils import get_gloo_group, init_process_group +from slime.utils.common import is_npu from ..megatron_to_hf import convert_to_hf from .common import all_gather_param, named_params_and_buffers @@ -253,6 +254,7 @@ def connect_rollout_engines_from_distributed( master_port = sock.getsockname()[1] world_size = len(rollout_engines) * args.rollout_num_gpus_per_engine + 1 + backend = "hccl" if is_npu() else "nccl" refs = [ 
engine.init_weights_update_group.remote( master_address, @@ -260,12 +262,12 @@ def connect_rollout_engines_from_distributed( i * args.rollout_num_gpus_per_engine + 1, world_size, group_name, - backend="nccl", + backend=backend, ) for i, engine in enumerate(rollout_engines) ] model_update_groups = init_process_group( - backend="nccl", + backend=backend, init_method=f"tcp://{master_address}:{master_port}", world_size=world_size, rank=0, diff --git a/slime/slime/backends/sglang_utils/sglang_engine.py b/slime/slime/backends/sglang_utils/sglang_engine.py index c146d4c03a..ad8d83598c 100644 --- a/slime/slime/backends/sglang_utils/sglang_engine.py +++ b/slime/slime/backends/sglang_utils/sglang_engine.py @@ -16,6 +16,7 @@ from .qwen3_5 import is_qwen35_model_path, maybe_prepare_qwen35_text_model, patch_sglang_qwen35 from slime.ray.ray_actor import RayActor from slime.utils.http_utils import get_host_info +from slime.utils.common import is_npu logger = logging.getLogger(__name__) @@ -34,7 +35,10 @@ def get_base_gpu_id(args, rank): def _to_local_gpu_id(physical_gpu_id: int) -> int: - cvd = os.environ.get("CUDA_VISIBLE_DEVICES") + if is_npu(): + cvd = os.environ.get("ASCEND_RT_VISIBLE_DEVICES") + else: + cvd = os.environ.get("CUDA_VISIBLE_DEVICES") if not cvd: return physical_gpu_id # no remapping # CUDA_VISIBLE_DEVICES can be like "4,5,6,7" diff --git a/slime/slime/ray/actor_group.py b/slime/slime/ray/actor_group.py index 8de2bcdcee..00e7888e92 100644 --- a/slime/slime/ray/actor_group.py +++ b/slime/slime/ray/actor_group.py @@ -5,6 +5,7 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from slime.ray.utils import NOSET_VISIBLE_DEVICES_ENV_VARS_LIST +from slime.utils.common import is_npu class RayTrainGroup: @@ -87,19 +88,19 @@ def _allocate_gpus_for_actor(self, pg, num_gpus_per_actor): actor_impl = FSDPTrainRayActor - TrainRayActor = ray.remote(num_gpus=1, runtime_env={"env_vars": env_vars})(actor_impl) - + TrainRayActor = 
ray.remote(runtime_env={"env_vars": env_vars})(actor_impl) + device_name = "NPU" if is_npu() else "GPU" # Create worker actors self._actor_handlers = [] master_addr, master_port = None, None for rank in range(world_size): actor = TrainRayActor.options( num_cpus=num_gpus_per_actor, - num_gpus=num_gpus_per_actor, scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, placement_group_bundle_index=reordered_bundle_indices[rank], ), + resources={device_name: num_gpus_per_actor} ).remote(world_size, rank, master_addr, master_port) if rank == 0: master_addr, master_port = ray.get(actor.get_master_addr_and_port.remote()) diff --git a/slime/slime/ray/placement_group.py b/slime/slime/ray/placement_group.py index 59104c9182..6325ca089b 100644 --- a/slime/slime/ray/placement_group.py +++ b/slime/slime/ray/placement_group.py @@ -4,6 +4,7 @@ import ray from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from slime.utils.common import is_npu from .actor_group import RayTrainGroup from .rollout import RolloutManager @@ -11,10 +12,13 @@ logger = logging.getLogger(__name__) -@ray.remote(num_gpus=1) +@ray.remote class InfoActor: def get_ip_and_gpu_id(self): - return ray.util.get_node_ip_address(), ray.get_gpu_ids()[0] + if is_npu(): + return ray.util.get_node_ip_address(), ray.get_runtime_context().get_accelerator_ids()["NPU"][0] + else: + return ray.util.get_node_ip_address(), ray.get_gpu_ids()[0] def sort_key(x): @@ -35,12 +39,13 @@ def sort_key(x): # representation that allows for sorting. 
node_ip_parts = [ord(c) for c in node_identifier] - return (node_ip_parts, gpu_id) + return (node_ip_parts, int(gpu_id)) def _create_placement_group(num_gpus): """Create a placement group with the specified number of GPUs.""" - bundles = [{"GPU": 1, "CPU": 1} for _ in range(num_gpus)] + device_name = "NPU" if is_npu() else "GPU" + bundles = [{device_name: 1, "CPU": 1} for _ in range(num_gpus)] pg = placement_group(bundles, strategy="PACK") num_bundles = len(bundles) @@ -53,7 +58,8 @@ def _create_placement_group(num_gpus): scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, placement_group_bundle_index=i, - ) + ), + resources={device_name: 1} ).remote() ) gpu_ids = ray.get([actor.get_ip_and_gpu_id.remote() for actor in info_actors]) @@ -201,9 +207,11 @@ def create_training_models(args, pgs, rollout_manager): def create_rollout_manager(args, pg, prm_pg=None): + device_name = "NPU" if is_npu() else "GPU" rollout_manager = RolloutManager.options( num_cpus=1, num_gpus=0, + resources={device_name: 0} ).remote(args, pg, prm_pg) # calculate num_rollout from num_epoch diff --git a/slime/slime/ray/rollout.py b/slime/slime/ray/rollout.py index 0d45c0bec6..54bcc37b26 100644 --- a/slime/slime/ray/rollout.py +++ b/slime/slime/ray/rollout.py @@ -30,6 +30,7 @@ from slime.utils.misc import Box, group_by, load_function from slime.utils.seqlen_balancing import get_seqlen_balanced_partitions from slime.utils.types import Sample +from slime.utils.common import is_npu from ..utils.metric_utils import has_repetition from .utils import NOSET_VISIBLE_DEVICES_ENV_VARS_LIST, Lock @@ -89,7 +90,8 @@ def __init__(self, args, pg, prm_pg=None): self.all_prm_engines = [] self.num_new_prm_engines = 0 self.nodes_per_engine = max(1, args.rollout_num_gpus_per_engine // args.num_gpus_per_node) - self.rollout_engine_lock = Lock.options(num_cpus=1, num_gpus=0).remote() + device_name = "NPU" if is_npu() else "GPU" + self.rollout_engine_lock = Lock.options(num_cpus=1, num_gpus=0, 
resources={device_name: 0}).remote() self.rollout_id = -1 self._metric_checker = MetricChecker.maybe_create(args) @@ -830,6 +832,7 @@ def init_rollout_engines(args, pg, all_rollout_engines): RolloutRayActor = ray.remote(SGLangEngine) rollout_engines = [] + device_name = "NPU" if is_npu() else "GPU" for i in range(num_engines): if all_rollout_engines[i] is not None: continue @@ -849,6 +852,7 @@ def init_rollout_engines(args, pg, all_rollout_engines): env_vars = {name: "1" for name in NOSET_VISIBLE_DEVICES_ENV_VARS_LIST} | { key: os.environ.get(key, default_val) for key, default_val in { + "SGL_JIT_DEEPGEMM_PRECOMPILE": "false", "SGLANG_JIT_DEEPGEMM_PRECOMPILE": "false", "SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK": "true", "SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK": "true", @@ -868,11 +872,11 @@ def init_rollout_engines(args, pg, all_rollout_engines): rollout_engine = RolloutRayActor.options( num_cpus=num_cpus, - num_gpus=num_gpus, scheduling_strategy=scheduling_strategy, runtime_env={ "env_vars": env_vars, }, + resources={device_name: num_gpus} ).remote(args, rank=i, worker_type=worker_type, base_gpu_id=base_gpu_id) rollout_engines.append((i, rollout_engine)) @@ -937,11 +941,12 @@ def init_prm_engines(args, pg, all_prm_engines): }.items() } + device_name = "NPU" if is_npu() else "GPU" prm_engine = RolloutRayActor.options( num_cpus=num_cpus, - num_gpus=num_gpus, scheduling_strategy=scheduling_strategy, runtime_env={"env_vars": env_vars}, + resources={device_name: num_gpus} ).remote(args, rank=i, worker_type="regular", base_gpu_id=base_gpu_id, engine_role="prm") prm_engines.append((i, prm_engine)) diff --git a/slime/slime/ray/train_actor.py b/slime/slime/ray/train_actor.py index 2e900ca5a6..d0a2558386 100644 --- a/slime/slime/ray/train_actor.py +++ b/slime/slime/ray/train_actor.py @@ -13,16 +13,23 @@ from slime.utils.distributed_utils import init_gloo_group from slime.utils.logging_utils import configure_logger from slime.utils.memory_utils import clear_memory, 
print_memory +from slime.utils.common import is_npu logger = logging.getLogger(__name__) def get_local_gpu_id(): - cvd = os.environ.get("CUDA_VISIBLE_DEVICES", None) + if is_npu(): + env_var = "ASCEND_RT_VISIBLE_DEVICES" + device_ids = ray.get_runtime_context().get_accelerator_ids()["NPU"] + else: + env_var = "CUDA_VISIBLE_DEVICES" + device_ids = ray.get_gpu_ids() + cvd = os.environ.get(env_var, None) if cvd is None: - return ray.get_gpu_ids()[0] + return device_ids[0] else: - return cvd.split(",").index(str(ray.get_gpu_ids()[0])) + return cvd.split(",").index(str(device_ids[0])) class TrainRayActor(RayActor): diff --git a/slime/slime/utils/common.py b/slime/slime/utils/common.py new file mode 100644 index 0000000000..3fde3f69ff --- /dev/null +++ b/slime/slime/utils/common.py @@ -0,0 +1,13 @@ +import torch + + +def is_npu() -> bool: + if not hasattr(torch, "npu"): + return False + + if not torch.npu.is_available(): + raise RuntimeError( + "torch_npu detected, but NPU device is not available or visible." 
+ ) + + return True diff --git a/slime/slime/utils/external_utils/command_utils.py b/slime/slime/utils/external_utils/command_utils.py index 9f51ecdf20..bd4a28b52c 100644 --- a/slime/slime/utils/external_utils/command_utils.py +++ b/slime/slime/utils/external_utils/command_utils.py @@ -12,6 +12,7 @@ from slime.utils.external_utils.typer_utils import dataclass_cli from slime.utils.misc import exec_command +from slime.utils.common import is_npu _ = exec_command, dataclass_cli @@ -127,10 +128,11 @@ def execute_train( ) if not external_ray: + gpus_config = "" if is_npu() else f"--num-gpus {num_gpus_per_node}" exec_command( # will prevent ray from buffering stdout/stderr f"export PYTHONBUFFERED=16 && " - f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats" + f"ray start --head --node-ip-address {master_addr} {gpus_config} --disable-usage-stats" ) if (f := before_ray_job_submit) is not None: @@ -140,6 +142,16 @@ def execute_train( { "env_vars": { "PYTHONPATH": "/root/Megatron-LM/", + "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES": "1", + # Replace with actual Ascend toolkit paths + "ASCEND_TOOLKIT_HOME": "/path/to/ascend/ascend-toolkit/latest/", + "ASCEND_OPP_PATH": "/path/to/ascend/ascend-toolkit/latest/opp/", + "ASCEND_AICPU_PATH": "/path/to/ascend/ascend-toolkit/latest/", + "ASCEND_HOME_PATH": "/path/to/ascend/ascend-toolkit/latest/", + "set_env_path": "/path/to/ascend/nnal/atb/set_env.sh", + "HYDRA_FULL_ERROR": "1", + "HCCL_HOST_SOCKET_PORT_RANGE": "60000-60050", + "HCCL_NPU_SOCKET_PORT_RANGE": "61000-61050", # If setting this in FSDP, the computation communication overlapping may have issues **( {} diff --git a/slime/slime/utils/memory_utils.py b/slime/slime/utils/memory_utils.py index c12f3cd0bc..8907826629 100644 --- a/slime/slime/utils/memory_utils.py +++ b/slime/slime/utils/memory_utils.py @@ -3,6 +3,7 @@ import torch import torch.distributed as dist +from slime.utils.common import is_npu logger = 
logging.getLogger(__name__) @@ -12,12 +13,19 @@ def clear_memory(clear_host_memory: bool = False): gc.collect() torch.cuda.empty_cache() if clear_host_memory: - torch._C._host_emptyCache() + if is_npu(): + torch.npu.empty_cache() + else: + torch._C._host_emptyCache() def available_memory(): - device = torch.cuda.current_device() - free, total = torch.cuda.mem_get_info(device) + if is_npu(): + device = torch.npu.current_device() + free, total = torch.npu.mem_get_info(device) + else: + device = torch.cuda.current_device() + free, total = torch.cuda.mem_get_info(device) return { "gpu": str(device), "total_GB": _byte_to_gb(total), diff --git a/slime/slime/utils/ppo_utils.py b/slime/slime/utils/ppo_utils.py index a024754883..121feea7e5 100644 --- a/slime/slime/utils/ppo_utils.py +++ b/slime/slime/utils/ppo_utils.py @@ -647,7 +647,7 @@ def chunked_gae( def _is_oom_error(exc: BaseException) -> bool: - if isinstance(exc, torch.cuda.OutOfMemoryError): + if isinstance(exc, torch.cuda.OutOfMemoryError) or "OutOfMemory" in str(exc): return True return "out of memory" in str(exc).lower() diff --git a/slime/tools/convert_hf_to_torch_dist.py b/slime/tools/convert_hf_to_torch_dist.py index 0995bd8b99..16aaf319ba 100644 --- a/slime/tools/convert_hf_to_torch_dist.py +++ b/slime/tools/convert_hf_to_torch_dist.py @@ -4,6 +4,10 @@ import torch import torch.distributed as dist +from slime.utils.common import is_npu +if is_npu(): + import mindspeed.megatron_adaptor + from mindspeed.megatron_adaptor import repatch from megatron.core.enums import ModelType from megatron.training.arguments import parse_args, validate_args from megatron.training.checkpointing import get_checkpoint_name, get_checkpoint_tracker_filename, save_checkpoint @@ -164,20 +168,32 @@ def main(): local_rank = int(os.getenv("LOCAL_RANK") or os.getenv("SLURM_LOCALID") or 0) global_rank = int(os.getenv("RANK") or os.getenv("SLURM_PROCID") or 0) - torch.cuda.set_device(local_rank) + if is_npu(): + 
torch.npu.set_device(local_rank) + else: + torch.cuda.set_device(local_rank) os.environ.setdefault("WORLD_SIZE", str(world_size)) os.environ.setdefault("RANK", str(global_rank)) os.environ.setdefault("LOCAL_RANK", str(local_rank)) os.environ.setdefault("MASTER_ADDR", "localhost") os.environ.setdefault("MASTER_PORT", "12355") - dist.init_process_group( - backend="nccl", - world_size=world_size, - rank=global_rank, - device_id=torch.device(f"cuda:{local_rank}"), - ) + if is_npu(): + dist.init_process_group( + backend="hccl", + world_size=world_size, + rank=global_rank, + ) + else: + dist.init_process_group( + backend="nccl", + world_size=world_size, + rank=global_rank, + device_id=torch.device(f"cuda:{local_rank}"), + ) args = get_args() init(args) + if is_npu(): + repatch(args) # if using AMD gpus, we have to do the conversion in cpu if hasattr(torch.version, "hip") and torch.version.hip is not None: From bfaca7ceabb2ae0f1c4ec4973f5f7c4cb9fc4e08 Mon Sep 17 00:00:00 2001 From: quancs001 Date: Sat, 25 Apr 2026 19:54:13 +0800 Subject: [PATCH 2/6] +qwen3-4b for NPU --- slime/slime/backends/megatron_utils/loss.py | 1 - slime/train.py | 2 +- toolcall-rl/retool_qwen3_4b_rl_npu.sh | 202 ++++++++++++++++++++ 3 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 toolcall-rl/retool_qwen3_4b_rl_npu.sh diff --git a/slime/slime/backends/megatron_utils/loss.py b/slime/slime/backends/megatron_utils/loss.py index 82fe273e8d..c8fe1122e3 100644 --- a/slime/slime/backends/megatron_utils/loss.py +++ b/slime/slime/backends/megatron_utils/loss.py @@ -102,7 +102,6 @@ def get_responses( """ qkv_format = args.qkv_format - assert logits.dtype == torch.float32, f"{logits.dtype}" assert len(logits.shape) == 3, f"{logits.shape}" logits_gib = logits.nelement() * logits.element_size() / (1 << 30) diff --git a/slime/train.py b/slime/train.py index 01883c4733..40efe3c81e 100644 --- a/slime/train.py +++ b/slime/train.py @@ -17,7 +17,7 @@ def train(args): rollout_manager, 
num_rollout_per_epoch = create_rollout_manager(args, pgs["rollout"]) # create the actor and critic models - actor_model, critic_model = create_training_models(args, pgs, rollout_manager) + actor_model, critic_model, _ = create_training_models(args, pgs, rollout_manager) if args.offload_rollout: ray.get(rollout_manager.onload_weights.remote()) diff --git a/toolcall-rl/retool_qwen3_4b_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_rl_npu.sh new file mode 100644 index 0000000000..1355a54218 --- /dev/null +++ b/toolcall-rl/retool_qwen3_4b_rl_npu.sh @@ -0,0 +1,202 @@ +#!/bin/bash + +# for rerun the task +pkill -9 sglang +sleep 3 +ray stop --force +pkill -9 ray +pkill -9 python +sleep 3 +pkill -9 ray +pkill -9 python + +set -ex + +# keep stdout/stderr unbuffered in ray jobs +export PYTHONUNBUFFERED=1 +export PYTHONFAULTHANDLER=1 + +# default to 8 GPUs if not set by scheduler +NUM_GPUS=${NUM_GPUS:-16} +ACTOR_GPUS=${ACTOR_GPUS:-8} +ROLLOUT_GPUS=${ROLLOUT_GPUS:-8} + +# async mode usually runs actor/rollout on separate GPUs +if (( ACTOR_GPUS + ROLLOUT_GPUS > NUM_GPUS )); then + echo "ACTOR_GPUS + ROLLOUT_GPUS must be <= NUM_GPUS" + echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, NUM_GPUS=${NUM_GPUS}" + exit 1 +fi + +# set visible devices +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 + +# Increase Ray heartbeat/health-check timeouts to reduce false node failures under heavy init. 
+export RAY_health_check_failure_threshold=20 +export RAY_health_check_period_ms=5000 +export RAY_health_check_timeout_ms=30000 +export RAY_num_heartbeats_timeout=60 + +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export RAY_DEBUG=1 +export RAY_DEDUP_LOGS=0 + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +SLIME_DIR="$(cd -- "${SCRIPT_DIR}/../slime" &>/dev/null && pwd)" +MEGATRON_LM_PATH=${MEGATRON_LM_PATH:-"${SCRIPT_DIR}/../../Megatron-LM"} +MEGATRON_BRIDGE_PATH=${MEGATRON_BRIDGE_PATH:-"${SCRIPT_DIR}/../../Megatron-Bridge/src"} +SGLANG_PATH=${SGLANG_PATH:-"${SCRIPT_DIR}/../../sglang/python"} +source "${SLIME_DIR}/scripts/models/qwen3-4B.sh" + +HF_CKPT=${HF_CKPT:-/data_storage/wyj/systems/huggingface/hub/qwen3-4b-retool-sft} +REF_LOAD=${REF_LOAD:-/data_storage/wyj/systems/huggingface/hub/qwen3-4b-retool-sft_torch_dist} +SAVE_CKPT=${SAVE_CKPT:-/data_storage/wyj/OpenClaw-RL/ckpt/qwen3-4b-retool-rl/} +RESUME_LOAD=${RESUME_LOAD:-${SAVE_CKPT}} +# Use the existing run id to continue plotting on the same W&B curve. 
+#WANDB_RESUME=${WANDB_RESUME:-must} + +CKPT_ARGS=( + --hf-checkpoint ${HF_CKPT} + --ref-load ${REF_LOAD} + --load ${RESUME_LOAD} + --save ${SAVE_CKPT} + --save-interval 20 + --rotary-base 5000000 +) + +ROLLOUT_ARGS=( + --prompt-data /data_storage/wyj/OpenClaw-RL/data/dapo-math-17k/dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --reward-key score + --num-rollout 3000 + --rollout-batch-size 32 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-max-context-len 16384 + --rollout-temperature 1 + + --num-steps-per-rollout 2 + --balance-data +) + +EVAL_ARGS=( + --eval-interval 20 + --eval-prompt-data aime /data_storage/wyj/OpenClaw-RL/data/aime-2024/aime-2024.jsonl + --n-samples-per-eval-prompt 16 + --eval-max-response-len 16384 + --eval-max-context-len 32768 + --eval-top-p 1 + --eval-reward-key acc +) + +PERF_ARGS=( + --tensor-model-parallel-size 4 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + # --micro-batch-size 1 + --use-dynamic-batch-size + --max-tokens-per-gpu 16384 + --log-probs-chunk-size 1024 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --use-kl-loss + --kl-loss-coef 0.01 + --kl-loss-type k3 + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 + --optimizer-cpu-offload + --overlap-cpu-optimizer-d2h-h2d + --use-precision-aware-optimizer +) + +WANDB_ARGS=( + --use-wandb + --wandb-project slime_retool + --wandb-group qwen3-4B-rl_retool + --wandb-key ${WANDB_KEY} +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 2 + --sglang-mem-fraction-static 0.6 + # ======================= NPU 添加参数 ======================= + --sglang-device npu +) + 
+MISC_ARGS=( + # default dropout in megatron is 0.1 + --attention-dropout 0.0 + --hidden-dropout 0.0 + # should be good for model performance + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + # need to comment this when using model with MLA + --attention-backend flash +) + +CUSTOM_ARGS=( + --custom-generate-function-path generate_with_retool.generate + --custom-rm-path generate_with_retool.reward_func +) + +export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"max_split_size_mb:2048,expandable_segments:True"} + +# launch the master node of ray in container +export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +ray start --head --node-ip-address ${MASTER_ADDR} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 + +# Build the runtime environment JSON with proper variable substitution +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:$PYTHONPATH\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", + \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\" + \"ASCEND_TOOLKIT_HOME\": \"/path/to/cann/\", + \"ASCEND_OPP_PATH\": \"/path/to/cann/\", + \"ASCEND_AICPU_PATH\": \"/path/to/cann/\", + \"ASCEND_HOME_PATH\": \"/path/to/cann/\" + } +}" + +ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + -- python3 train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node ${ACTOR_GPUS} \ + --rollout-num-gpus ${ROLLOUT_GPUS} \ + ${MODEL_ARGS[@]} \ + ${CKPT_ARGS[@]} \ + ${ROLLOUT_ARGS[@]} \ + ${OPTIMIZER_ARGS[@]} \ + ${GRPO_ARGS[@]} \ + ${WANDB_ARGS[@]} \ + ${PERF_ARGS[@]} \ + ${EVAL_ARGS[@]} \ + ${SGLANG_ARGS[@]} \ + ${MISC_ARGS[@]} \ + ${CUSTOM_ARGS[@]} \ No newline at end of file From 2275943eef757b12fb8954959ab14ca420b641ea Mon Sep 17 00:00:00 2001 From: quancs001 Date: Sat, 25 Apr 2026 19:59:01 +0800 Subject: [PATCH 3/6] unset proxy --- toolcall-rl/retool_qwen3_4b_rl_npu.sh | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/toolcall-rl/retool_qwen3_4b_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_rl_npu.sh index 1355a54218..5b46e0a9ad 100644 --- a/toolcall-rl/retool_qwen3_4b_rl_npu.sh +++ b/toolcall-rl/retool_qwen3_4b_rl_npu.sh @@ -12,6 +12,8 @@ pkill -9 python set -ex +unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + # keep stdout/stderr unbuffered in ray jobs export PYTHONUNBUFFERED=1 export PYTHONFAULTHANDLER=1 From 46d1979d281962d3775623cb534231c23671ea9a Mon Sep 17 00:00:00 2001 From: quancs001 Date: Sat, 25 Apr 2026 20:30:22 +0800 Subject: [PATCH 4/6] fix --- toolcall-rl/retool_qwen3_4b_rl_npu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolcall-rl/retool_qwen3_4b_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_rl_npu.sh index 5b46e0a9ad..f733f9f8fa 100644 --- a/toolcall-rl/retool_qwen3_4b_rl_npu.sh +++ b/toolcall-rl/retool_qwen3_4b_rl_npu.sh @@ -166,7 +166,7 @@ CUSTOM_ARGS=( --custom-rm-path generate_with_retool.reward_func ) -export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"max_split_size_mb:2048,expandable_segments:True"} +export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"expandable_segments:True"} # launch the master node of ray in container export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} @@ -177,7 +177,7 @@ RUNTIME_ENV_JSON="{ \"env_vars\": { \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:$PYTHONPATH\", \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\" + \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\", \"ASCEND_TOOLKIT_HOME\": \"/path/to/cann/\", \"ASCEND_OPP_PATH\": \"/path/to/cann/\", \"ASCEND_AICPU_PATH\": \"/path/to/cann/\", From 8716644a6081d92f56f458b38da4235ab3c4d3ee Mon Sep 17 00:00:00 2001 From: quancs001 Date: Sat, 25 Apr 2026 20:38:18 +0800 Subject: [PATCH 5/6] fix --- slime/slime/backends/megatron_utils/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/slime/slime/backends/megatron_utils/loss.py b/slime/slime/backends/megatron_utils/loss.py index c8fe1122e3..a768ca184d 100644 --- a/slime/slime/backends/megatron_utils/loss.py +++ b/slime/slime/backends/megatron_utils/loss.py @@ -88,7 +88,7 @@ def get_responses( Args: logits: Model outputs with shape ``[1, T, V]`` (policy) or - ``[1, T, 1]`` (value). Must be float32. + ``[1, T, 1]`` (value). args: Configuration containing ``rollout_temperature`` for scaling. unconcat_tokens: List of token tensors (prompt+response) per sample. total_lengths: Total sequence lengths (prompt+response) per sample. From 99fa2a435811e965f04707e887c590de0635d14e Mon Sep 17 00:00:00 2001 From: liyongwen <1310439159@qq.com> Date: Fri, 15 May 2026 14:43:21 +0800 Subject: [PATCH 6/6] =?UTF-8?q?=E6=8F=90=E4=BA=A4toolcall-rl-prm=E5=8F=8At?= =?UTF-8?q?erminal-rl=20npu=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- terminal-rl/remote/docker_compose_utils.py | 5 +- terminal-rl/remote/terminal_env.py | 5 +- terminal-rl/terminal-rl_qwen3-8b_npu.sh | 357 +++++++++++++++++++++ toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh | 226 +++++++++++++ 4 files changed, 591 insertions(+), 2 deletions(-) create mode 100644 terminal-rl/terminal-rl_qwen3-8b_npu.sh create mode 100644 toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh diff --git a/terminal-rl/remote/docker_compose_utils.py b/terminal-rl/remote/docker_compose_utils.py index 29a87781c5..3d801ec94b 100644 --- a/terminal-rl/remote/docker_compose_utils.py +++ b/terminal-rl/remote/docker_compose_utils.py @@ -125,7 +125,10 @@ def build_docker_image(task: dict[str, Any], timeout: float = 1200.0) -> None: sessions_logs_path=trial_handler.trial_paths.sessions_path, agent_logs_path=trial_handler.trial_paths.agent_logging_dir, ) - compose_manager.build(timeout=timeout) + try: + compose_manager.build(timeout=timeout) + except TypeError: + compose_manager.build() def _resolve_pull_image(task: dict[str, Any]) -> 
str: diff --git a/terminal-rl/remote/terminal_env.py b/terminal-rl/remote/terminal_env.py index c9073cc653..b56309321d 100644 --- a/terminal-rl/remote/terminal_env.py +++ b/terminal-rl/remote/terminal_env.py @@ -139,7 +139,10 @@ def _sync_reset() -> tuple[str, list[dict[str, Any]]]: logger=logger, ) else: - self._terminal.start(timeout=self._timeouts.reset_session) + try: + self._terminal.start(timeout=self._timeouts.reset_session) + except TypeError: + self._terminal.start() try: from .docker_compose_utils import ( _DEFAULT_CONTAINER_MEMORY_LIMIT, diff --git a/terminal-rl/terminal-rl_qwen3-8b_npu.sh b/terminal-rl/terminal-rl_qwen3-8b_npu.sh new file mode 100644 index 0000000000..5374a57239 --- /dev/null +++ b/terminal-rl/terminal-rl_qwen3-8b_npu.sh @@ -0,0 +1,357 @@ +#!/usr/bin/env bash +set -euo pipefail +set -ex + +unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY +ulimit -n 65535 + +log() { echo "[$(date +'%F %T')] $*"; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing cmd: $1"; exit 1; }; } + +WORKER_URLS="http://localhost:18081" +ROLLOUT_PROMPT_DATA="terminal-rl/dataset/seta_env_convert/train.jsonl" + +RAY_TMPDIR="ray_tmp" +# CHECK_HOST="localhost" +# ENV_SERVER_PORT="18081" + +export PYTHONUNBUFFERED=1 +export PYTHONFAULTHANDLER=1 + +# set visible devices +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export RAY_DEBUG=1 +export RAY_DEDUP_LOGS=0 + +export RAY_health_check_failure_threshold=20 +export RAY_health_check_period_ms=5000 +export RAY_health_check_timeout_ms=30000 +export RAY_num_heartbeats_timeout=60 + +NUM_GPUS="${NUM_GPUS:-8}" +ACTOR_GPUS="${ACTOR_GPUS:-4}" +ROLLOUT_GPUS="${ROLLOUT_GPUS:-4}" + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/.." 
&& pwd)}" +CUSTOM_CONFIG_PATH="${CUSTOM_CONFIG_PATH:-${SCRIPT_DIR}/configs/rollout_qwen3.yaml}" + +export REPO_ROOT +export SLIME_DIR="${REPO_ROOT}/slime" +export MEGATRON_DIR="Megatron-LM" + +source "${SLIME_DIR}/scripts/models/qwen3-8B.sh" + +# Paths: set/export before running (no built-in defaults). +HF_HOME="${HF_HOME:-}" +HF_CKPT=Qwen3-8B +REF_LOAD=Qwen3-8B-dist-slime +SAVE_CKPT=Qwen3-8B-save +RESUME_LOAD="${RESUME_LOAD:-${SAVE_CKPT}}" +ROLLOUT_PROMPT_DATA="${ROLLOUT_PROMPT_DATA:-}" + +export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-max_split_size_mb:2048,expandable_segments:True}" +export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" + +export USE_REMOTE_ENV="${USE_REMOTE_ENV:-1}" +export PROVIDER_NAME="${PROVIDER_NAME:-pull}" +export ENV_SERVER_BIND_HOST="${ENV_SERVER_BIND_HOST:-0.0.0.0}" +export ENV_SERVER_PORT="${ENV_SERVER_PORT:-18080}" +export ENV_SERVER_HOST="${ENV_SERVER_HOST:-${MASTER_ADDR}}" +export ENV_SERVER_URL="${ENV_SERVER_URL:-}" +export START_ENV_POOL_SERVER="${START_ENV_POOL_SERVER:-0}" + +# export RAY_TMPDIR="${RAY_TMPDIR:-}" + +export WORKER_URLS="${WORKER_URLS:-}" + +ROUTER_SESSION_NAME="${ROUTER_SESSION_NAME:-terminal_router}" +CONDA_ENV_PATH="${CONDA_ENV_PATH:-}" +ROUTER_PROJECT_DIR="${ROUTER_PROJECT_DIR:-${REPO_ROOT}}" +export CONDA_ENV_PATH +CONDA_PYTHON_VERSION="${CONDA_PYTHON_VERSION:-3.12}" +export CONDA_PYTHON_VERSION +ROUTER_HOST="${ROUTER_HOST:-0.0.0.0}" +ROUTER_PORT="${ROUTER_PORT:-${ENV_SERVER_PORT}}" + +CHECK_HOST="${CHECK_HOST:-127.0.0.1}" +CHECK_WAIT_SECS="${CHECK_WAIT_SECS:-60}" +ROUTER_RESTART="${ROUTER_RESTART:-1}" + +CKPT_ARGS=( + --hf-checkpoint "${HF_CKPT}" + --ref-load "${REF_LOAD}" + # --load "${RESUME_LOAD}" + --save "${SAVE_CKPT}" + --save-interval 80000 + --rotary-base 1000000 +) + +ROLLOUT_ARGS=( + --prompt-data "${ROLLOUT_PROMPT_DATA}" + --input-key task + --rollout-shuffle + --reward-key score + --num-rollout 250 + --rollout-batch-size 16 + --n-samples-per-prompt 4 + --rollout-max-response-len 8192 + 
--rollout-max-context-len 16384 + --rollout-temperature 1 + + --num-steps-per-rollout 2 + --balance-data +) + +EVAL_ARGS=( + --n-samples-per-eval-prompt 16 + --eval-max-response-len 16384 + --eval-top-p 1 +) + + +PERF_ARGS=( + --tensor-model-parallel-size 4 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + --use-dynamic-batch-size + --max-tokens-per-gpu 16384 + --log-probs-chunk-size 1024 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --dynamic_history + --use-kl-loss + --kl-loss-coef 0.01 + --kl-loss-type k3 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 + --optimizer-cpu-offload + --overlap-cpu-optimizer-d2h-h2d + --use-precision-aware-optimizer +) + +if [[ -n "${WANDB_KEY:-}" ]]; then + WANDB_ARGS=( + --use-wandb + --wandb-project ${WANDB_PROJECT} + --wandb-group ${WANDB_GROUP} + --wandb-key ${WANDB_KEY} + ) +else + WANDB_ARGS=() +fi + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 2 + --sglang-mem-fraction-static 0.6 +) + +MISC_ARGS=( + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash +) + +CUSTOM_ARGS=( + --custom-generate-function-path generate.generate + --custom-rollout-log-function-path rollout_log.rollout_log + --custom-config-path "${CUSTOM_CONFIG_PATH}" +) + +check_gpus() { + if (( ACTOR_GPUS + ROLLOUT_GPUS > NUM_GPUS )); then + echo "ACTOR_GPUS + ROLLOUT_GPUS must be <= NUM_GPUS" + echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, NUM_GPUS=${NUM_GPUS}" + exit 1 + fi +} + +cleanup_prev() { + log "cleanup previous processes" + pkill -9 sglang || true + sleep 3 + ray stop --force || true + pkill -9 ray || true + pkill -9 python || true + sleep 3 + pkill -9 ray || 
true + pkill -9 python || true +} + +start_router() { + require_cmd curl + mkdir -p "${ROUTER_PROJECT_DIR}/logs" + local logf="${ROUTER_PROJECT_DIR}/logs/router_${ROUTER_PORT}.log" + + python -m terminal-rl.router_server \ + --host "${ROUTER_HOST}" --port "${ROUTER_PORT}" --workers "${WORKER_URLS}" \ + > "${logf}" 2>&1 & + + export ROUTER_PID=$! + log "router started pid=${ROUTER_PID}, log=${logf}" + + sleep 1 + tail -n 50 "${logf}" || true +} + +check_router() { + require_cmd curl + local base_url="http://${CHECK_HOST}:${ROUTER_PORT}" + + log "wait router healthz up to ${CHECK_WAIT_SECS}s: ${base_url}/healthz" + for ((i=1; i<=CHECK_WAIT_SECS; i++)); do + if curl -fsS "${base_url}/healthz" >/dev/null 2>&1; then + log "router is up" + break + fi + sleep 1 + done + + log "curl ${base_url}/status" + curl -sS "${base_url}/status" + echo + log "curl ${base_url}/healthz" + curl -sS "${base_url}/healthz" + echo +} + +detect_nvlink() { + local count + count="$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l || true)" + if [[ "${count:-0}" -gt 0 ]]; then + export HAS_NVLINK=1 + else + export HAS_NVLINK=0 + fi + log "HAS_NVLINK=${HAS_NVLINK} (detected ${count} NVLink references)" +} + +maybe_fill_env_server_url() { + if [[ "${USE_REMOTE_ENV}" == "1" && -z "${ENV_SERVER_URL}" ]]; then + export ENV_SERVER_URL="http://${ENV_SERVER_HOST}:${ENV_SERVER_PORT}" + if [[ "${START_ENV_POOL_SERVER}" == "0" ]]; then + export START_ENV_POOL_SERVER=1 + fi + fi + log "ENV_SERVER_URL=${ENV_SERVER_URL} START_ENV_POOL_SERVER=${START_ENV_POOL_SERVER}" +} + +start_ray_head() { + require_cmd ray + log "start ray head" + mkdir -p "${RAY_TMPDIR}" + ray start --head \ + --node-ip-address "${MASTER_ADDR}" \ + --num-gpus "${NUM_GPUS}" \ + --disable-usage-stats \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 \ + --temp-dir "${RAY_TMPDIR}" +} +# --temp-dir "${RAY_TMPDIR}" + +build_runtime_env_json() { + python3 - <<'PY' +import json, os + +conda_env = 
os.environ.get("CONDA_ENV_PATH", "") +py_ver = os.environ.get("CONDA_PYTHON_VERSION", "3.12") +site_packages = f"{conda_env}/lib/python{py_ver}/site-packages" if conda_env else "" + +parts = [ + os.environ.get("REPO_ROOT",""), + os.environ.get("SLIME_PKG_DIR",""), + os.environ.get("MEGATRON_DIR",""), + os.environ.get("SCRIPT_DIR",""), + site_packages, +] +pythonpath = ":".join([p for p in parts if p]) + +env_vars = { + "PYTHONPATH": pythonpath, + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "NCCL_NVLS_ENABLE": os.environ.get("HAS_NVLINK","0"), + "PYTORCH_CUDA_ALLOC_CONF": os.environ.get("PYTORCH_CUDA_ALLOC_CONF",""), + "USE_REMOTE_ENV": os.environ.get("USE_REMOTE_ENV","0"), + "ENV_SERVER_URL": os.environ.get("ENV_SERVER_URL",""), +} +print(json.dumps({"env_vars": env_vars})) +PY +} +MEGATRON_LM_PATH=Megatron-LM +MEGATRON_BRIDGE_PATH=Megatron-Bridge/src +SGLANG_PATH=sglang/python +export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"expandable_segments:True"} + + +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}${PYTHONPATH:+:${PYTHONPATH}}\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", + \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\", + \"ASCEND_TOOLKIT_HOME\": \"/usr/local/Ascend/cann-8.5.0/\", + \"ASCEND_OPP_PATH\": \"/usr/local/Ascend/cann-8.5.0/opp/\", + \"ASCEND_AICPU_PATH\": \"/usr/local/Ascend/cann-8.5.0/\", + \"ASCEND_HOME_PATH\": \"/usr/local/Ascend/cann-8.5.0/\" + } +}" + +submit_job() { + log "submit ray job" + local runtime_env_json + runtime_env_json="$(build_runtime_env_json)" + + ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + -- python3 ${SLIME_DIR}/train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node "${ACTOR_GPUS}" \ + --rollout-num-gpus "${ROLLOUT_GPUS}" \ + --skip-eval-before-train \ + "${MODEL_ARGS[@]}" \ + "${CKPT_ARGS[@]}" \ + "${ROLLOUT_ARGS[@]}" \ + "${OPTIMIZER_ARGS[@]}" \ + "${GRPO_ARGS[@]}"
\ + "${WANDB_ARGS[@]}" \ + "${PERF_ARGS[@]}" \ + "${EVAL_ARGS[@]}" \ + "${SGLANG_ARGS[@]}" \ + "${MISC_ARGS[@]}" \ + "${CUSTOM_ARGS[@]}" +} + +cleanup_prev + +start_router +check_router + +# check_gpus +# detect_nvlink +maybe_fill_env_server_url +export SCRIPT_DIR +start_ray_head +submit_job diff --git a/toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh new file mode 100644 index 0000000000..8b704b547b --- /dev/null +++ b/toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# for rerun the task +pkill -9 sglang +sleep 3 +ray stop --force +pkill -9 ray +pkill -9 python +sleep 3 +pkill -9 ray +pkill -9 python + +set -ex +unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY +ulimit -n 65535 + +# keep stdout/stderr unbuffered in ray jobs +export PYTHONUNBUFFERED=1 +export PYTHONFAULTHANDLER=1 + +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export RAY_DEBUG=1 +export RAY_DEDUP_LOGS=0 + +# default to 8 GPUs if not set by scheduler +NUM_GPUS=${NUM_GPUS:-8} +ACTOR_GPUS=${ACTOR_GPUS:-2} +ROLLOUT_GPUS=${ROLLOUT_GPUS:-4} +PRM_GPUS=${PRM_GPUS:-2} + +if (( ACTOR_GPUS + ROLLOUT_GPUS + PRM_GPUS > NUM_GPUS )); then + echo "ACTOR_GPUS + ROLLOUT_GPUS + PRM_GPUS must be <= NUM_GPUS" + echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, PRM_GPUS=${PRM_GPUS}, NUM_GPUS=${NUM_GPUS}" + exit 1 +fi + +# set visible devices +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 + +# Increase Ray heartbeat/health-check timeouts to reduce false node failures under heavy init. 
+export RAY_health_check_failure_threshold=20 +export RAY_health_check_period_ms=5000 +export RAY_health_check_timeout_ms=30000 +export RAY_num_heartbeats_timeout=60 + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +SLIME_DIR="$(cd -- "${SCRIPT_DIR}/../slime" &>/dev/null && pwd)" +MEGATRON_LM_PATH=Megatron-LM +MEGATRON_BRIDGE_PATH=Megatron-Bridge/src +SGLANG_PATH=sglang/python +source "${SLIME_DIR}/scripts/models/qwen3-4B.sh" + +HF_CKPT=qwen3-4b-sft-SGLang-RL +REF_LOAD=qwen3-4b-sft-SGLang-RL-dist-slime +SAVE_CKPT=qwen3-4b-sft-SGLang-RL-save +# RESUME_LOAD=/workspace/OpenClaw-RL/ckpt/qwen3-4b-retool-rl +# Use the existing run id to continue plotting on the same W&B curve. +#WANDB_RESUME=${WANDB_RESUME:-must} + +PRM_MODEL_PATH=Qwen3-4B-Instruct-2507 +# DYNAMIC_HISTORY=1 + +CKPT_ARGS=( + --hf-checkpoint ${HF_CKPT} + --ref-load ${REF_LOAD} + # --load ${RESUME_LOAD} + --save ${SAVE_CKPT} + --save-interval 50 + --rotary-base 5000000 +) + +ROLLOUT_ARGS=( + --prompt-data dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --reward-key score + --num-rollout 220 + --rollout-batch-size 32 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-max-context-len 16384 + --rollout-temperature 1 + --num-steps-per-rollout 2 + --balance-data +) + +EVAL_ARGS=( + --eval-interval 50 + --eval-prompt-data aime aime-2024.jsonl + --n-samples-per-eval-prompt 16 + --eval-max-response-len 16384 + --eval-max-context-len 32768 + --eval-top-p 1 + --eval-reward-key acc +) + +PERF_ARGS=( + --tensor-model-parallel-size 2 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + # --micro-batch-size 1 + --use-dynamic-batch-size + --max-tokens-per-gpu 16384 + --log-probs-chunk-size 1024 +) + +GRPO_ARGS=( + 
--advantage-estimator step_wise + --use-kl-loss + --kl-loss-coef 0.01 + --kl-loss-type k3 + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 + --optimizer-cpu-offload + --overlap-cpu-optimizer-d2h-h2d + --use-precision-aware-optimizer +) + +WANDB_ARGS=( + # --use-wandb + # --wandb-project slime_retool + # --wandb-group qwen3-4B-rl_retool + # --wandb-key ${WANDB_KEY} +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 2 + --sglang-mem-fraction-static 0.6 + # ======================= NPU-specific arguments ======================= + --sglang-device npu +) + +PRM_ARGS=( + --prm-enable + --prm-num-gpus "${PRM_GPUS}" + --prm-num-gpus-per-engine 2 + --prm-model-path "${PRM_MODEL_PATH}" + --prm-m "${PRM_M:-1}" + --prm-step-coef "${PRM_STEP_COEF:-1.0}" + --prm-temperature "${PRM_TEMPERATURE:-0.6}" + --prm-max-new-tokens "${PRM_MAX_NEW_TOKENS:-4096}" +) + +MISC_ARGS=( + # default dropout in megatron is 0.1 + --attention-dropout 0.0 + --hidden-dropout 0.0 + # should be good for model performance + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + # comment this out when using a model with MLA + --attention-backend flash +) + +CUSTOM_ARGS=( + --custom-generate-function-path generate_with_retool.generate + --custom-rm-path generate_with_retool.reward_func +) + +DYNAMIC_HISTORY_ARGS=() +if [[ "${DYNAMIC_HISTORY:-0}" == "1" ]]; then + DYNAMIC_HISTORY_ARGS+=(--dynamic_history) +fi + +export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"expandable_segments:True"} + +# launch the master node of ray in container +export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +ray start --head --node-ip-address ${MASTER_ADDR} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 + +# Build the runtime environment JSON with proper variable substitution +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\":
\"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:$PYTHONPATH\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", + \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\", + \"ASCEND_TOOLKIT_HOME\": \"/usr/local/Ascend/cann-8.5.0/\", + \"ASCEND_OPP_PATH\": \"/usr/local/Ascend/cann-8.5.0/opp/\", + \"ASCEND_AICPU_PATH\": \"/usr/local/Ascend/cann-8.5.0/\", + \"ASCEND_HOME_PATH\": \"/usr/local/Ascend/cann-8.5.0/\" + } +}" + +ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + -- python3 train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node "${ACTOR_GPUS}" \ + --rollout-num-gpus "${ROLLOUT_GPUS}" \ + --num-gpus-per-node "${NUM_GPUS}" \ + --skip-eval-before-train \ + "${MODEL_ARGS[@]}" \ + "${CKPT_ARGS[@]}" \ + "${ROLLOUT_ARGS[@]}" \ + "${OPTIMIZER_ARGS[@]}" \ + "${GRPO_ARGS[@]}" \ + "${WANDB_ARGS[@]}" \ + "${PERF_ARGS[@]}" \ + "${EVAL_ARGS[@]}" \ + "${SGLANG_ARGS[@]}" \ + "${MISC_ARGS[@]}" \ + "${CUSTOM_ARGS[@]}" \ + "${PRM_ARGS[@]}" \ + "${DYNAMIC_HISTORY_ARGS[@]}"