From 70961d1fa62ac7f750e433201b6f3d18dd89d70d Mon Sep 17 00:00:00 2001
From: quancs001 <quancs@qq.com>
Date: Fri, 24 Apr 2026 10:32:32 +0800
Subject: [PATCH 1/6] =?UTF-8?q?=E5=8A=A0=E5=85=A5NPU=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../slime/backends/megatron_utils/__init__.py | 22 ++++++++++++++
 slime/slime/backends/megatron_utils/actor.py  | 18 +++++++++--
 slime/slime/backends/megatron_utils/data.py   | 10 +++++--
 .../megatron_utils/kernels/int4_qat/setup.py  |  2 +-
 slime/slime/backends/megatron_utils/loss.py   | 13 +++++---
 .../backends/megatron_utils/model_provider.py |  6 +++-
 .../megatron_utils/update_weight/common.py    |  5 ++++
 .../update_weight_from_distributed.py         |  6 ++--
 .../backends/sglang_utils/sglang_engine.py    |  6 +++-
 slime/slime/ray/actor_group.py                |  7 +++--
 slime/slime/ray/placement_group.py            | 18 +++++++----
 slime/slime/ray/rollout.py                    | 11 +++++--
 slime/slime/ray/train_actor.py                | 13 ++++++--
 slime/slime/utils/common.py                   | 13 ++++++++
 .../utils/external_utils/command_utils.py     | 14 ++++++++-
 slime/slime/utils/memory_utils.py             | 14 +++++++--
 slime/slime/utils/ppo_utils.py                |  2 +-
 slime/tools/convert_hf_to_torch_dist.py       | 30 ++++++++++++++-----
 18 files changed, 170 insertions(+), 40 deletions(-)
 create mode 100644 slime/slime/utils/common.py

diff --git a/slime/slime/backends/megatron_utils/__init__.py b/slime/slime/backends/megatron_utils/__init__.py
index a4666fbeb9..54d6a7703e 100644
--- a/slime/slime/backends/megatron_utils/__init__.py
+++ b/slime/slime/backends/megatron_utils/__init__.py
@@ -2,6 +2,10 @@
 
 import torch
 
+from slime.utils.common import is_npu
+if is_npu():
+    import mindspeed.megatron_adaptor
+
 try:
     import deep_ep
     from torch_memory_saver import torch_memory_saver
@@ -39,4 +43,22 @@ def _patched_forward(self, *args, packed_seq_params=None, **kwargs):
 except ImportError:
     pass
 
+try:
+    from mbridge.models.qwen3_vl.model import Qwen3VLModel
+    _original_forward2 = Qwen3VLModel.forward
+
+    def _patched_forward2(self, *args, loss_mask=None, **kwargs):
+        return _original_forward2(self, *args, **kwargs)
+    Qwen3VLModel.forward = _patched_forward2
+except ImportError:
+    pass
+try:
+    from megatron.bridge.models.qwen_vl.modelling_qwen3_vl.model import Qwen3VLModel
+    _original_forward3 = Qwen3VLModel.forward
+    
+    def _patched_forward3(self, *args, loss_mask=None, **kwargs):
+        return _original_forward3(self, *args, **kwargs)
+    Qwen3VLModel.forward = _patched_forward3
+except ImportError:
+    pass
 logging.getLogger("megatron").setLevel(logging.WARNING)
diff --git a/slime/slime/backends/megatron_utils/actor.py b/slime/slime/backends/megatron_utils/actor.py
index 658069a2e4..8cbc143975 100644
--- a/slime/slime/backends/megatron_utils/actor.py
+++ b/slime/slime/backends/megatron_utils/actor.py
@@ -10,6 +10,11 @@
 import ray
 import torch
 import torch.distributed as dist
+
+from slime.utils.common import is_npu
+if is_npu():
+    import mindspeed.megatron_adaptor
+    from mindspeed.megatron_adaptor import repatch
 from megatron.core import mpu
 from ray.actor import ActorHandle
 from torch_memory_saver import torch_memory_saver
@@ -150,7 +155,10 @@ def _offload_rollout_data_to_cpu(rollout_data: RolloutBatch) -> None:
             rollout_data[key] = [v.to("cpu", non_blocking=True) for v in vals]
             moved_any = True
     if moved_any:
-        torch.cuda.synchronize()
+        if not is_npu():
+            torch.cuda.synchronize()
+        else:
+            torch.npu.synchronize()
 
 
 class MegatronTrainRayActor(TrainRayActor):
@@ -269,6 +277,8 @@ def init(
         super().init(args, role, with_ref)
 
         init(args)
+        if is_npu():
+            repatch(args)
 
         if is_megatron_main_rank():
             init_tracking(args, primary=False)
@@ -1054,8 +1064,12 @@ def connect_actor_critic(
 
         group_name = "actor_critic"
         world_size = 2
+        if is_npu():
+            backend = "hccl"
+        else:
+            backend = "nccl"
         self._actor_critic_groups = init_process_group(
-            backend="nccl",
+            backend=backend,
             init_method=f"tcp://{master_address}:{master_port}",
             world_size=world_size,
             rank=0 if self.role == "actor" else 1,
diff --git a/slime/slime/backends/megatron_utils/data.py b/slime/slime/backends/megatron_utils/data.py
index e1fb2140d7..6c01031ec6 100644
--- a/slime/slime/backends/megatron_utils/data.py
+++ b/slime/slime/backends/megatron_utils/data.py
@@ -15,6 +15,7 @@
 from slime.utils.metric_utils import compute_pass_rate, compute_rollout_step
 from slime.utils.seqlen_balancing import get_seqlen_balanced_partitions
 from slime.utils.types import RolloutBatch
+from slime.utils.common import is_npu
 
 from ...utils import logging_utils
 from .cp_utils import get_sum_of_sample_mean, slice_with_cp
@@ -31,9 +32,12 @@ def _to_cuda(val: object) -> object:
     if val is None:
         return None
     if isinstance(val, torch.Tensor):
-        if val.is_cuda:
-            return val
-        return val.to(device=torch.cuda.current_device(), non_blocking=True)
+        if is_npu():
+            return val.to(device=torch.npu.current_device(), non_blocking=True)
+        else:
+            if val.is_cuda:
+                return val
+            return val.to(device=torch.cuda.current_device(), non_blocking=True)
     if isinstance(val, list):
         return [_to_cuda(v) for v in val]
     if isinstance(val, tuple):
diff --git a/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py b/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py
index 8715dd7b8a..6dd9eceb2b 100644
--- a/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py
+++ b/slime/slime/backends/megatron_utils/kernels/int4_qat/setup.py
@@ -43,7 +43,7 @@
                     f'-gencode=arch=compute_{arch.replace(".", "")},code=sm_{arch.replace(".", "")}'
                     for arch in arch_list
                 ]
-                + ["-gencode=arch=compute_90a,code=sm_90a"],
+                + (["-gencode=arch=compute_90a,code=sm_90a"] if not hasattr(torch,'npu') else []),
             },
         )
     ],
diff --git a/slime/slime/backends/megatron_utils/loss.py b/slime/slime/backends/megatron_utils/loss.py
index 9798f40730..82fe273e8d 100644
--- a/slime/slime/backends/megatron_utils/loss.py
+++ b/slime/slime/backends/megatron_utils/loss.py
@@ -10,6 +10,7 @@
 
 from slime.utils.distributed_utils import distributed_masked_whiten
 from slime.utils.misc import load_function
+from slime.utils.common import is_npu
 
 logger = logging.getLogger(__name__)
 
@@ -87,7 +88,7 @@ def get_responses(
 
     Args:
         logits: Model outputs with shape ``[1, T, V]`` (policy) or
-            ``[1, T, 1]`` (value).
+            ``[1, T, 1]`` (value). Must be float32.
         args: Configuration containing ``rollout_temperature`` for scaling.
         unconcat_tokens: List of token tensors (prompt+response) per sample.
         total_lengths: Total sequence lengths (prompt+response) per sample.
@@ -101,6 +102,7 @@ def get_responses(
     """
     qkv_format = args.qkv_format
 
+    assert logits.dtype == torch.float32, f"{logits.dtype}"
     assert len(logits.shape) == 3, f"{logits.shape}"
 
     logits_gib = logits.nelement() * logits.element_size() / (1 << 30)
@@ -530,8 +532,11 @@ def compute_advantages_and_returns(args: Namespace, rollout_data: RolloutBatch)
     # loss_masks live on CPU (lazy-loading optimisation).  We need GPU copies
     # for the advantage / KL / normalisation math below.  The original CPU
     # tensors in rollout_data["loss_masks"] are NOT modified.
-    if loss_masks and isinstance(loss_masks[0], torch.Tensor) and not loss_masks[0].is_cuda:
-        _gpu = torch.cuda.current_device()
+    if loss_masks and isinstance(loss_masks[0], torch.Tensor) and loss_masks[0].is_cpu:
+        if is_npu():
+            _gpu = torch.npu.current_device()
+        else:
+            _gpu = torch.cuda.current_device()
         loss_masks = [m.to(device=_gpu) for m in loss_masks]
 
     if args.kl_coef == 0 or not log_probs:
@@ -1198,7 +1203,7 @@ def loss_function(
 
     return (
         loss,
-        (num_tokens if args.calculate_per_token_loss else torch.tensor(1, device=logits.device)),
+        torch.tensor(num_tokens if args.calculate_per_token_loss else 1, device=logits.device),
         {
             "keys": list(log.keys()),
             "values": torch.tensor(
diff --git a/slime/slime/backends/megatron_utils/model_provider.py b/slime/slime/backends/megatron_utils/model_provider.py
index 09971a5d33..268ea654dc 100644
--- a/slime/slime/backends/megatron_utils/model_provider.py
+++ b/slime/slime/backends/megatron_utils/model_provider.py
@@ -1,7 +1,6 @@
 # Adapt from https://github.com/NVIDIA/Megatron-LM/blob/b1efb3c7126ef7615e8c333432d76e08038e17ff/pretrain_gpt.py
 import argparse
 import inspect
-import re
 from contextlib import nullcontext
 from typing import Literal
 
@@ -114,6 +113,11 @@ def wrapped_model_provider(
             provider.recompute_method = args.recompute_method
             provider.recompute_num_layers = args.recompute_num_layers
 
+        for key, value in vars(args).items():
+            if hasattr(provider, key):
+                continue
+            setattr(provider, key, value)
+
         # CLI flags that materially affect train numerics/quality and per-step
         # speed but are NOT derivable from the HF config. Without these, bridge
         # mode silently keeps HF-config defaults (e.g. attention_dropout=0.1
diff --git a/slime/slime/backends/megatron_utils/update_weight/common.py b/slime/slime/backends/megatron_utils/update_weight/common.py
index 07a78ad13d..513286ea51 100644
--- a/slime/slime/backends/megatron_utils/update_weight/common.py
+++ b/slime/slime/backends/megatron_utils/update_weight/common.py
@@ -11,6 +11,8 @@
 
 from slime.backends.megatron_utils.misc_utils import strip_param_name_prefix
 from slime.utils.types import ParamInfo
+from slime.utils.common import is_npu
+
 
 _DISABLE_LINEAR_FC1_RECHUNK = os.getenv("SLIME_QWEN35_DISABLE_LINEAR_FC1_RECHUNK", "0") == "1"
 
@@ -41,6 +43,9 @@ def _merge_tp_partitions(
     if "linear_fc1.weight" in name and not _DISABLE_LINEAR_FC1_RECHUNK:
         param_partitions = [p.chunk(2, dim=0) for p in param_partitions]
         param_partitions = [p[0] for p in param_partitions] + [p[1] for p in param_partitions]
+        # TODO: temporary NPU workaround - force partition_dim to 0 for linear_fc1.weight
+        if is_npu():
+            partition_dim = 0
     # this is bug in megatron's grouped moe.
     if "linear_fc2.weight" in name and partition_dim == 0:
         partition_dim = 1
diff --git a/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py b/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py
index a8e50e0e43..b3c6ac24a1 100644
--- a/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py
+++ b/slime/slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py
@@ -12,6 +12,7 @@
 from tqdm import tqdm
 
 from slime.utils.distributed_utils import get_gloo_group, init_process_group
+from slime.utils.common import is_npu
 
 from ..megatron_to_hf import convert_to_hf
 from .common import all_gather_param, named_params_and_buffers
@@ -253,6 +254,7 @@ def connect_rollout_engines_from_distributed(
         master_port = sock.getsockname()[1]
     world_size = len(rollout_engines) * args.rollout_num_gpus_per_engine + 1
 
+    backend = "hccl" if is_npu() else "nccl"
     refs = [
         engine.init_weights_update_group.remote(
             master_address,
@@ -260,12 +262,12 @@ def connect_rollout_engines_from_distributed(
             i * args.rollout_num_gpus_per_engine + 1,
             world_size,
             group_name,
-            backend="nccl",
+            backend=backend,
         )
         for i, engine in enumerate(rollout_engines)
     ]
     model_update_groups = init_process_group(
-        backend="nccl",
+        backend=backend,
         init_method=f"tcp://{master_address}:{master_port}",
         world_size=world_size,
         rank=0,
diff --git a/slime/slime/backends/sglang_utils/sglang_engine.py b/slime/slime/backends/sglang_utils/sglang_engine.py
index c146d4c03a..ad8d83598c 100644
--- a/slime/slime/backends/sglang_utils/sglang_engine.py
+++ b/slime/slime/backends/sglang_utils/sglang_engine.py
@@ -16,6 +16,7 @@
 from .qwen3_5 import is_qwen35_model_path, maybe_prepare_qwen35_text_model, patch_sglang_qwen35
 from slime.ray.ray_actor import RayActor
 from slime.utils.http_utils import get_host_info
+from slime.utils.common import is_npu
 
 logger = logging.getLogger(__name__)
 
@@ -34,7 +35,10 @@ def get_base_gpu_id(args, rank):
 
 
 def _to_local_gpu_id(physical_gpu_id: int) -> int:
-    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if is_npu():
+        cvd = os.environ.get("ASCEND_RT_VISIBLE_DEVICES")
+    else:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
     if not cvd:
         return physical_gpu_id  # no remapping
     # CUDA_VISIBLE_DEVICES can be like "4,5,6,7"
diff --git a/slime/slime/ray/actor_group.py b/slime/slime/ray/actor_group.py
index 8de2bcdcee..00e7888e92 100644
--- a/slime/slime/ray/actor_group.py
+++ b/slime/slime/ray/actor_group.py
@@ -5,6 +5,7 @@
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
 from slime.ray.utils import NOSET_VISIBLE_DEVICES_ENV_VARS_LIST
+from slime.utils.common import is_npu
 
 
 class RayTrainGroup:
@@ -87,19 +88,19 @@ def _allocate_gpus_for_actor(self, pg, num_gpus_per_actor):
 
             actor_impl = FSDPTrainRayActor
 
-        TrainRayActor = ray.remote(num_gpus=1, runtime_env={"env_vars": env_vars})(actor_impl)
-
+        TrainRayActor = ray.remote(runtime_env={"env_vars": env_vars})(actor_impl)
+        device_name = "NPU" if is_npu() else "GPU"
         # Create worker actors
         self._actor_handlers = []
         master_addr, master_port = None, None
         for rank in range(world_size):
             actor = TrainRayActor.options(
                 num_cpus=num_gpus_per_actor,
-                num_gpus=num_gpus_per_actor,
                 scheduling_strategy=PlacementGroupSchedulingStrategy(
                     placement_group=pg,
                     placement_group_bundle_index=reordered_bundle_indices[rank],
                 ),
+                resources={device_name: num_gpus_per_actor}
             ).remote(world_size, rank, master_addr, master_port)
             if rank == 0:
                 master_addr, master_port = ray.get(actor.get_master_addr_and_port.remote())
diff --git a/slime/slime/ray/placement_group.py b/slime/slime/ray/placement_group.py
index 59104c9182..6325ca089b 100644
--- a/slime/slime/ray/placement_group.py
+++ b/slime/slime/ray/placement_group.py
@@ -4,6 +4,7 @@
 import ray
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from slime.utils.common import is_npu
 
 from .actor_group import RayTrainGroup
 from .rollout import RolloutManager
@@ -11,10 +12,13 @@
 logger = logging.getLogger(__name__)
 
 
-@ray.remote(num_gpus=1)
+@ray.remote
 class InfoActor:
     def get_ip_and_gpu_id(self):
-        return ray.util.get_node_ip_address(), ray.get_gpu_ids()[0]
+        if is_npu():
+            return ray.util.get_node_ip_address(), ray.get_runtime_context().get_accelerator_ids()["NPU"][0]
+        else:
+            return ray.util.get_node_ip_address(), ray.get_gpu_ids()[0]
 
 
 def sort_key(x):
@@ -35,12 +39,13 @@ def sort_key(x):
             # representation that allows for sorting.
             node_ip_parts = [ord(c) for c in node_identifier]
 
-    return (node_ip_parts, gpu_id)
+    return (node_ip_parts, int(gpu_id))
 
 
 def _create_placement_group(num_gpus):
     """Create a placement group with the specified number of GPUs."""
-    bundles = [{"GPU": 1, "CPU": 1} for _ in range(num_gpus)]
+    device_name = "NPU" if is_npu() else "GPU"
+    bundles = [{device_name: 1, "CPU": 1} for _ in range(num_gpus)]
     pg = placement_group(bundles, strategy="PACK")
     num_bundles = len(bundles)
 
@@ -53,7 +58,8 @@ def _create_placement_group(num_gpus):
                 scheduling_strategy=PlacementGroupSchedulingStrategy(
                     placement_group=pg,
                     placement_group_bundle_index=i,
-                )
+                ),
+                resources={device_name: 1}
             ).remote()
         )
     gpu_ids = ray.get([actor.get_ip_and_gpu_id.remote() for actor in info_actors])
@@ -201,9 +207,11 @@ def create_training_models(args, pgs, rollout_manager):
 
 
 def create_rollout_manager(args, pg, prm_pg=None):
+    device_name = "NPU" if is_npu() else "GPU"
     rollout_manager = RolloutManager.options(
         num_cpus=1,
         num_gpus=0,
+        resources={device_name: 0}
     ).remote(args, pg, prm_pg)
 
     # calculate num_rollout from num_epoch
diff --git a/slime/slime/ray/rollout.py b/slime/slime/ray/rollout.py
index 0d45c0bec6..54bcc37b26 100644
--- a/slime/slime/ray/rollout.py
+++ b/slime/slime/ray/rollout.py
@@ -30,6 +30,7 @@
 from slime.utils.misc import Box, group_by, load_function
 from slime.utils.seqlen_balancing import get_seqlen_balanced_partitions
 from slime.utils.types import Sample
+from slime.utils.common import is_npu
 
 from ..utils.metric_utils import has_repetition
 from .utils import NOSET_VISIBLE_DEVICES_ENV_VARS_LIST, Lock
@@ -89,7 +90,8 @@ def __init__(self, args, pg, prm_pg=None):
             self.all_prm_engines = []
             self.num_new_prm_engines = 0
         self.nodes_per_engine = max(1, args.rollout_num_gpus_per_engine // args.num_gpus_per_node)
-        self.rollout_engine_lock = Lock.options(num_cpus=1, num_gpus=0).remote()
+        device_name = "NPU" if is_npu() else "GPU"
+        self.rollout_engine_lock = Lock.options(num_cpus=1, num_gpus=0, resources={device_name: 0}).remote()
         self.rollout_id = -1
 
         self._metric_checker = MetricChecker.maybe_create(args)
@@ -830,6 +832,7 @@ def init_rollout_engines(args, pg, all_rollout_engines):
     RolloutRayActor = ray.remote(SGLangEngine)
 
     rollout_engines = []
+    device_name = "NPU" if is_npu() else "GPU"
     for i in range(num_engines):
         if all_rollout_engines[i] is not None:
             continue
@@ -849,6 +852,7 @@ def init_rollout_engines(args, pg, all_rollout_engines):
         env_vars = {name: "1" for name in NOSET_VISIBLE_DEVICES_ENV_VARS_LIST} | {
             key: os.environ.get(key, default_val)
             for key, default_val in {
+                "SGL_JIT_DEEPGEMM_PRECOMPILE": "false",
                 "SGLANG_JIT_DEEPGEMM_PRECOMPILE": "false",
                 "SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK": "true",
                 "SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK": "true",
@@ -868,11 +872,11 @@ def init_rollout_engines(args, pg, all_rollout_engines):
 
         rollout_engine = RolloutRayActor.options(
             num_cpus=num_cpus,
-            num_gpus=num_gpus,
             scheduling_strategy=scheduling_strategy,
             runtime_env={
                 "env_vars": env_vars,
             },
+            resources={device_name: num_gpus}
         ).remote(args, rank=i, worker_type=worker_type, base_gpu_id=base_gpu_id)
 
         rollout_engines.append((i, rollout_engine))
@@ -937,11 +941,12 @@ def init_prm_engines(args, pg, all_prm_engines):
             }.items()
         }
 
+        device_name = "NPU" if is_npu() else "GPU"
         prm_engine = RolloutRayActor.options(
             num_cpus=num_cpus,
-            num_gpus=num_gpus,
             scheduling_strategy=scheduling_strategy,
             runtime_env={"env_vars": env_vars},
+            resources={device_name: num_gpus}
         ).remote(args, rank=i, worker_type="regular", base_gpu_id=base_gpu_id, engine_role="prm")
 
         prm_engines.append((i, prm_engine))
diff --git a/slime/slime/ray/train_actor.py b/slime/slime/ray/train_actor.py
index 2e900ca5a6..d0a2558386 100644
--- a/slime/slime/ray/train_actor.py
+++ b/slime/slime/ray/train_actor.py
@@ -13,16 +13,23 @@
 from slime.utils.distributed_utils import init_gloo_group
 from slime.utils.logging_utils import configure_logger
 from slime.utils.memory_utils import clear_memory, print_memory
+from slime.utils.common import is_npu
 
 logger = logging.getLogger(__name__)
 
 
 def get_local_gpu_id():
-    cvd = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    if is_npu():
+        env_var = "ASCEND_RT_VISIBLE_DEVICES"
+        device_ids = ray.get_runtime_context().get_accelerator_ids()["NPU"]
+    else:
+        env_var = "CUDA_VISIBLE_DEVICES"
+        device_ids = ray.get_gpu_ids()
+    cvd = os.environ.get(env_var, None)
     if cvd is None:
-        return ray.get_gpu_ids()[0]
+        return device_ids[0]
     else:
-        return cvd.split(",").index(str(ray.get_gpu_ids()[0]))
+        return cvd.split(",").index(str(device_ids[0]))
 
 
 class TrainRayActor(RayActor):
diff --git a/slime/slime/utils/common.py b/slime/slime/utils/common.py
new file mode 100644
index 0000000000..3fde3f69ff
--- /dev/null
+++ b/slime/slime/utils/common.py
@@ -0,0 +1,13 @@
+import torch
+
+
+def is_npu() -> bool:
+    if not hasattr(torch, "npu"):
+        return False
+
+    if not torch.npu.is_available():
+        raise RuntimeError(
+            "torch_npu detected, but NPU device is not available or visible."
+        )
+
+    return True
diff --git a/slime/slime/utils/external_utils/command_utils.py b/slime/slime/utils/external_utils/command_utils.py
index 9f51ecdf20..bd4a28b52c 100644
--- a/slime/slime/utils/external_utils/command_utils.py
+++ b/slime/slime/utils/external_utils/command_utils.py
@@ -12,6 +12,7 @@
 
 from slime.utils.external_utils.typer_utils import dataclass_cli
 from slime.utils.misc import exec_command
+from slime.utils.common import is_npu
 
 _ = exec_command, dataclass_cli
 
@@ -127,10 +128,11 @@ def execute_train(
     )
 
     if not external_ray:
+        gpus_config = "" if is_npu() else f"--num-gpus {num_gpus_per_node}"
         exec_command(
             # will prevent ray from buffering stdout/stderr
             f"export PYTHONBUFFERED=16 && "
-            f"ray start --head --node-ip-address {master_addr} --num-gpus {num_gpus_per_node} --disable-usage-stats"
+            f"ray start --head --node-ip-address {master_addr} {gpus_config} --disable-usage-stats"
         )
 
     if (f := before_ray_job_submit) is not None:
@@ -140,6 +142,16 @@ def execute_train(
         {
             "env_vars": {
                 "PYTHONPATH": "/root/Megatron-LM/",
+                "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES": "1",
+                # Replace with actual Ascend toolkit paths
+                "ASCEND_TOOLKIT_HOME": "/path/to/ascend/ascend-toolkit/latest/",
+                "ASCEND_OPP_PATH": "/path/to/ascend/ascend-toolkit/latest/opp/",
+                "ASCEND_AICPU_PATH": "/path/to/ascend/ascend-toolkit/latest/",
+                "ASCEND_HOME_PATH": "/path/to/ascend/ascend-toolkit/latest/",
+                "set_env_path": "/path/to/ascend/nnal/atb/set_env.sh",
+                "HYDRA_FULL_ERROR": "1",
+                "HCCL_HOST_SOCKET_PORT_RANGE": "60000-60050",
+                "HCCL_NPU_SOCKET_PORT_RANGE": "61000-61050",
                 # If setting this in FSDP, the computation communication overlapping may have issues
                 **(
                     {}
diff --git a/slime/slime/utils/memory_utils.py b/slime/slime/utils/memory_utils.py
index c12f3cd0bc..8907826629 100644
--- a/slime/slime/utils/memory_utils.py
+++ b/slime/slime/utils/memory_utils.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.distributed as dist
+from slime.utils.common import is_npu
 
 logger = logging.getLogger(__name__)
 
@@ -12,12 +13,19 @@ def clear_memory(clear_host_memory: bool = False):
     gc.collect()
     torch.cuda.empty_cache()
     if clear_host_memory:
-        torch._C._host_emptyCache()
+        if is_npu():
+            torch.npu.empty_cache()
+        else:
+            torch._C._host_emptyCache()
 
 
 def available_memory():
-    device = torch.cuda.current_device()
-    free, total = torch.cuda.mem_get_info(device)
+    if is_npu():
+        device = torch.npu.current_device()
+        free, total = torch.npu.mem_get_info(device)
+    else:
+        device = torch.cuda.current_device()
+        free, total = torch.cuda.mem_get_info(device)
     return {
         "gpu": str(device),
         "total_GB": _byte_to_gb(total),
diff --git a/slime/slime/utils/ppo_utils.py b/slime/slime/utils/ppo_utils.py
index a024754883..121feea7e5 100644
--- a/slime/slime/utils/ppo_utils.py
+++ b/slime/slime/utils/ppo_utils.py
@@ -647,7 +647,7 @@ def chunked_gae(
 
 
 def _is_oom_error(exc: BaseException) -> bool:
-    if isinstance(exc, torch.cuda.OutOfMemoryError):
+    if isinstance(exc, torch.cuda.OutOfMemoryError) or "OutOfMemory" in str(exc):
         return True
     return "out of memory" in str(exc).lower()
 
diff --git a/slime/tools/convert_hf_to_torch_dist.py b/slime/tools/convert_hf_to_torch_dist.py
index 0995bd8b99..16aaf319ba 100644
--- a/slime/tools/convert_hf_to_torch_dist.py
+++ b/slime/tools/convert_hf_to_torch_dist.py
@@ -4,6 +4,10 @@
 
 import torch
 import torch.distributed as dist
+from slime.utils.common import is_npu
+if is_npu():
+    import mindspeed.megatron_adaptor
+    from mindspeed.megatron_adaptor import repatch
 from megatron.core.enums import ModelType
 from megatron.training.arguments import parse_args, validate_args
 from megatron.training.checkpointing import get_checkpoint_name, get_checkpoint_tracker_filename, save_checkpoint
@@ -164,20 +168,32 @@ def main():
     local_rank = int(os.getenv("LOCAL_RANK") or os.getenv("SLURM_LOCALID") or 0)
     global_rank = int(os.getenv("RANK") or os.getenv("SLURM_PROCID") or 0)
 
-    torch.cuda.set_device(local_rank)
+    if is_npu():
+        torch.npu.set_device(local_rank)
+    else:
+        torch.cuda.set_device(local_rank)
     os.environ.setdefault("WORLD_SIZE", str(world_size))
     os.environ.setdefault("RANK", str(global_rank))
     os.environ.setdefault("LOCAL_RANK", str(local_rank))
     os.environ.setdefault("MASTER_ADDR", "localhost")
     os.environ.setdefault("MASTER_PORT", "12355")
-    dist.init_process_group(
-        backend="nccl",
-        world_size=world_size,
-        rank=global_rank,
-        device_id=torch.device(f"cuda:{local_rank}"),
-    )
+    if is_npu():
+        dist.init_process_group(
+            backend="hccl",
+            world_size=world_size,
+            rank=global_rank,
+        )
+    else:
+        dist.init_process_group(
+            backend="nccl",
+            world_size=world_size,
+            rank=global_rank,
+            device_id=torch.device(f"cuda:{local_rank}"),
+        )
     args = get_args()
     init(args)
+    if is_npu():
+        repatch(args)
 
     # if using AMD gpus, we have to do the conversion in cpu
     if hasattr(torch.version, "hip") and torch.version.hip is not None:

From bfaca7ceabb2ae0f1c4ec4973f5f7c4cb9fc4e08 Mon Sep 17 00:00:00 2001
From: quancs001 <quancs@qq.com>
Date: Sat, 25 Apr 2026 19:54:13 +0800
Subject: [PATCH 2/6] +qwen3-4b for NPU

---
 slime/slime/backends/megatron_utils/loss.py |   1 -
 slime/train.py                              |   2 +-
 toolcall-rl/retool_qwen3_4b_rl_npu.sh       | 202 ++++++++++++++++++++
 3 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 toolcall-rl/retool_qwen3_4b_rl_npu.sh

diff --git a/slime/slime/backends/megatron_utils/loss.py b/slime/slime/backends/megatron_utils/loss.py
index 82fe273e8d..c8fe1122e3 100644
--- a/slime/slime/backends/megatron_utils/loss.py
+++ b/slime/slime/backends/megatron_utils/loss.py
@@ -102,7 +102,6 @@ def get_responses(
     """
     qkv_format = args.qkv_format
 
-    assert logits.dtype == torch.float32, f"{logits.dtype}"
     assert len(logits.shape) == 3, f"{logits.shape}"
 
     logits_gib = logits.nelement() * logits.element_size() / (1 << 30)
diff --git a/slime/train.py b/slime/train.py
index 01883c4733..40efe3c81e 100644
--- a/slime/train.py
+++ b/slime/train.py
@@ -17,7 +17,7 @@ def train(args):
     rollout_manager, num_rollout_per_epoch = create_rollout_manager(args, pgs["rollout"])
 
     # create the actor and critic models
-    actor_model, critic_model = create_training_models(args, pgs, rollout_manager)
+    actor_model, critic_model, _ = create_training_models(args, pgs, rollout_manager)
 
     if args.offload_rollout:
         ray.get(rollout_manager.onload_weights.remote())
diff --git a/toolcall-rl/retool_qwen3_4b_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_rl_npu.sh
new file mode 100644
index 0000000000..1355a54218
--- /dev/null
+++ b/toolcall-rl/retool_qwen3_4b_rl_npu.sh
@@ -0,0 +1,202 @@
+#!/bin/bash
+
+# Kill leftover processes from a previous run so the task can be rerun cleanly
+pkill -9 sglang
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
+
+set -ex
+
+# keep stdout/stderr unbuffered in ray jobs
+export PYTHONUNBUFFERED=1
+export PYTHONFAULTHANDLER=1
+
+# default to 16 devices (8 actor + 8 rollout) if not set by scheduler
+NUM_GPUS=${NUM_GPUS:-16}
+ACTOR_GPUS=${ACTOR_GPUS:-8}
+ROLLOUT_GPUS=${ROLLOUT_GPUS:-8}
+
+# async mode usually runs actor/rollout on separate GPUs
+if (( ACTOR_GPUS + ROLLOUT_GPUS > NUM_GPUS )); then
+    echo "ACTOR_GPUS + ROLLOUT_GPUS must be <= NUM_GPUS"
+    echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, NUM_GPUS=${NUM_GPUS}"
+    exit 1
+fi
+
+# set visible devices
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+
+# Increase Ray heartbeat/health-check timeouts to reduce false node failures under heavy init.
+export RAY_health_check_failure_threshold=20
+export RAY_health_check_period_ms=5000
+export RAY_health_check_timeout_ms=30000
+export RAY_num_heartbeats_timeout=60
+
+export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+export RAY_DEBUG=1
+export RAY_DEDUP_LOGS=0
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+SLIME_DIR="$(cd -- "${SCRIPT_DIR}/../slime" &>/dev/null && pwd)"
+MEGATRON_LM_PATH=${MEGATRON_LM_PATH:-"${SCRIPT_DIR}/../../Megatron-LM"}
+MEGATRON_BRIDGE_PATH=${MEGATRON_BRIDGE_PATH:-"${SCRIPT_DIR}/../../Megatron-Bridge/src"}
+SGLANG_PATH=${SGLANG_PATH:-"${SCRIPT_DIR}/../../sglang/python"}
+source "${SLIME_DIR}/scripts/models/qwen3-4B.sh"
+
+HF_CKPT=${HF_CKPT:-/data_storage/wyj/systems/huggingface/hub/qwen3-4b-retool-sft}
+REF_LOAD=${REF_LOAD:-/data_storage/wyj/systems/huggingface/hub/qwen3-4b-retool-sft_torch_dist}
+SAVE_CKPT=${SAVE_CKPT:-/data_storage/wyj/OpenClaw-RL/ckpt/qwen3-4b-retool-rl/}
+RESUME_LOAD=${RESUME_LOAD:-${SAVE_CKPT}}
+# Use the existing run id to continue plotting on the same W&B curve.
+#WANDB_RESUME=${WANDB_RESUME:-must}
+
+CKPT_ARGS=(
+   --hf-checkpoint ${HF_CKPT}
+   --ref-load ${REF_LOAD}
+   --load ${RESUME_LOAD}
+   --save ${SAVE_CKPT}
+   --save-interval 20
+   --rotary-base 5000000
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data /data_storage/wyj/OpenClaw-RL/data/dapo-math-17k/dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+   --reward-key score
+   --num-rollout 3000
+   --rollout-batch-size 32
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 8192
+   --rollout-max-context-len 16384
+   --rollout-temperature 1
+
+   --num-steps-per-rollout 2
+   --balance-data
+)
+
+EVAL_ARGS=(
+   --eval-interval 20
+   --eval-prompt-data aime /data_storage/wyj/OpenClaw-RL/data/aime-2024/aime-2024.jsonl
+   --n-samples-per-eval-prompt 16
+   --eval-max-response-len 16384
+   --eval-max-context-len 32768
+   --eval-top-p 1
+   --eval-reward-key acc
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 4
+   --sequence-parallel
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 1
+   --expert-tensor-parallel-size 1
+
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   # --micro-batch-size 1
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 16384
+   --log-probs-chunk-size 1024
+)
+
+GRPO_ARGS=(
+   --advantage-estimator grpo
+   --use-kl-loss
+   --kl-loss-coef 0.01
+   --kl-loss-type k3
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+)
+
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+   --optimizer-cpu-offload
+   --overlap-cpu-optimizer-d2h-h2d
+   --use-precision-aware-optimizer
+)
+
+WANDB_ARGS=(
+   --use-wandb
+   --wandb-project slime_retool
+   --wandb-group qwen3-4B-rl_retool
+   --wandb-key ${WANDB_KEY}
+)
+
+SGLANG_ARGS=(
+   --rollout-num-gpus-per-engine 2
+   --sglang-mem-fraction-static 0.6
+   # ======================= Arguments added for NPU =======================
+   --sglang-device npu
+)
+
+MISC_ARGS=(
+   # default dropout in megatron is 0.1
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   # should be good for model performance
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   # need to comment this when using model with MLA
+   --attention-backend flash
+)
+
+CUSTOM_ARGS=(
+   --custom-generate-function-path generate_with_retool.generate
+   --custom-rm-path generate_with_retool.reward_func
+)
+
+export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"max_split_size_mb:2048,expandable_segments:True"}
+
+# launch the master node of ray in container
+export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+ray start --head --node-ip-address ${MASTER_ADDR} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+# Build the runtime environment JSON with proper variable substitution
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:$PYTHONPATH\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\"
+    \"ASCEND_TOOLKIT_HOME\": \"/path/to/cann/\",
+    \"ASCEND_OPP_PATH\": \"/path/to/cann/\",
+    \"ASCEND_AICPU_PATH\": \"/path/to/cann/\",
+    \"ASCEND_HOME_PATH\": \"/path/to/cann/\"
+  }
+}"
+
+ray job submit --address="http://127.0.0.1:8265" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   -- python3 train_async.py \
+   --actor-num-nodes 1 \
+   --actor-num-gpus-per-node ${ACTOR_GPUS} \
+   --rollout-num-gpus ${ROLLOUT_GPUS} \
+   ${MODEL_ARGS[@]} \
+   ${CKPT_ARGS[@]} \
+   ${ROLLOUT_ARGS[@]} \
+   ${OPTIMIZER_ARGS[@]} \
+   ${GRPO_ARGS[@]} \
+   ${WANDB_ARGS[@]} \
+   ${PERF_ARGS[@]} \
+   ${EVAL_ARGS[@]} \
+   ${SGLANG_ARGS[@]} \
+   ${MISC_ARGS[@]} \
+   ${CUSTOM_ARGS[@]}
\ No newline at end of file

From 2275943eef757b12fb8954959ab14ca420b641ea Mon Sep 17 00:00:00 2001
From: quancs001 <quancs@qq.com>
Date: Sat, 25 Apr 2026 19:59:01 +0800
Subject: [PATCH 3/6] unset proxy

---
 toolcall-rl/retool_qwen3_4b_rl_npu.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/toolcall-rl/retool_qwen3_4b_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_rl_npu.sh
index 1355a54218..5b46e0a9ad 100644
--- a/toolcall-rl/retool_qwen3_4b_rl_npu.sh
+++ b/toolcall-rl/retool_qwen3_4b_rl_npu.sh
@@ -12,6 +12,8 @@ pkill -9 python
 
 set -ex
 
+unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
+
 # keep stdout/stderr unbuffered in ray jobs
 export PYTHONUNBUFFERED=1
 export PYTHONFAULTHANDLER=1

From 46d1979d281962d3775623cb534231c23671ea9a Mon Sep 17 00:00:00 2001
From: quancs001 <quancs@qq.com>
Date: Sat, 25 Apr 2026 20:30:22 +0800
Subject: [PATCH 4/6] fix

---
 toolcall-rl/retool_qwen3_4b_rl_npu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/toolcall-rl/retool_qwen3_4b_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_rl_npu.sh
index 5b46e0a9ad..f733f9f8fa 100644
--- a/toolcall-rl/retool_qwen3_4b_rl_npu.sh
+++ b/toolcall-rl/retool_qwen3_4b_rl_npu.sh
@@ -166,7 +166,7 @@ CUSTOM_ARGS=(
    --custom-rm-path generate_with_retool.reward_func
 )
 
-export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"max_split_size_mb:2048,expandable_segments:True"}
+export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"expandable_segments:True"}
 
 # launch the master node of ray in container
 export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
@@ -177,7 +177,7 @@ RUNTIME_ENV_JSON="{
   \"env_vars\": {
     \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:$PYTHONPATH\",
     \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
-    \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\"
+    \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\",
     \"ASCEND_TOOLKIT_HOME\": \"/path/to/cann/\",
     \"ASCEND_OPP_PATH\": \"/path/to/cann/\",
     \"ASCEND_AICPU_PATH\": \"/path/to/cann/\",

From 8716644a6081d92f56f458b38da4235ab3c4d3ee Mon Sep 17 00:00:00 2001
From: quancs001 <quancs@qq.com>
Date: Sat, 25 Apr 2026 20:38:18 +0800
Subject: [PATCH 5/6] fix

---
 slime/slime/backends/megatron_utils/loss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slime/slime/backends/megatron_utils/loss.py b/slime/slime/backends/megatron_utils/loss.py
index c8fe1122e3..a768ca184d 100644
--- a/slime/slime/backends/megatron_utils/loss.py
+++ b/slime/slime/backends/megatron_utils/loss.py
@@ -88,7 +88,7 @@ def get_responses(
 
     Args:
         logits: Model outputs with shape ``[1, T, V]`` (policy) or
-            ``[1, T, 1]`` (value). Must be float32.
+            ``[1, T, 1]`` (value).
         args: Configuration containing ``rollout_temperature`` for scaling.
         unconcat_tokens: List of token tensors (prompt+response) per sample.
         total_lengths: Total sequence lengths (prompt+response) per sample.

From 99fa2a435811e965f04707e887c590de0635d14e Mon Sep 17 00:00:00 2001
From: liyongwen <1310439159@qq.com>
Date: Fri, 15 May 2026 14:43:21 +0800
Subject: [PATCH 6/6] =?UTF-8?q?=E6=8F=90=E4=BA=A4toolcall-rl-prm=E5=8F=8At?=
 =?UTF-8?q?erminal-rl=20npu=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 terminal-rl/remote/docker_compose_utils.py |   5 +-
 terminal-rl/remote/terminal_env.py         |   5 +-
 terminal-rl/terminal-rl_qwen3-8b_npu.sh    | 357 +++++++++++++++++++++
 toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh  | 226 +++++++++++++
 4 files changed, 591 insertions(+), 2 deletions(-)
 create mode 100644 terminal-rl/terminal-rl_qwen3-8b_npu.sh
 create mode 100644 toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh

diff --git a/terminal-rl/remote/docker_compose_utils.py b/terminal-rl/remote/docker_compose_utils.py
index 29a87781c5..3d801ec94b 100644
--- a/terminal-rl/remote/docker_compose_utils.py
+++ b/terminal-rl/remote/docker_compose_utils.py
@@ -125,7 +125,10 @@ def build_docker_image(task: dict[str, Any], timeout: float = 1200.0) -> None:
         sessions_logs_path=trial_handler.trial_paths.sessions_path,
         agent_logs_path=trial_handler.trial_paths.agent_logging_dir,
     )
-    compose_manager.build(timeout=timeout)
+    try:
+        compose_manager.build(timeout=timeout)
+    except TypeError:
+        compose_manager.build()
 
 
 def _resolve_pull_image(task: dict[str, Any]) -> str:
diff --git a/terminal-rl/remote/terminal_env.py b/terminal-rl/remote/terminal_env.py
index c9073cc653..b56309321d 100644
--- a/terminal-rl/remote/terminal_env.py
+++ b/terminal-rl/remote/terminal_env.py
@@ -139,7 +139,10 @@ def _sync_reset() -> tuple[str, list[dict[str, Any]]]:
                     logger=logger,
                 )
             else:
-                self._terminal.start(timeout=self._timeouts.reset_session)
+                try:
+                    self._terminal.start(timeout=self._timeouts.reset_session)
+                except TypeError:
+                    self._terminal.start()
                 try:
                     from .docker_compose_utils import (
                         _DEFAULT_CONTAINER_MEMORY_LIMIT,
diff --git a/terminal-rl/terminal-rl_qwen3-8b_npu.sh b/terminal-rl/terminal-rl_qwen3-8b_npu.sh
new file mode 100644
index 0000000000..5374a57239
--- /dev/null
+++ b/terminal-rl/terminal-rl_qwen3-8b_npu.sh
@@ -0,0 +1,357 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -ex
+
+unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
+ulimit -n 65535
+
+log() { echo "[$(date +'%F %T')] $*"; }
+
+require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing cmd: $1"; exit 1; }; }
+
+WORKER_URLS="http://localhost:18081"
+ROLLOUT_PROMPT_DATA="terminal-rl/dataset/seta_env_convert/train.jsonl"
+
+RAY_TMPDIR="ray_tmp"
+# CHECK_HOST="localhost"
+# ENV_SERVER_PORT="18081"
+
+export PYTHONUNBUFFERED=1
+export PYTHONFAULTHANDLER=1
+
+# set visible devices
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+export RAY_DEBUG=1
+export RAY_DEDUP_LOGS=0
+
+export RAY_health_check_failure_threshold=20
+export RAY_health_check_period_ms=5000
+export RAY_health_check_timeout_ms=30000
+export RAY_num_heartbeats_timeout=60
+
+NUM_GPUS="${NUM_GPUS:-8}"
+ACTOR_GPUS="${ACTOR_GPUS:-4}"
+ROLLOUT_GPUS="${ROLLOUT_GPUS:-4}"
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/.." && pwd)}"
+CUSTOM_CONFIG_PATH="${CUSTOM_CONFIG_PATH:-${SCRIPT_DIR}/configs/rollout_qwen3.yaml}"
+
+export REPO_ROOT
+export SLIME_DIR="${REPO_ROOT}/slime"
+export MEGATRON_DIR="Megatron-LM"
+
+source "${SLIME_DIR}/scripts/models/qwen3-8B.sh"
+
+# Paths: HF_CKPT/REF_LOAD/SAVE_CKPT are hard-coded relative paths below; edit them before running.
+HF_HOME="${HF_HOME:-}"
+HF_CKPT=Qwen3-8B
+REF_LOAD=Qwen3-8B-dist-slime
+SAVE_CKPT=Qwen3-8B-save
+RESUME_LOAD="${RESUME_LOAD:-${SAVE_CKPT}}"
+ROLLOUT_PROMPT_DATA="${ROLLOUT_PROMPT_DATA:-}"
+
+export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-max_split_size_mb:2048,expandable_segments:True}"
+export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+
+export USE_REMOTE_ENV="${USE_REMOTE_ENV:-1}"
+export PROVIDER_NAME="${PROVIDER_NAME:-pull}"
+export ENV_SERVER_BIND_HOST="${ENV_SERVER_BIND_HOST:-0.0.0.0}"
+export ENV_SERVER_PORT="${ENV_SERVER_PORT:-18080}"
+export ENV_SERVER_HOST="${ENV_SERVER_HOST:-${MASTER_ADDR}}"
+export ENV_SERVER_URL="${ENV_SERVER_URL:-}"
+export START_ENV_POOL_SERVER="${START_ENV_POOL_SERVER:-0}"
+
+# export RAY_TMPDIR="${RAY_TMPDIR:-}"
+
+export WORKER_URLS="${WORKER_URLS:-}"
+
+ROUTER_SESSION_NAME="${ROUTER_SESSION_NAME:-terminal_router}"
+CONDA_ENV_PATH="${CONDA_ENV_PATH:-}"
+ROUTER_PROJECT_DIR="${ROUTER_PROJECT_DIR:-${REPO_ROOT}}"
+export CONDA_ENV_PATH
+CONDA_PYTHON_VERSION="${CONDA_PYTHON_VERSION:-3.12}"
+export CONDA_PYTHON_VERSION
+ROUTER_HOST="${ROUTER_HOST:-0.0.0.0}"
+ROUTER_PORT="${ROUTER_PORT:-${ENV_SERVER_PORT}}"
+
+CHECK_HOST="${CHECK_HOST:-127.0.0.1}"
+CHECK_WAIT_SECS="${CHECK_WAIT_SECS:-60}"
+ROUTER_RESTART="${ROUTER_RESTART:-1}"
+
+CKPT_ARGS=(
+  --hf-checkpoint "${HF_CKPT}"
+  --ref-load "${REF_LOAD}"
+  # --load "${RESUME_LOAD}"
+  --save "${SAVE_CKPT}"
+  --save-interval 80000
+  --rotary-base 1000000
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data "${ROLLOUT_PROMPT_DATA}"
+   --input-key task
+   --rollout-shuffle
+   --reward-key score
+   --num-rollout 250
+   --rollout-batch-size 16
+   --n-samples-per-prompt 4
+   --rollout-max-response-len 8192
+   --rollout-max-context-len 16384
+   --rollout-temperature 1
+
+   --num-steps-per-rollout 2
+   --balance-data
+)
+
+EVAL_ARGS=(
+   --n-samples-per-eval-prompt 16
+   --eval-max-response-len 16384
+   --eval-top-p 1
+)
+
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 4
+   --sequence-parallel
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 1
+   --expert-tensor-parallel-size 1
+
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 16384
+   --log-probs-chunk-size 1024
+)
+
+GRPO_ARGS=(
+  --advantage-estimator grpo
+  --dynamic_history
+  --use-kl-loss
+  --kl-loss-coef 0.01
+  --kl-loss-type k3
+)
+
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+   --optimizer-cpu-offload
+   --overlap-cpu-optimizer-d2h-h2d
+   --use-precision-aware-optimizer
+)
+# W&B is enabled only when WANDB_KEY is set; project/group are then required (clear error instead of a bare `set -u` abort).
+if [[ -n "${WANDB_KEY:-}" ]]; then
+  WANDB_ARGS=(
+    --use-wandb
+    --wandb-project "${WANDB_PROJECT:?WANDB_PROJECT must be set when WANDB_KEY is set}"
+    --wandb-group "${WANDB_GROUP:?WANDB_GROUP must be set when WANDB_KEY is set}"
+    --wandb-key "${WANDB_KEY}"
+  )
+else
+  WANDB_ARGS=()
+fi
+
+SGLANG_ARGS=(
+   --rollout-num-gpus-per-engine 2
+   --sglang-mem-fraction-static 0.6
+)
+
+MISC_ARGS=(
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   --attention-backend flash
+)
+
+CUSTOM_ARGS=(
+   --custom-generate-function-path generate.generate
+   --custom-rollout-log-function-path rollout_log.rollout_log
+   --custom-config-path "${CUSTOM_CONFIG_PATH}"
+)
+
+check_gpus() {
+  if (( ACTOR_GPUS + ROLLOUT_GPUS > NUM_GPUS )); then
+    echo "ACTOR_GPUS + ROLLOUT_GPUS must be <= NUM_GPUS"
+    echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, NUM_GPUS=${NUM_GPUS}"
+    exit 1
+  fi
+}
+
+cleanup_prev() {
+  log "cleanup previous processes"
+  pkill -9 sglang || true
+  sleep 3
+  ray stop --force || true
+  pkill -9 ray || true
+  pkill -9 python || true
+  sleep 3
+  pkill -9 ray || true
+  pkill -9 python || true
+}
+
+start_router() {
+  require_cmd curl
+  mkdir -p "${ROUTER_PROJECT_DIR}/logs"
+  local logf="${ROUTER_PROJECT_DIR}/logs/router_${ROUTER_PORT}.log"
+
+  python -m terminal-rl.router_server \
+    --host "${ROUTER_HOST}" --port "${ROUTER_PORT}" --workers "${WORKER_URLS}" \
+    > "${logf}" 2>&1 &
+
+  export ROUTER_PID=$!
+  log "router started pid=${ROUTER_PID}, log=${logf}"
+
+  sleep 1
+  tail -n 50 "${logf}" || true
+}
+
+check_router() {
+  require_cmd curl
+  local base_url="http://${CHECK_HOST}:${ROUTER_PORT}"
+
+  log "wait router healthz up to ${CHECK_WAIT_SECS}s: ${base_url}/healthz"
+  for ((i=1; i<=CHECK_WAIT_SECS; i++)); do
+    if curl -fsS "${base_url}/healthz" >/dev/null 2>&1; then
+      log "router is up"
+      break
+    fi
+    sleep 1
+  done
+
+  log "curl ${base_url}/status"
+  curl -sS "${base_url}/status"
+  echo
+  log "curl ${base_url}/healthz"
+  curl -sS "${base_url}/healthz"
+  echo
+}
+
+detect_nvlink() {
+  local count
+  count="$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l || true)"
+  if [[ "${count:-0}" -gt 0 ]]; then
+    export HAS_NVLINK=1
+  else
+    export HAS_NVLINK=0
+  fi
+  log "HAS_NVLINK=${HAS_NVLINK} (detected ${count} NVLink references)"
+}
+
+maybe_fill_env_server_url() {
+  if [[ "${USE_REMOTE_ENV}" == "1" && -z "${ENV_SERVER_URL}" ]]; then
+    export ENV_SERVER_URL="http://${ENV_SERVER_HOST}:${ENV_SERVER_PORT}"
+    if [[ "${START_ENV_POOL_SERVER}" == "0" ]]; then
+      export START_ENV_POOL_SERVER=1
+    fi
+  fi
+  log "ENV_SERVER_URL=${ENV_SERVER_URL} START_ENV_POOL_SERVER=${START_ENV_POOL_SERVER}"
+}
+
+start_ray_head() {
+  require_cmd ray
+  log "start ray head"
+  mkdir -p "${RAY_TMPDIR}"
+  ray start --head \
+    --node-ip-address "${MASTER_ADDR}" \
+    --num-gpus "${NUM_GPUS}" \
+    --disable-usage-stats \
+    --dashboard-host=0.0.0.0 \
+    --dashboard-port=8265 \
+    --temp-dir "${RAY_TMPDIR}"
+}
+# --temp-dir "${RAY_TMPDIR}"
+
+build_runtime_env_json() {
+  python3 - <<'PY'
+import json, os
+
+conda_env = os.environ.get("CONDA_ENV_PATH", "")
+py_ver = os.environ.get("CONDA_PYTHON_VERSION", "3.12")
+site_packages = f"{conda_env}/lib/python{py_ver}/site-packages" if conda_env else ""
+
+parts = [
+  os.environ.get("REPO_ROOT",""),
+  os.environ.get("SLIME_PKG_DIR",""),
+  os.environ.get("MEGATRON_DIR",""),
+  os.environ.get("SCRIPT_DIR",""),
+  site_packages,
+]
+pythonpath = ":".join([p for p in parts if p])
+
+env_vars = {
+  "PYTHONPATH": pythonpath,
+  "CUDA_DEVICE_MAX_CONNECTIONS": "1",
+  "NCCL_NVLS_ENABLE": os.environ.get("HAS_NVLINK","0"),
+  "PYTORCH_CUDA_ALLOC_CONF": os.environ.get("PYTORCH_CUDA_ALLOC_CONF",""),
+  "USE_REMOTE_ENV": os.environ.get("USE_REMOTE_ENV","0"),
+  "ENV_SERVER_URL": os.environ.get("ENV_SERVER_URL",""),
+}
+print(json.dumps({"env_vars": env_vars}))
+PY
+}
+MEGATRON_LM_PATH=Megatron-LM
+MEGATRON_BRIDGE_PATH=Megatron-Bridge/src
+SGLANG_PATH=sglang/python
+export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"expandable_segments:True"}
+
+# Ray job runtime env; ${PYTHONPATH:-} avoids a `set -u` abort when PYTHONPATH is unset.
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:${PYTHONPATH:-}\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\",
+    \"ASCEND_TOOLKIT_HOME\": \"/usr/local/Ascend/cann-8.5.0/\",
+    \"ASCEND_OPP_PATH\": \"/usr/local/Ascend/cann-8.5.0/opp/\",
+    \"ASCEND_AICPU_PATH\": \"/usr/local/Ascend/cann-8.5.0/\",
+    \"ASCEND_HOME_PATH\": \"/usr/local/Ascend/cann-8.5.0/\"
+  }
+}"
+
+submit_job() {
+  log "submit ray job"
+  # Submit train_async.py to the local Ray head with the global RUNTIME_ENV_JSON.
+  # The output of build_runtime_env_json was never passed to ray, so it is no longer computed here.
+
+  ray job submit --address="http://127.0.0.1:8265" \
+    --runtime-env-json="${RUNTIME_ENV_JSON}" \
+    -- python3 "${SLIME_DIR}/train_async.py" \
+    --actor-num-nodes 1 \
+    --actor-num-gpus-per-node "${ACTOR_GPUS}" \
+    --rollout-num-gpus "${ROLLOUT_GPUS}" \
+    --skip-eval-before-train \
+    "${MODEL_ARGS[@]}" \
+    "${CKPT_ARGS[@]}" \
+    "${ROLLOUT_ARGS[@]}" \
+    "${OPTIMIZER_ARGS[@]}" \
+    "${GRPO_ARGS[@]}" \
+    "${WANDB_ARGS[@]}" \
+    "${PERF_ARGS[@]}" \
+    "${EVAL_ARGS[@]}" \
+    "${SGLANG_ARGS[@]}" \
+    "${MISC_ARGS[@]}" \
+    "${CUSTOM_ARGS[@]}"
+}
+
+cleanup_prev
+
+start_router
+check_router
+
+# check_gpus
+# detect_nvlink
+maybe_fill_env_server_url
+export SCRIPT_DIR
+start_ray_head
+submit_job
diff --git a/toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh b/toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh
new file mode 100644
index 0000000000..8b704b547b
--- /dev/null
+++ b/toolcall-rl/retool_qwen3_4b_prm_rl_npu.sh
@@ -0,0 +1,226 @@
+#!/bin/bash
+
+# Kill leftover processes from a previous run so the task can be rerun cleanly
+pkill -9 sglang
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
+
+set -ex
+unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
+ulimit -n 65535
+
+# keep stdout/stderr unbuffered in ray jobs
+export PYTHONUNBUFFERED=1
+export PYTHONFAULTHANDLER=1
+
+export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+export RAY_DEBUG=1
+export RAY_DEDUP_LOGS=0
+
+# default to 8 GPUs if not set by scheduler
+NUM_GPUS=${NUM_GPUS:-8}
+ACTOR_GPUS=${ACTOR_GPUS:-2}
+ROLLOUT_GPUS=${ROLLOUT_GPUS:-4}
+PRM_GPUS=${PRM_GPUS:-2}
+
+if (( ACTOR_GPUS + ROLLOUT_GPUS + PRM_GPUS > NUM_GPUS )); then
+    echo "ACTOR_GPUS + ROLLOUT_GPUS + PRM_GPUS must be <= NUM_GPUS"
+    echo "ACTOR_GPUS=${ACTOR_GPUS}, ROLLOUT_GPUS=${ROLLOUT_GPUS}, PRM_GPUS=${PRM_GPUS}, NUM_GPUS=${NUM_GPUS}"
+    exit 1
+fi
+
+# set visible devices
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+
+# Increase Ray heartbeat/health-check timeouts to reduce false node failures under heavy init.
+export RAY_health_check_failure_threshold=20
+export RAY_health_check_period_ms=5000
+export RAY_health_check_timeout_ms=30000
+export RAY_num_heartbeats_timeout=60
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+SLIME_DIR="$(cd -- "${SCRIPT_DIR}/../slime" &>/dev/null && pwd)"
+MEGATRON_LM_PATH=Megatron-LM
+MEGATRON_BRIDGE_PATH=Megatron-Bridge/src
+SGLANG_PATH=sglang/python
+source "${SLIME_DIR}/scripts/models/qwen3-4B.sh"
+
+HF_CKPT=qwen3-4b-sft-SGLang-RL
+REF_LOAD=qwen3-4b-sft-SGLang-RL-dist-slime
+SAVE_CKPT=qwen3-4b-sft-SGLang-RL-save
+# RESUME_LOAD=/workspace/OpenClaw-RL/ckpt/qwen3-4b-retool-rl
+# Use the existing run id to continue plotting on the same W&B curve.
+#WANDB_RESUME=${WANDB_RESUME:-must}
+
+PRM_MODEL_PATH=Qwen3-4B-Instruct-2507
+# DYNAMIC_HISTORY=1
+
+CKPT_ARGS=(
+   --hf-checkpoint ${HF_CKPT}
+   --ref-load ${REF_LOAD}
+   # --load ${RESUME_LOAD}
+   --save ${SAVE_CKPT}
+   --save-interval 50
+   --rotary-base 5000000
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+   --reward-key score
+   --num-rollout 220
+   --rollout-batch-size 32
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 8192
+   --rollout-max-context-len 16384
+   --rollout-temperature 1
+   --num-steps-per-rollout 2
+   --balance-data
+)
+
+EVAL_ARGS=(
+   --eval-interval 50
+   --eval-prompt-data aime aime-2024.jsonl
+   --n-samples-per-eval-prompt 16
+   --eval-max-response-len 16384
+   --eval-max-context-len 32768
+   --eval-top-p 1
+   --eval-reward-key acc
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 2
+   --sequence-parallel
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 1
+   --expert-tensor-parallel-size 1
+
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   # --micro-batch-size 1
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 16384
+   --log-probs-chunk-size 1024
+)
+
+GRPO_ARGS=(
+   --advantage-estimator step_wise
+   --use-kl-loss
+   --kl-loss-coef 0.01
+   --kl-loss-type k3
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+)
+
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+   --optimizer-cpu-offload
+   --overlap-cpu-optimizer-d2h-h2d
+   --use-precision-aware-optimizer
+)
+
+WANDB_ARGS=(
+   # --use-wandb
+   # --wandb-project slime_retool
+   # --wandb-group qwen3-4B-rl_retool
+   # --wandb-key ${WANDB_KEY}
+)
+
+SGLANG_ARGS=(
+   --rollout-num-gpus-per-engine 2
+   --sglang-mem-fraction-static 0.6
+   # ======================= Arguments added for NPU =======================
+   --sglang-device npu
+)
+
+PRM_ARGS=(
+   --prm-enable
+   --prm-num-gpus "${PRM_GPUS}"
+   --prm-num-gpus-per-engine 2
+   --prm-model-path "${PRM_MODEL_PATH}"
+   --prm-m "${PRM_M:-1}"
+   --prm-step-coef "${PRM_STEP_COEF:-1.0}"
+   --prm-temperature "${PRM_TEMPERATURE:-0.6}"
+   --prm-max-new-tokens "${PRM_MAX_NEW_TOKENS:-4096}"
+)
+
+MISC_ARGS=(
+   # default dropout in megatron is 0.1
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   # should be good for model performance
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   # need to comment this when using model with MLA
+   --attention-backend flash
+)
+
+CUSTOM_ARGS=(
+   --custom-generate-function-path generate_with_retool.generate
+   --custom-rm-path generate_with_retool.reward_func
+)
+
+DYNAMIC_HISTORY_ARGS=()
+if [[ "${DYNAMIC_HISTORY:-0}" == "1" ]]; then
+  DYNAMIC_HISTORY_ARGS+=(--dynamic_history)
+fi
+
+export PYTORCH_NPU_ALLOC_CONF=${PYTORCH_NPU_ALLOC_CONF:-"expandable_segments:True"}
+
+# launch the master node of ray in container
+export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+ray start --head --node-ip-address ${MASTER_ADDR} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+# Build the runtime environment JSON with proper variable substitution
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"${MEGATRON_LM_PATH}:${MEGATRON_BRIDGE_PATH}:${SGLANG_PATH}:${SCRIPT_DIR}:${SLIME_DIR}:$PYTHONPATH\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"PYTORCH_NPU_ALLOC_CONF\": \"${PYTORCH_NPU_ALLOC_CONF}\",
+    \"ASCEND_TOOLKIT_HOME\": \"/usr/local/Ascend/cann-8.5.0/\",
+    \"ASCEND_OPP_PATH\": \"/usr/local/Ascend/cann-8.5.0/opp/\",
+    \"ASCEND_AICPU_PATH\": \"/usr/local/Ascend/cann-8.5.0/\",
+    \"ASCEND_HOME_PATH\": \"/usr/local/Ascend/cann-8.5.0/\"
+  }
+}"
+
+ray job submit --address="http://127.0.0.1:8265" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   -- python3 train_async.py \
+   --actor-num-nodes 1 \
+   --actor-num-gpus-per-node ${ACTOR_GPUS} \
+   --rollout-num-gpus ${ROLLOUT_GPUS} \
+   --num-gpus-per-node ${NUM_GPUS} \
+   --skip-eval-before-train \
+   ${MODEL_ARGS[@]} \
+   ${CKPT_ARGS[@]} \
+   ${ROLLOUT_ARGS[@]} \
+   ${OPTIMIZER_ARGS[@]} \
+   ${GRPO_ARGS[@]} \
+   ${WANDB_ARGS[@]} \
+   ${PERF_ARGS[@]} \
+   ${EVAL_ARGS[@]} \
+   ${SGLANG_ARGS[@]} \
+   ${MISC_ARGS[@]} \
+   ${CUSTOM_ARGS[@]} \
+   ${PRM_ARGS[@]} \
+   ${DYNAMIC_HISTORY_ARGS[@]}