@@ -30,15 +30,6 @@
 QWEN3_MOE_FOPE_PATH = os.environ["QWEN3_MOE_FOPE_PATH"]
 
 
-# Skip fope tests for transformers >= 5.2.0 due to SlidingWindowCache incompatibility
-# in the model's remote code
-def skip_if_fope_incompatible(model_type):
-    """Skip fope model tests if transformers version is incompatible."""
-    if model_type == "qwen3_moe_fope" and Version(transformers_version) >= Version("5.2.0"):
-        return True
-    return False
-
-
 class TestQwen3MoE(DeterministicDDPTestCase):
     def prepare(self):
         self.temp_dir = tempfile.TemporaryDirectory()
@@ -58,8 +49,6 @@ def prepare(self):
     )
     def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_mode, model_type):
         assert model_type in ["qwen3_moe", "qwen3_moe_fope"]
-        if skip_if_fope_incompatible(model_type):
-            raise unittest.SkipTest(f"Skipping fope test for transformers {transformers_version} due to SlidingWindowCache incompatibility")
         os.environ["TRITON_CACHE_DIR"] = str(Path(self.temp_dir.name) / "triton_cache")
         self.create_pg(device)
 
@@ -99,7 +88,7 @@ def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_mod
         cfg.compile_cfg = False
         cfg.dispatcher = dispatcher
         cfg.ep_size = ep_size
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
         qwen_model.from_hf(hf_model_path)
 
         losses = []
@@ -139,8 +128,6 @@ def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_mod
     )
     def test_fsdp_accuracy(self, device, dispatcher, ep_size, model_type):
         assert model_type in ["qwen3_moe", "qwen3_moe_fope"]
-        if skip_if_fope_incompatible(model_type):
-            raise unittest.SkipTest(f"Skipping fope test for transformers {transformers_version} due to SlidingWindowCache incompatibility")
         self.create_pg(device)
 
         hf_model_path = QWEN3_MOE_PATH if model_type == "qwen3_moe" else QWEN3_MOE_FOPE_PATH
@@ -179,7 +166,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size, model_type):
         cfg.compile_cfg = False
         cfg.ep_size = ep_size
         cfg.dispatcher = dispatcher
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
 
         fsdp_config = FSDPConfig(
             ep_size=ep_size,
@@ -212,7 +199,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size, model_type):
             loss = output["loss"]
             losses.append(loss)
 
-        self._check_loss_curve(losses=torch.tensor(losses), losses_ref=torch.tensor(expected_losses), sim_tol=1e-2, rtol=1e-2)
+        self._check_loss_curve(losses=torch.tensor(losses), losses_ref=torch.tensor(expected_losses), sim_tol=3e-2, rtol=3e-2)
 
     @parametrize.parametrize(
         "use_sliding_window, max_window_layers, sliding_window",
@@ -235,7 +222,7 @@ def test_sliding_windows(self, use_sliding_window, max_window_layers, sliding_wi
             use_sliding_window=use_sliding_window,
             max_window_layers=max_window_layers,
             attention=attention)
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
         loss_cfg = CELossConfig()
 
         if use_sliding_window is False or max_window_layers >= num_hidden_layers:
@@ -264,7 +251,7 @@ def test_sliding_windows(self, use_sliding_window, max_window_layers, sliding_wi
             use_sliding_window=use_sliding_window,
             max_window_layers=max_window_layers,
             attention=attention)
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
 
         fsdp_config = FSDPConfig()
         tokenizer = AutoTokenizer.from_pretrained(QWEN3_MOE_PATH, trust_remote_code=True)
@@ -303,7 +290,7 @@ def test_save_hf(self, device, dispatcher, ep_size):
         cfg = Qwen3MoE30BA3Config()
         cfg.dispatcher = dispatcher
         cfg.ep_size = ep_size
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
 
         fsdp_config = FSDPConfig(
             ep_size=ep_size,
@@ -382,8 +369,6 @@ def test_fope_auto_config_with_remote_code(self):
382369 ],
383370 )
384371 def test_save_hf_fope (self , device , dispatcher , ep_size ):
385- if Version (transformers_version ) >= Version ("5.2.0" ):
386- raise unittest .SkipTest (f"Skipping fope test for transformers { transformers_version } due to SlidingWindowCache incompatibility" )
387372 self .create_pg (device )
388373 with tempfile .TemporaryDirectory () as tmpdir :
389374 load_from = Path (QWEN3_MOE_FOPE_PATH )
@@ -518,4 +503,3 @@ def check_dict_equal(dict1: dict, dict2: dict) -> bool:
             print(f"[ERROR] key {key} value is not equal")
             return False
     return True
-
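
Note on the recurring change above: `cfg.build().to(torch.bfloat16)` is replaced everywhere with `cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)`. Below is a minimal sketch of what such a helper could look like, assuming `skip_buffers_dtype=True` means "cast parameters to the target dtype but leave buffer dtypes untouched"; the helper name `to_device_dtype` and this body are illustrative, not the repo's private `_to_device_dtype` implementation.

```python
import torch
from torch import nn


def to_device_dtype(module: nn.Module, dtype: torch.dtype,
                    skip_buffers_dtype: bool = False) -> nn.Module:
    """Hypothetical sketch of a buffer-preserving dtype cast."""
    if not skip_buffers_dtype:
        # Plain module.to(dtype) casts parameters *and* floating-point buffers.
        return module.to(dtype)
    # Cast only floating-point parameters; buffers keep their original dtype,
    # so precision-sensitive state (e.g. a float32 RoPE inv_freq table) is
    # not silently downcast to bfloat16.
    for param in module.parameters():
        if param.is_floating_point():
            param.data = param.data.to(dtype)
    return module


# Usage mirroring the diff (cfg is assumed to expose a build() method):
# qwen_model = to_device_dtype(cfg.build(), torch.bfloat16, skip_buffers_dtype=True)
```

Skipping buffers in the cast is a common choice when a model keeps auxiliary state in float32 for numerical accuracy, which would also explain the loosened `sim_tol`/`rtol` in `test_fsdp_accuracy`.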