Commit 62a0a32

fixup! fix test_rope

1 parent e00ff2b commit 62a0a32

25 files changed

Lines changed: 233 additions & 169 deletions

.dev_scripts/xtuner_rl_path.pth

Lines changed: 1 addition & 25 deletions
@@ -1,25 +1 @@
-import os
-import sys
-import warnings
-
-if os.getenv('XTUNER_USE_LMDEPLOY').lower() in ['1', 'on', 'true']:
-    lmdeploy_envs_dir = os.getenv('XTUNER_LMDEPLOY_ENVS_DIR', '/envs/lmdeploy')
-    if lmdeploy_envs_dir not in sys.path:
-        sys.path.insert(0, lmdeploy_envs_dir)
-        warnings.warn(
-            f"XTUNER_USE_LMDEPLOY is set to true. Injected {lmdeploy_envs_dir} into sys.path for lmdeploy imports."
-        )
-
-elif os.getenv('XTUNER_USE_SGLANG').lower() in ['1', 'on', 'true']:
-    sglang_envs_dir = os.getenv('XTUNER_SGLANG_ENVS_DIR', '/envs/sglang')
-    if sglang_envs_dir not in sys.path:
-        sys.path.insert(0, sglang_envs_dir)
-        warnings.warn(
-            f"XTUNER_USE_SGLANG is set to true. Injected {sglang_envs_dir} into sys.path for sglang imports."
-        )
-
-# else:
-#     warnings.warn(
-#         "Neither XTUNER_USE_LMDEPLOY nor XTUNER_USE_SGLANG is set to true. No custom paths will be injected."
-#     )
-
+import xtuner_rl_path
.dev_scripts/xtuner_rl_path.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import os
+import sys
+
+dist_packages_index = 0
+for i, path in enumerate(sys.path):
+    if path.endswith("dist-packages"):
+        dist_packages_index = i
+        break
+
+if os.getenv('XTUNER_USE_LMDEPLOY', '').lower() in ['1', 'on', 'true']:
+    lmdeploy_envs_dir = os.getenv('XTUNER_LMDEPLOY_ENVS_DIR', '/envs/lmdeploy')
+    if lmdeploy_envs_dir not in sys.path:
+        sys.path.insert(dist_packages_index, lmdeploy_envs_dir)
+
+elif os.getenv('XTUNER_USE_SGLANG', '').lower() in ['1', 'on', 'true']:
+    sglang_envs_dir = os.getenv('XTUNER_SGLANG_ENVS_DIR', '/envs/sglang')
+    if sglang_envs_dir not in sys.path:
+        sys.path.insert(dist_packages_index, sglang_envs_dir)
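Note on the mechanism behind this refactor: CPython's site module executes only those lines of a .pth file that begin with "import", so the old multi-line conditional logic could not run from xtuner_rl_path.pth itself; it now lives in a regular xtuner_rl_path.py module and the .pth file shrinks to a single import line. The new module also defaults the os.getenv lookups to '' so an unset variable no longer raises AttributeError on .lower(), and it inserts at the dist-packages index rather than index 0, letting the injected directory shadow system-installed packages without jumping ahead of the standard library. A sanity check might look like the hypothetical snippet below (written for this note, not part of the commit):

# check_injection.py -- hypothetical sanity check, not part of this commit.
# Run with the flag set before interpreter startup, e.g.:
#     XTUNER_USE_LMDEPLOY=1 python check_injection.py
# because the installed .pth file triggers `import xtuner_rl_path` during
# site initialization, before any script code executes.
import os
import sys

target = os.getenv("XTUNER_LMDEPLOY_ENVS_DIR", "/envs/lmdeploy")
assert target in sys.path, "env dir was not injected into sys.path"

# After insertion, the env dir should precede dist-packages, so packages
# vendored there shadow the system-installed copies.
dist = next((i for i, p in enumerate(sys.path) if p.endswith("dist-packages")), None)
if dist is not None:
    assert sys.path.index(target) < dist, "env dir should precede dist-packages"
print("sys.path injection OK at index", sys.path.index(target))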

Dockerfile

Lines changed: 3 additions & 2 deletions
@@ -268,8 +268,9 @@ ARG LMDEPLOY_WHEELS=https://github.com/InternLM/lmdeploy/releases/download/v${LM
 RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
     --mount=type=secret,id=NO_PROXY,env=no_proxy \
     pip install fastapi fire openai outlines \
-    partial_json_parser 'ray[default]<3' shortuuid uvicorn \
+    partial_json_parser 'ray[default]<3' shortuuid uvicorn pybase64 \
     'pydantic>2' openai_harmony dlblas --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-cache-dir -i ${DEFAULT_PYPI_URL} && \
+    pip install xgrammar==0.1.32 --no-cache-dir -i ${DEFAULT_PYPI_URL} --no-deps && \
     if [ -n "${LMDEPLOY_VERSION}" ]; then \
     # pip install lmdeploy==${LMDEPLOY_VERSION} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
     echo pip install ${LMDEPLOY_WHEELS} --target ${XTUNER_LMDEPLOY_ENVS_DIR} --no-deps --no-cache-dir -i ${DEFAULT_PYPI_URL}; \
@@ -293,7 +294,7 @@ COPY . ${CODESPACE}/xtuner
 WORKDIR ${CODESPACE}/xtuner

 # Install custom .pth file for conditional lmdeploy and sglang path injection
-RUN cp .dev_scripts/xtuner_rl_path.pth ${PYTHON_SITE_PACKAGE_PATH}/xtuner_rl_path.pth
+RUN cp -r .dev_scripts/xtuner_rl_path* ${PYTHON_SITE_PACKAGE_PATH}/

 # RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
 RUN pip install .[all] -v --no-cache-dir -i ${DEFAULT_PYPI_URL}
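With the glob in place, both xtuner_rl_path.pth and the new xtuner_rl_path.py module are copied into site-packages in one step. A hypothetical post-build smoke test (not part of this commit, and assuming ${PYTHON_SITE_PACKAGE_PATH} is the directory sysconfig reports as "purelib") could confirm the copy:

# smoke_test.py -- hypothetical, not part of this commit.
import importlib
import pathlib
import sysconfig

# Assumption: ${PYTHON_SITE_PACKAGE_PATH} in the Dockerfile matches the
# interpreter's purelib directory.
site = pathlib.Path(sysconfig.get_paths()["purelib"])
assert (site / "xtuner_rl_path.pth").exists(), "missing .pth file"
assert (site / "xtuner_rl_path.py").exists(), "missing module file"
importlib.import_module("xtuner_rl_path")  # should import without error
print("xtuner_rl_path installed at", site)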

ci/scripts/CI_ENV.sh

Lines changed: 7 additions & 7 deletions
@@ -33,12 +33,12 @@ export PYTEST_ADDOPTS='-o cache_dir=/tmp/.pytest_cache'
 # Some DDP test will cost more than 300s, set it to 600 avoid timeout error.
 export DISTRIBUTED_TESTS_DEFAULT_TIMEOUT=600

-proxy_off
-pip install .[all]
+# proxy_off
+# pip install .[all]

-TORCH_VERSION=$(python -c "import torch;print(torch.__version__.split('+')[0])")
-if [[ $TORCH_VERSION == "2.9.1" ]]; then
-  pip install nvidia-cudnn-cu12==9.15.1.9
-fi
+# TORCH_VERSION=$(python -c "import torch;print(torch.__version__.split('+')[0])")
+# if [[ $TORCH_VERSION == "2.9.1" ]]; then
+#   pip install nvidia-cudnn-cu12==9.15.1.9
+# fi

-export PYTHONPATH=${LM_DEPLOY}:$PYTHONPATH
+# export PYTHONPATH=${LM_DEPLOY}:$PYTHONPATH

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ dependencies = [
     "datasets<4.0.0",
     "einops",
     "loguru",
-    "mmengine==0.11.0rc0",
+    "mmengine==0.11.0rc2",
     "openpyxl",
     "peft>=0.14.0",
     "scikit-image",

tests/model/test_gpt_oss_moe.py

Lines changed: 6 additions & 6 deletions
@@ -33,8 +33,8 @@ class TestGptOss(DeterministicDDPTestCase):
     @parametrize.parametrize(
         "device,dispatcher,ep_size,compile,tol,loss_class",
         [
-            ("cuda", "all2all", 8, False, 1e-2, "cross_entropy"),
-            ("cuda", None, 1, False, 1e-2, "cross_entropy"),
+            ("cuda", "all2all", 8, False, 3e-2, "cross_entropy"),
+            ("cuda", None, 1, False, 3e-2, "cross_entropy"),
             # ("cuda", None, 1, False, 1e-2, "chunk_cross_entropy"),
         ],
     )
@@ -70,7 +70,7 @@ def test_gpt_oss_run(self, device, dispatcher, ep_size, compile, tol, loss_class
         cfg = GptOss21BA3P6Config(compile_cfg=False)
         cfg.dispatcher = dispatcher
         cfg.ep_size = ep_size
-        gpt_oss_model = cfg.build().to(torch.bfloat16)
+        gpt_oss_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         shift_input_ids = input_ids[:, :-1]
         shifted_labels = input_ids[:, 1:]
@@ -128,7 +128,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size):
         cfg = GptOss21BA3P6Config(compile_cfg=False)
         cfg.ep_size = ep_size
         cfg.dispatcher = dispatcher
-        gpt_oss_model = cfg.build().to(torch.bfloat16)
+        gpt_oss_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             ep_size=ep_size,
@@ -155,7 +155,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size):
             loss_ctx={"lm": loss_ctx},
         )
         loss = output["loss"]
-        self.assertTrue(torch.allclose(loss, expected_loss.to(loss.dtype), atol=1e-2, rtol=1e-2))
+        self.assertTrue(torch.allclose(loss, expected_loss.to(loss.dtype), atol=5e-2, rtol=5e-2))

     @parametrize.parametrize(
         "device,dispatcher,ep_size",
@@ -170,7 +170,7 @@ def test_save_hf(self, device, dispatcher, ep_size):
         cfg = GptOss21BA3P6Config()
         cfg.dispatcher = dispatcher
         cfg.ep_size = ep_size
-        gpt_oss_model = cfg.build().to(torch.bfloat16)
+        gpt_oss_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             ep_size=ep_size,
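The recurring replacement of .to(torch.bfloat16) with _to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True), here and in the test files below, lines up with the commit message "fixup! fix test_rope": a plain .to(dtype) casts registered buffers as well as parameters, including RoPE inverse-frequency tables that are typically kept in float32, which degrades rotary-embedding precision. (The same commit also loosens several test tolerances.) The helper's real implementation is not shown in this diff; a rough sketch of the presumed behavior:

# Hypothetical sketch of what a skip_buffers_dtype cast might do; the
# actual xtuner implementation is not shown in this diff.
import torch
from torch import nn

def to_device_dtype(model: nn.Module, dtype: torch.dtype,
                    skip_buffers_dtype: bool = False) -> nn.Module:
    # Cast parameters to the requested dtype.
    for param in model.parameters():
        param.data = param.data.to(dtype=dtype)
    # Only cast floating-point buffers when skip_buffers_dtype is False;
    # otherwise leave buffers (e.g. RoPE inv_freq) in their original dtype.
    if not skip_buffers_dtype:
        for buf in model.buffers():
            if buf.is_floating_point():
                buf.data = buf.data.to(dtype=dtype)
    return model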

tests/model/test_intern_s1.py

Lines changed: 5 additions & 5 deletions
@@ -64,7 +64,7 @@ def test_interns1_text_run(self, device, tol):
         with torch.device("meta"):
             model_cfg = InternS1MiniConfig()
             model_cfg.compile_cfg = False
-            interns1_model = model_cfg.build().to(torch.bfloat16)
+            interns1_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         interns1_model.from_hf(INTERNS1_DENSE_PATH)
         interns1_model.eval() # avoid open drop_path
@@ -162,7 +162,7 @@ def test_interns1_image_run(self, device, sp_size, tol):
         with torch.device("meta"):
             model_cfg = InternS1MiniConfig()
             model_cfg.compile_cfg = False
-            interns1_model = model_cfg.build().to(torch.bfloat16)
+            interns1_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         interns1_model.from_hf(INTERNS1_DENSE_PATH)
         interns1_model.eval() # avoid open drop_path
@@ -238,7 +238,7 @@ def test_fsdp_text_accuracy(self, device, tol):
         with torch.device("meta"):
             model_cfg = InternS1MiniConfig()
             model_cfg.compile_cfg = False
-            interns1_model = model_cfg.build().to(torch.bfloat16)
+            interns1_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             cpu_offload=False,
@@ -342,7 +342,7 @@ def test_fsdp_image_accuracy(self, device, sp_size, compile, tol):
             model_cfg = InternS1MiniConfig()
             if not compile:
                 model_cfg.compile_cfg = False
-            interns1_model = model_cfg.build().to(torch.bfloat16)
+            interns1_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             cpu_offload=False,
@@ -394,7 +394,7 @@ def test_save_hf(self, device, tp_size):
         self.create_pg(device)
         with torch.device("meta"):
             model_cfg = InternS1MiniConfig()
-            interns1_model = model_cfg.build().to(torch.bfloat16)
+            interns1_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             tp_size=tp_size,

tests/model/test_qwen3_5.py

Lines changed: 4 additions & 4 deletions
@@ -199,7 +199,7 @@ def test_qwen3_5_vl_run(self, device, sp_size, tol):
             model_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
             # hf_save_cfg of text_model is ignored to align with transformers's forward result
             model_cfg.text_config.hf_save_cfg = HFSaveCfg()
-            qwen3vl_model = model_cfg.build().to(torch.bfloat16)
+            qwen3vl_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         qwen3vl_model.from_hf(QWEN3_VL_MOE_PATH)
         qwen3vl_model.eval()
@@ -221,7 +221,7 @@ def test_qwen3_5_vl_run(self, device, sp_size, tol):
             model_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
             # hf_save_cfg of text_model is ignored to align with transformers's forward result
             model_cfg.text_config.hf_save_cfg = HFSaveCfg()
-            qwen3vl_model = model_cfg.build().to(torch.bfloat16)
+            qwen3vl_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(cpu_offload=False)
         fsdp_mesh = init_world_mesh()
@@ -260,7 +260,7 @@ def test_qwen3_5_vl_run_mtp(self, device, sp_size, tol):
         with torch.device("meta"):
             model_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
             model_cfg.text_config.mtp_config = MTPConfig(num_layers=1, loss_scaling_factor=1)
-            qwen3vl_model = model_cfg.build().to(torch.bfloat16)
+            qwen3vl_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         qwen3vl_model.from_hf(QWEN3_VL_MOE_PATH)
         qwen3vl_model.eval()
@@ -308,7 +308,7 @@ def test_save_hf_with_mtp(self, device, sp_size):
         with torch.device("meta"):
             model_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
             model_cfg.text_config.mtp_config = MTPConfig(num_layers=1)
-            qwen3vl_model = model_cfg.build().to(torch.bfloat16)
+            qwen3vl_model = model_cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(cpu_offload=False)
         fsdp_mesh = init_world_mesh()

tests/model/test_qwen3_dense.py

Lines changed: 5 additions & 5 deletions
@@ -55,7 +55,7 @@ def test_qwen3_dense_run(self, device, tp_size, compile, tol, loss_class):
         cfg = Qwen3Dense8BConfig()
         if not compile:
             cfg.compile_cfg = False
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         shift_input_ids = input_ids[:, :-1]
         shifted_labels = input_ids[:, 1:]
@@ -107,7 +107,7 @@ def test_fsdp_accuracy(self, device, tp_size):

         with torch.device("meta"):
             cfg = Qwen3Dense8BConfig(compile_cfg=False)
-            qwen_model = cfg.build().to(torch.bfloat16)
+            qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             tp_size=tp_size,
@@ -158,7 +158,7 @@ def test_sliding_windows(self, use_sliding_window, max_window_layers, sliding_wi
             use_sliding_window=use_sliding_window,
             max_window_layers=max_window_layers,
             attention=attention)
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
         loss_cfg = CELossConfig()

         if use_sliding_window is False or max_window_layers >= num_hidden_layers:
@@ -186,7 +186,7 @@ def test_sliding_windows(self, use_sliding_window, max_window_layers, sliding_wi
             use_sliding_window=use_sliding_window,
             max_window_layers=max_window_layers,
             attention=attention)
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig()
         tokenizer = AutoTokenizer.from_pretrained(QWEN3_PATH, trust_remote_code=True)
@@ -221,7 +221,7 @@ def test_save_hf(self, device, tp_size):
         self.create_pg(device)
         with torch.device("meta"):
             cfg = Qwen3Dense8BConfig()
-            qwen_model = cfg.build().to(torch.bfloat16)
+            qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             tp_size=tp_size,

tests/model/test_qwen3_moe.py

Lines changed: 6 additions & 22 deletions
@@ -30,15 +30,6 @@
 QWEN3_MOE_FOPE_PATH = os.environ["QWEN3_MOE_FOPE_PATH"]


-# Skip fope tests for transformers >= 5.2.0 due to SlidingWindowCache incompatibility
-# in the model's remote code
-def skip_if_fope_incompatible(model_type):
-    """Skip fope model tests if transformers version is incompatible."""
-    if model_type == "qwen3_moe_fope" and Version(transformers_version) >= Version("5.2.0"):
-        return True
-    return False
-
-
 class TestQwen3MoE(DeterministicDDPTestCase):
     def prepare(self):
         self.temp_dir = tempfile.TemporaryDirectory()
@@ -58,8 +49,6 @@ def prepare(self):
     )
     def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_mode, model_type):
         assert model_type in ["qwen3_moe", "qwen3_moe_fope"]
-        if skip_if_fope_incompatible(model_type):
-            raise unittest.SkipTest(f"Skipping fope test for transformers {transformers_version} due to SlidingWindowCache incompatibility")
         os.environ["TRITON_CACHE_DIR"] = str(Path(self.temp_dir.name) / "triton_cache")
         self.create_pg(device)

@@ -99,7 +88,7 @@ def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_mod
             cfg.compile_cfg = False
         cfg.dispatcher = dispatcher
         cfg.ep_size = ep_size
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
         qwen_model.from_hf(hf_model_path)

         losses = []
@@ -139,8 +128,6 @@ def test_qwen3_moe_run(self, device, dispatcher, ep_size, compile, tol, loss_mod
     )
     def test_fsdp_accuracy(self, device, dispatcher, ep_size, model_type):
         assert model_type in ["qwen3_moe", "qwen3_moe_fope"]
-        if skip_if_fope_incompatible(model_type):
-            raise unittest.SkipTest(f"Skipping fope test for transformers {transformers_version} due to SlidingWindowCache incompatibility")
         self.create_pg(device)

         hf_model_path = QWEN3_MOE_PATH if model_type == "qwen3_moe" else QWEN3_MOE_FOPE_PATH
@@ -179,7 +166,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size, model_type):
             cfg.compile_cfg = False
         cfg.ep_size = ep_size
         cfg.dispatcher = dispatcher
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             ep_size=ep_size,
@@ -212,7 +199,7 @@ def test_fsdp_accuracy(self, device, dispatcher, ep_size, model_type):
             loss = output["loss"]
             losses.append(loss)

-        self._check_loss_curve(losses=torch.tensor(losses), losses_ref=torch.tensor(expected_losses), sim_tol=1e-2, rtol=1e-2)
+        self._check_loss_curve(losses=torch.tensor(losses), losses_ref=torch.tensor(expected_losses), sim_tol=3e-2, rtol=3e-2)

     @parametrize.parametrize(
         "use_sliding_window, max_window_layers, sliding_window",
@@ -235,7 +222,7 @@ def test_sliding_windows(self, use_sliding_window, max_window_layers, sliding_wi
             use_sliding_window=use_sliding_window,
             max_window_layers=max_window_layers,
             attention=attention)
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)
         loss_cfg = CELossConfig()

         if use_sliding_window is False or max_window_layers >= num_hidden_layers:
@@ -264,7 +251,7 @@ def test_sliding_windows(self, use_sliding_window, max_window_layers, sliding_wi
             use_sliding_window=use_sliding_window,
             max_window_layers=max_window_layers,
             attention=attention)
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig()
         tokenizer = AutoTokenizer.from_pretrained(QWEN3_MOE_PATH, trust_remote_code=True)
@@ -303,7 +290,7 @@ def test_save_hf(self, device, dispatcher, ep_size):
         cfg = Qwen3MoE30BA3Config()
         cfg.dispatcher = dispatcher
         cfg.ep_size = ep_size
-        qwen_model = cfg.build().to(torch.bfloat16)
+        qwen_model = cfg.build()._to_device_dtype(dtype=torch.bfloat16, skip_buffers_dtype=True)

         fsdp_config = FSDPConfig(
             ep_size=ep_size,
@@ -382,8 +369,6 @@ def test_fope_auto_config_with_remote_code(self):
         ],
     )
     def test_save_hf_fope(self, device, dispatcher, ep_size):
-        if Version(transformers_version) >= Version("5.2.0"):
-            raise unittest.SkipTest(f"Skipping fope test for transformers {transformers_version} due to SlidingWindowCache incompatibility")
         self.create_pg(device)
         with tempfile.TemporaryDirectory() as tmpdir:
             load_from = Path(QWEN3_MOE_FOPE_PATH)
@@ -518,4 +503,3 @@ def check_dict_equal(dict1: dict, dict2: dict) -> bool:
             print(f"[ERROR] key {key} value is not equal")
             return False
     return True
-
