Traceback (most recent call last):
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 33, in <module>
main()
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 29, in main
pipeline.run()
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/fs/fast/ROLL/roll/pipeline/rlvr/rlvr_custom_vlm_pipeline.py", line 471, in run
model_update_metrics: Dict = self.model_update(global_step)
File "/fs/fast/ROLL/roll/pipeline/base_pipeline.py", line 74, in model_update
metrics.update(model_update_group.model_update(global_step))
File "/fs/fast/ROLL/roll/distributed/executor/model_update_group.py", line 35, in model_upda
te
dataprotos: list[DataProto] = ray.get(
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/auto_init_hook.
py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/client_mode_hoo
k.py", line 103, in wrapper
return func(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/client_mode_hoo
k.py", line 103, in wrapper
return func(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", lin
e 2822, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", lin
e 930, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::ActorWorker.start_model_update() (pid=46829, ip=10.0.0.2, actor_id=2f74a
d18013a46002d87ddc201000000, repr=ActorWorker(actor_train-0-G5))
File "/fs/fast/ROLL/roll/distributed/executor/worker.py", line 188, in start_model_update
exec_metrics: Dict = self.strategy.model_update(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/deepspeed_strategy.py", line 593, in model_upd
ate
return self.weight_updaters[model_update_name].model_update()
File "/fs/fast/ROLL/roll/third_party/deepspeed/model_update.py", line 79, in model_update
return self._colocated_model_update()
File "/fs/fast/ROLL/roll/third_party/deepspeed/model_update.py", line 167, in _colocated_mod
el_update
ray.get(refs)
ray.exceptions.RayTaskError: ray::InferWorker.update_parameter_in_bucket() (pid=47376, ip=10.0.0.2, actor_
id=c8f1bcaf9f05bbffadab9c2f01000000, repr=InferWorker(actor_infer-0-G45))
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/concurrent/futures/_base.py", line 458, in
result
return self.__get_result()
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/concurrent/futures/_base.py", line 403, in
__get_result
raise self._exception
File "/fs/fast/ROLL/roll/pipeline/base_worker.py", line 473, in update_parameter_in_bucket
await self.strategy.update_parameter_in_bucket(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/vllm_strategy.py", line 348, in update_paramet
er_in_bucket
await self.model.update_parameter_in_bucket(serialized_named_tensors, is_lora)
File "/fs/fast/ROLL/roll/third_party/vllm/async_llm.py", line 22, in update_parameter_in_buc
ket
await self.engine_core.collective_rpc_async(method="update_parameter_in_bucket", args=(serialized_name
d_tensors, is_lora))
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/v1/engine/core_client.p
y", line 747, in collective_rpc_async
return await self.call_utility_async("collective_rpc", method, timeout,
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/v1/engine/core_client.p
y", line 678, in call_utility_async
return await self._call_utility_async(method,
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/v1/engine/core_client.p
y", line 691, in _call_utility_async
return await future
Exception: Call to collective_rpc method failed: ray::RayWorkerWrapper.execute_method() (pid=49104, ip=10.
0.0.2, actor_id=380289d821e6833eeceacd0003000000, repr=<vllm.executor.ray_utils.RayWorkerWrapper object at
0x14da328ebb50>)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/worker/worker_base.py",
line 621, in execute_method
raise e
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/worker/worker_base.py",
line 612, in execute_method
return run_method(self, method, args, kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/utils.py", line 2378, i
n run_method
return func(*args, **kwargs)
File "/fs/fast/ROLL/roll/third_party/vllm/worker.py", line 144, in update_parameter_in_bucke
t
self.load_weights([(name, weight) for name, weight in named_params])
File "/fs/fast/ROLL/roll/third_party/vllm/worker.py", line 74, in load_weights
self.model_runner.model.load_weights(weights=weights)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/model_executor/models/q
wen2_5_vl.py", line 1116, in load_weights
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/model_executor/models/u
tils.py", line 261, in load_weights
autoloaded_weights = set(self._load_module("", self.module, weights))
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/model_executor/models/u
tils.py", line 250, in _load_module
raise ValueError(msg)
ValueError: There is no module or parameter named 'base_model' in Qwen2_5_VLForConditionalGeneration
Traceback (most recent call last):
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 33, in <module>
main()
File "/fs/fast/ROLL/examples/start_rlvr_vl_custom_pipeline.py", line 29, in main
pipeline.run()
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/fs/fast/ROLL/roll/pipeline/rlvr/rlvr_custom_vlm_pipeline.py", line 471, in run
model_update_metrics: Dict = self.model_update(global_step)
File "/fs/fast/ROLL/roll/pipeline/base_pipeline.py", line 74, in model_update
metrics.update(model_update_group.model_update(global_step))
File "/fs/fast/ROLL/roll/distributed/executor/model_update_group.py", line 35, in model_update
dataprotos: list[DataProto] = ray.get(
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/auto_init_hook.
py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/client_mode_hoo
k.py", line 103, in wrapper
return func(*args, **kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", lin
e 2822, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/ray/_private/worker.py", lin
e 930, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::ActorWorker.start_model_update() (pid=46829, ip=10.0.0.2, actor_id=2f74a
d18013a46002d87ddc201000000, repr=ActorWorker(actor_train-0-G5))
File "/fs/fast/ROLL/roll/distributed/executor/worker.py", line 188, in start_model_update
exec_metrics: Dict = self.strategy.model_update(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/deepspeed_strategy.py", line 593, in model_upd
ate
return self.weight_updaters[model_update_name].model_update()
File "/fs/fast/ROLL/roll/third_party/deepspeed/model_update.py", line 79, in model_update
return self._colocated_model_update()
File "/fs/fast/ROLL/roll/third_party/deepspeed/model_update.py", line 167, in _colocated_mod
el_update
ray.get(refs)
ray.exceptions.RayTaskError: ray::InferWorker.update_parameter_in_bucket() (pid=47376, ip=10.0.0.2, actor_
id=c8f1bcaf9f05bbffadab9c2f01000000, repr=InferWorker(actor_infer-0-G45))
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/concurrent/futures/_base.py", line 458, in
result
return self.__get_result()
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/concurrent/futures/_base.py", line 403, in
__get_result
raise self._exception
File "/fs/fast/ROLL/roll/pipeline/base_worker.py", line 473, in update_parameter_in_bucket
await self.strategy.update_parameter_in_bucket(*args, **kwargs)
File "/fs/fast/ROLL/roll/distributed/strategy/vllm_strategy.py", line 348, in update_paramet
er_in_bucket
await self.model.update_parameter_in_bucket(serialized_named_tensors, is_lora)
File "/fs/fast/ROLL/roll/third_party/vllm/async_llm.py", line 22, in update_parameter_in_buc
ket
await self.engine_core.collective_rpc_async(method="update_parameter_in_bucket", args=(serialized_name
d_tensors, is_lora))
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/v1/engine/core_client.p
y", line 747, in collective_rpc_async
return await self.call_utility_async("collective_rpc", method, timeout,
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/v1/engine/core_client.p
y", line 678, in call_utility_async
return await self._call_utility_async(method,
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/v1/engine/core_client.p
y", line 691, in _call_utility_async
return await future
Exception: Call to collective_rpc method failed: ray::RayWorkerWrapper.execute_method() (pid=49104, ip=10.
0.0.2, actor_id=380289d821e6833eeceacd0003000000, repr=<vllm.executor.ray_utils.RayWorkerWrapper object at
0x14da328ebb50>)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/worker/worker_base.py",
line 621, in execute_method
raise e
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/worker/worker_base.py",
line 612, in execute_method
return run_method(self, method, args, kwargs)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/utils.py", line 2378, i
n run_method
return func(*args, **kwargs)
File "/fs/fast/ROLL/roll/third_party/vllm/worker.py", line 144, in update_parameter_in_bucke
t
self.load_weights([(name, weight) for name, weight in named_params])
File "/fs/fast/ROLL/roll/third_party/vllm/worker.py", line 74, in load_weights
self.model_runner.model.load_weights(weights=weights)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/model_executor/models/q
wen2_5_vl.py", line 1116, in load_weights
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/model_executor/models/u
tils.py", line 261, in load_weights
autoloaded_weights = set(self._load_module("", self.module, weights))
File "/fs/fast/anaconda3/envs/qwen/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 250, in _load_module
raise ValueError(msg)
ValueError: There is no module or parameter named 'base_model' in Qwen2_5_VLForConditionalGeneration
报错如下:
yaml如下: