Skip to content

Commit ceaf5df

Browse files
authored
[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)
1 parent fdfc908 commit ceaf5df

5 files changed

Lines changed: 74 additions & 10 deletions

File tree

fastdeploy/distributed/communication.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from paddle.distributed import fleet
2222

2323
import fastdeploy.envs as envs
24+
from fastdeploy.platforms import current_platform
2425
from fastdeploy.utils import get_logger, register_custom_python_op
2526

2627
logger = get_logger("communication")
@@ -161,12 +162,21 @@ def tensor_model_parallel_all_reduce(
161162
return _TP_AR.custom_all_reduce(input_)
162163

163164
if paddle.in_dynamic_mode():
164-
if group_ is not None:
165-
dist.all_reduce(input_, group=group_)
165+
if current_platform.is_iluvatar():
166+
# use_calc_stream=False raises an event sync error when CUDA graph is enabled and tp_size > 1
167+
if group_ is not None:
168+
stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
169+
else:
170+
hcg = fleet.get_hybrid_communicate_group()
171+
mp_group = hcg.get_model_parallel_group()
172+
stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
166173
else:
167-
hcg = fleet.get_hybrid_communicate_group()
168-
mp_group = hcg.get_model_parallel_group()
169-
dist.all_reduce(input_, group=mp_group)
174+
if group_ is not None:
175+
dist.all_reduce(input_, group=group_)
176+
else:
177+
hcg = fleet.get_hybrid_communicate_group()
178+
mp_group = hcg.get_model_parallel_group()
179+
dist.all_reduce(input_, group=mp_group)
170180
else:
171181
dist.all_reduce(input_)
172182
return input_

fastdeploy/model_executor/model_loader/default_loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def download_model(self, model_config: ModelConfig) -> None:
4343

4444
def clean_memory_fragments(self, state_dict: dict) -> None:
4545
"""clean_memory_fragments"""
46-
if current_platform.is_cuda() or current_platform.is_maca():
46+
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
4747
if state_dict:
4848
for k, v in state_dict.items():
4949
if isinstance(v, paddle.Tensor):

fastdeploy/model_executor/model_loader/default_loader_v1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def download_model(self, model_config: ModelConfig) -> None:
4949

5050
def clean_memory_fragments(self) -> None:
5151
"""clean_memory_fragments"""
52-
if current_platform.is_cuda() or current_platform.is_maca():
52+
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
5353
paddle.device.empty_cache()
5454
paddle.device.synchronize()
5555

fastdeploy/worker/iluvatar_model_runner.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from functools import partial
1818

1919
import paddle
20+
from paddleformers.utils.log import logger
2021

2122
from fastdeploy import envs
2223
from fastdeploy.config import FDConfig
@@ -66,10 +67,17 @@ def __init__(
6667
not self.cache_config.enable_chunked_prefill
6768
), "Iluvatar does not support chunked prefill for VL model"
6869

70+
if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
71+
# ernie-vl does not support cuda graph for tp > 1
72+
logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
73+
self.use_cudagraph = False
74+
6975
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
70-
assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4"
76+
# Iluvatar does not support cuda graph for weight_only_int4 yet
77+
logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
78+
self.use_cudagraph = False
7179

72-
print(f"self.use_cudagraph={self.use_cudagraph}")
80+
logger.info(f"self.use_cudagraph={self.use_cudagraph}")
7381
# VL neox style = True
7482
emb_shape = self.share_inputs["rope_emb"].shape
7583
if emb_shape[-1] == self.model_config.head_dim // 2:

scripts/run_ci_iluvatar.sh

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ function check_server_status() {
187187
echo -e "\n"
188188
}
189189

190-
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ==========="
190+
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
191191
clear_message
192192
echo "Start server..."
193193
python -m fastdeploy.entrypoints.openai.api_server \
@@ -233,6 +233,52 @@ fi
233233
# fi
234234
echo -e "\nPASSED"
235235

236+
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
237+
clear_message
238+
echo "Start server..."
239+
python -m fastdeploy.entrypoints.openai.api_server \
240+
--model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
241+
--port 8180 \
242+
--tensor-parallel-size 2 \
243+
--quantization wint8 \
244+
--max-model-len 32768 \
245+
--max-num-seqs 8 \
246+
--block-size 16 \
247+
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
248+
249+
check_server_status
250+
251+
echo "Start inference..."
252+
cp ${CI_PATH}/test.jsonl ./
253+
python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
254+
255+
exit_code=$?
256+
echo -e "\nexit_code is ${exit_code}"
257+
258+
echo -e "\nStop server..."
259+
stop_processes
260+
echo -e "\nStop server done."
261+
262+
if [ ${exit_code} -ne 0 ]; then
263+
print_error_message
264+
exit 1
265+
fi
266+
267+
# Read the accuracy field (not latency) from result.jsonl so that the
# threshold check below compares the benchmark accuracy against
# expected_lowerest_acc.
acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
268+
latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
269+
expected_lowerest_acc=0.8
270+
expected_largest_latency=60
271+
if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
272+
echo -e "\nExit with Accucary error, current accuracy $acc less than $expected_lowerest_acc "
273+
exit 1
274+
fi
275+
276+
# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
277+
# echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
278+
# exit 1
279+
# fi
280+
echo -e "\nPASSED"
281+
236282
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
237283
clear_message
238284
echo "Start server..."

0 commit comments

Comments
 (0)