Skip to content

Commit 438c9f7

Browse files
[BugFix] 0 not into cuda graph to save memory (PaddlePaddle#5426)
1 parent d1bd40d commit 438c9f7

3 files changed

Lines changed: 11 additions & 12 deletions

File tree

fastdeploy/config.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1577,9 +1577,6 @@ def __init__(
15771577
self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
15781578
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
15791579

1580-
if self.parallel_config.use_ep:
1581-
self.graph_opt_config.cudagraph_capture_sizes += [0]
1582-
15831580
self.tokenizer = tokenizer
15841581
self.ips = ips
15851582
self.tool_parser = tool_parser

fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def register_graph_buffers(self):
207207

208208
def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
209209
"""The main allreduce API that provides support for cuda graph."""
210+
211+
if input.shape[0] == 0:
212+
return input
213+
210214
if self.capturing:
211215
lib = cuda_wrapper.CudaRTLibrary()
212216
stream = paddle.device.current_stream()

fastdeploy/worker/gpu_model_runner.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,14 +1020,10 @@ def get_input_length_list(
10201020
"""
10211021
# NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
10221022
max_dec_len = expected_decode_len + 1
1023-
if batch_size == 0:
1024-
# Note(ZKK): dividing by 0 is invalid, so we use input_length = 1 here
1025-
input_length = 1
1026-
else:
1027-
input_length = min(
1028-
num_tokens // (1 if capture_prefill else batch_size),
1029-
self.model_config.max_model_len - max_dec_len,
1030-
)
1023+
input_length = min(
1024+
num_tokens // (1 if capture_prefill else batch_size),
1025+
self.model_config.max_model_len - max_dec_len,
1026+
)
10311027

10321028
# NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer is not big enough, which causes NaN values to appear in the result.
10331029
# TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
@@ -1490,7 +1486,9 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
14901486

14911487
# Once capturing both prefill-only and decode-only is supported, this will use [only_prefill_use_cudagraph or only_decode_use_cudagraph]
14921488
self.forward_meta.step_use_cudagraph = (
1493-
only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph
1489+
only_prefill_use_cudagraph
1490+
if self.cudagraph_only_prefill
1491+
else only_decode_use_cudagraph and self.forward_meta.ids_remove_padding.shape[0] > 0
14941492
)
14951493

14961494
# Set forward_meta.is_dummy_or_profile_run to True to skip init_kv_signal_per_query for attention backends

0 commit comments

Comments
 (0)