Skip to content

Commit 7f8ce7d

Browse files
authored
[Optimization] add warmup for _sample_from_probs (#7956)
1 parent 3711364 commit 7f8ce7d

3 files changed

Lines changed: 5 additions & 0 deletions

File tree

fastdeploy/model_executor/layers/sample/meta_data.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,5 @@ class SamplingMetadata:
6767
# Add for HPU post-processing
6868
seq_lens_encoder: Optional[paddle.Tensor] = None
6969
seq_lens_decoder: Optional[paddle.Tensor] = None
70+
# True during dummy or profile runs, so the sampler can tell them apart from normal runs
71+
is_dummy_or_profile_run: bool = False

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se
221221
if need_top_k_sampling:
222222
probs = dispatch_top_k_renorm_probs(probs, top_k)
223223
next_tokens = _random_sample(probs, topp_seed=topp_seed)
224+
if sampling_metadata.is_dummy_or_profile_run: # warmup top_p != 1.0 path
225+
_, next_tokens = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed)
224226
else:
225227
_, next_tokens = top_k_top_p_sampling(
226228
probs,

fastdeploy/worker/gpu_model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,7 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p
13831383
top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"],
13841384
logits_processors=self.share_inputs["logits_processors"],
13851385
share_inputs=self.share_inputs,
1386+
is_dummy_or_profile_run=is_dummy_or_profile_run,
13861387
)
13871388
return token_num, token_num_event
13881389

0 commit comments

Comments
 (0)