From 35869030404beb51896ee1ef81070cb8cf158062 Mon Sep 17 00:00:00 2001 From: Song Yuxing <34940506+rain7996@users.noreply.github.com> Date: Wed, 3 Jun 2026 20:00:25 +0800 Subject: [PATCH] Revert "[Feature] Support computing entropy with fastdeploy runner (#7954)" This reverts commit 529ec9ef60fa7effb5bd5ad54d180952eeaa97a4. --- fastdeploy/model_executor/entropy_utils.py | 168 ++---------------- .../model_executor/pre_and_post_process.py | 4 - fastdeploy/worker/gpu_model_runner.py | 3 +- .../test_entropy_utils_fd_runner_mtp.py | 167 ----------------- 4 files changed, 11 insertions(+), 331 deletions(-) delete mode 100644 tests/model_executor/test_entropy_utils_fd_runner_mtp.py diff --git a/fastdeploy/model_executor/entropy_utils.py b/fastdeploy/model_executor/entropy_utils.py index d61e4afb486..21d1b3421e9 100644 --- a/fastdeploy/model_executor/entropy_utils.py +++ b/fastdeploy/model_executor/entropy_utils.py @@ -14,14 +14,13 @@ # limitations under the License. """ -import os - import paddle from fastdeploy.utils import data_processor_logger def get_entropy(logits): + # Check for -inf values in logits if paddle.any(paddle.isinf(logits) & (logits < 0)): data_processor_logger.debug("Detected -inf values in logits, clipping to minimum value") logits = paddle.clip(logits, min=1e-9) @@ -33,36 +32,7 @@ def get_entropy(logits): return paddle.sum(p0 * (paddle.log(z0) - a0), axis=-1) -def _log_entropy(share_inputs, i): - elist = share_inputs["entropy_list"][i] - data_processor_logger.info( - f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(elist)/len(elist)}, steps: {len(elist)}, all_values: {elist}" - ) - share_inputs["entropy_list"][i] = [] - - -# ============================================================================== -# ernie5_model_runner path (original logic from commit 361a310) -# ============================================================================== - - def calculate_logits_entropy(logits, share_inputs, temperature): - use_fd_runner = os.environ.get("EB5_ENABLE_FD_RUNNER", "0") == "1" - if use_fd_runner: - calculate_logits_entropy_fd(logits, share_inputs, temperature) - else: - calculate_logits_entropy_ori(logits, share_inputs, temperature) - - -def speculate_calculate_logits_entropy(logits, share_inputs, temperature): - use_fd_runner = os.environ.get("EB5_ENABLE_FD_RUNNER", "0") == "1" - if use_fd_runner: - speculate_calculate_logits_entropy_fd(logits, share_inputs, temperature) - else: - speculate_calculate_logits_entropy_ori(logits, share_inputs, temperature) - - -def calculate_logits_entropy_ori(logits, share_inputs, temperature): real_bsz = share_inputs["seq_lens_this_time"].shape[0] real_seq_lens = paddle.where( share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0, @@ -87,10 +57,13 @@ def calculate_logits_entropy_ori(logits, share_inputs, temperature): and share_inputs["seq_lens_decoder"][i] != 0 and len(share_inputs["entropy_list"][i]) != 0 ): - _log_entropy(share_inputs, i) + data_processor_logger.info( + f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(share_inputs['entropy_list'][i])/len(share_inputs['entropy_list'][i])}" + ) + share_inputs["entropy_list"][i] = [] -def speculate_calculate_logits_entropy_ori(logits, share_inputs, temperature): +def speculate_calculate_logits_entropy(logits, share_inputs, temperature): # get accepted logits real_bsz = share_inputs["seq_lens_this_time"].shape[0] total_accepted_num = paddle.sum(share_inputs["accept_num"]) @@ -127,128 +100,7 @@ def speculate_calculate_logits_entropy_ori(logits, share_inputs, temperature): and share_inputs["seq_lens_decoder"][i] != 0 and len(share_inputs["entropy_list"][i]) != 0 ): - _log_entropy(share_inputs, i) - - -# ============================================================================== -# gpu_model_runner (FD runner) path -# ============================================================================== - - -def calculate_logits_entropy_fd(logits, share_inputs, temperature): - real_bsz = share_inputs["seq_lens_this_time"].shape[0] - seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz] - seq_lens_this_time = share_inputs["seq_lens_this_time"] - if seq_lens_encoder.ndim == 2: - seq_lens_encoder = seq_lens_encoder.squeeze(1) - if seq_lens_this_time.ndim == 2: - seq_lens_this_time = seq_lens_this_time.squeeze(1) - real_seq_lens = paddle.where( - seq_lens_encoder != 0, - paddle.ones([1], dtype="int32"), - seq_lens_this_time, - ) - - for i in range(real_bsz): - if int(real_seq_lens[i]) == 0: - continue - t = temperature[i] - if t > 0 and t != 1.0: - logits[i] = logits[i].scale_(1 / t) - - entropy_tensor = get_entropy(logits[:real_bsz]) - - for i in range(real_bsz): - if int(real_seq_lens[i]) == 0: - continue - entropy_val = float(entropy_tensor[i]) - share_inputs["entropy_list"][i].append(entropy_val) - if ( - share_inputs["stop_flags"][i] - and share_inputs["seq_lens_decoder"][i] != 0 - and len(share_inputs["entropy_list"][i]) != 0 - ): - _log_entropy(share_inputs, i) - - -def speculate_calculate_logits_entropy_fd(logits, share_inputs, temperature): - real_bsz = share_inputs["seq_lens_this_time"].shape[0] - total_accepted_num = int(paddle.sum(share_inputs["accept_num"][:real_bsz])) - - if total_accepted_num == 0: - for i in range(real_bsz): - if ( - share_inputs["stop_flags"][i] - and share_inputs["seq_lens_decoder"][i] != 0 - and len(share_inputs["entropy_list"][i]) != 0 - ): - _log_entropy(share_inputs, i) - return - - seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz] - seq_lens_this_time = share_inputs["seq_lens_this_time"] - if seq_lens_encoder.ndim == 2: - seq_lens_encoder = seq_lens_encoder.squeeze(1) - if seq_lens_this_time.ndim == 2: - seq_lens_this_time = seq_lens_this_time.squeeze(1) - real_seq_lens = paddle.where( - seq_lens_encoder != 0, - paddle.ones([1], dtype="int32"), - seq_lens_this_time, - ) - seq_start_idx = paddle.concat([paddle.zeros([1], dtype="int32"), paddle.cumsum(real_seq_lens, dtype="int32")]) - repeated_starts = paddle.repeat_interleave(seq_start_idx[:-1], share_inputs["accept_num"][:real_bsz]) - offsets = paddle.concat([paddle.arange(share_inputs["accept_num"][i].item()) for i in range(real_bsz)]).astype( - "int32" - ) - accepted_idx = repeated_starts + offsets - - accepted_logits = paddle.index_select(logits, accepted_idx, axis=0) - - batch_indices = paddle.arange(real_bsz, dtype="int32") - batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"][:real_bsz]) - temp_per_token = temperature[batch_id_per_token].flatten() - scale = paddle.where( - temp_per_token > 0, - 1.0 / temp_per_token, - paddle.ones_like(temp_per_token), - ) - accepted_logits = accepted_logits * scale.unsqueeze(-1) - - entropy_tensor = get_entropy(accepted_logits) - entropy = entropy_tensor.tolist() - - idx = 0 - for i in range(real_bsz): - accept_count = int(share_inputs["accept_num"][i]) - if accept_count > 0: - for _ in range(accept_count): - e_val = entropy[idx] - share_inputs["entropy_list"][i].append(e_val) - idx += 1 - if ( - share_inputs["stop_flags"][i] - and share_inputs["seq_lens_decoder"][i] != 0 - and len(share_inputs["entropy_list"][i]) != 0 - ): - _log_entropy(share_inputs, i) - - -# ============================================================================== -# Common utility -# ============================================================================== - - -def flush_entropy_on_stop(share_inputs): - """ - Flush entropy for requests whose stop_flags became True after entropy calculation. - Called after unified_update_model_status which sets stop_flags for max_dec_len. - """ - real_bsz = share_inputs["seq_lens_this_time"].shape[0] - for i in range(real_bsz): - if ( - share_inputs["stop_flags"][i] - and share_inputs["seq_lens_decoder"][i] != 0 - and len(share_inputs["entropy_list"][i]) != 0 - ): - _log_entropy(share_inputs, i) + data_processor_logger.info( + f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(share_inputs['entropy_list'][i])/len(share_inputs['entropy_list'][i])}" + ) + share_inputs["entropy_list"][i] = [] diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 10675b3f36f..2cd085e37c3 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -107,7 +107,6 @@ from fastdeploy.model_executor.entropy_utils import ( calculate_logits_entropy, - flush_entropy_on_stop, speculate_calculate_logits_entropy, ) from fastdeploy.model_executor.layers.moe.routing_indices_cache import ( @@ -521,9 +520,6 @@ def post_process_speculate( model_output.max_dec_len, # max_dec_len ) - if enable_entropy: - flush_entropy_on_stop(share_inputs) - def save_output_speculate( sampler_output: SamplerOutput, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 4bcdff61e8b..b787ee54505 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1254,7 +1254,7 @@ def _dummy_prefill_inputs(self, input_length_list: List[int], max_dec_len_list: self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( idx * block_num, (idx + 1) * block_num, 1 ) - self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:batch_size] + self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"] def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_profile_run=False) -> None: """Prepare the model inputs""" @@ -2416,7 +2416,6 @@ def execute_model_overlap( model_inputs, p_done_idxs, token_num_event = self._preprocess( model_forward_batch, num_running_requests, self._cached_launch_token_num, self._cached_real_bsz ) - model_output = self._execute(model_inputs) # save output (last batch) if self._cached_model_output_data is not None: diff --git a/tests/model_executor/test_entropy_utils_fd_runner_mtp.py b/tests/model_executor/test_entropy_utils_fd_runner_mtp.py deleted file mode 100644 index 9c9685d48f4..00000000000 --- a/tests/model_executor/test_entropy_utils_fd_runner_mtp.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Unit tests for entropy_utils FD runner path. - -Run: python tests/model_executor/test_entropy_utils_fd_runner_mtp.py -""" - -import math -import unittest - -import paddle - -from fastdeploy.model_executor.entropy_utils import ( - calculate_logits_entropy_fd, - flush_entropy_on_stop, - get_entropy, - speculate_calculate_logits_entropy_fd, -) - - -def _make_share_inputs( - bsz, seq_lens_this_time, seq_lens_encoder, seq_lens_decoder, stop_flags, req_ids, accept_num=None -): - d = { - "seq_lens_this_time": paddle.to_tensor(seq_lens_this_time, dtype="int32"), - "seq_lens_encoder": paddle.to_tensor(seq_lens_encoder, dtype="int32"), - "seq_lens_decoder": paddle.to_tensor(seq_lens_decoder, dtype="int32"), - "entropy_list": [[] for _ in range(bsz)], - "stop_flags": paddle.to_tensor(stop_flags, dtype="bool"), - "req_ids": req_ids, - } - if accept_num is not None: - d["accept_num"] = paddle.to_tensor(accept_num, dtype="int32") - return d - - -class TestGetEntropy(unittest.TestCase): - def test_uniform_and_deterministic(self): - logits = paddle.zeros([1, 4], dtype="float32") - self.assertAlmostEqual(float(get_entropy(logits)[0]), math.log(4), places=5) - - logits = paddle.to_tensor([[100.0, 0.0, 0.0, 0.0]], dtype="float32") - self.assertAlmostEqual(float(get_entropy(logits)[0]), 0.0, places=5) - - def test_negative_inf(self): - logits = paddle.to_tensor([[10.0, -float("inf"), -float("inf")]], dtype="float32") - self.assertGreaterEqual(float(get_entropy(logits)[0]), 0.0) - - -class TestNonMTP(unittest.TestCase): - def test_accumulation_and_skip_zero(self): - si = _make_share_inputs(3, [1, 1, 0], [0, 0, 0], [10, 10, 0], [False, False, False], ["a", "b", "c"]) - logits = paddle.to_tensor([[10.0, 1.0, 1.0], [1.0, 1.0, 10.0], [5.0, 5.0, 5.0]], dtype="float32") - calculate_logits_entropy_fd(logits, si, paddle.ones([3], dtype="float32")) - - self.assertEqual(len(si["entropy_list"][0]), 1) - self.assertEqual(len(si["entropy_list"][1]), 1) - self.assertEqual(len(si["entropy_list"][2]), 0) - # [10,1,1] and [1,1,10] symmetric => same entropy - self.assertAlmostEqual(si["entropy_list"][0][0], si["entropy_list"][1][0], places=6) - - def test_stop_flags_and_temperature(self): - # stop_flags clears entropy - si = _make_share_inputs(2, [1, 1], [0, 0], [10, 10], [True, False], ["a", "b"]) - logits = paddle.to_tensor([[10.0, 1.0, 1.0], [10.0, 1.0, 1.0]], dtype="float32") - calculate_logits_entropy_fd(logits, si, paddle.ones([2], dtype="float32")) - self.assertEqual(si["entropy_list"][0], []) - self.assertEqual(len(si["entropy_list"][1]), 1) - - # temperature scaling: lower temp => lower entropy - si = _make_share_inputs(2, [1, 1], [0, 0], [10, 10], [False, False], ["a", "b"]) - logits = paddle.to_tensor([[10.0, 1.0, 1.0], [10.0, 1.0, 1.0]], dtype="float32") - calculate_logits_entropy_fd(logits, si, paddle.to_tensor([1.0, 0.5], dtype="float32")) - self.assertGreater(si["entropy_list"][0][0], si["entropy_list"][1][0]) - - -class TestMTP(unittest.TestCase): - def test_accepted_idx_and_partial(self): - """accept_num=[2, 0, 1, 2], verifies correct row extraction and per-slot counts.""" - si = _make_share_inputs( - 4, - [2, 2, 2, 2], - [0, 0, 0, 0], - [10, 10, 10, 10], - [False, False, False, False], - ["a", "b", "c", "d"], - [2, 0, 1, 2], - ) - logits = paddle.to_tensor( - [ - [10.0, 1.0, 1.0], - [1.0, 10.0, 1.0], # slot 0 (both accepted) - [5.0, 5.0, 5.0], - [5.0, 5.0, 5.0], # slot 1 (none accepted) - [1.0, 1.0, 10.0], - [5.0, 5.0, 5.0], # slot 2 (first accepted) - [10.0, 10.0, 1.0], - [1.0, 10.0, 10.0], # slot 3 (both accepted) - ], - dtype="float32", - ) - speculate_calculate_logits_entropy_fd(logits, si, paddle.ones([4], dtype="float32")) - - self.assertEqual(len(si["entropy_list"][0]), 2) - self.assertEqual(len(si["entropy_list"][1]), 0) - self.assertEqual(len(si["entropy_list"][2]), 1) - self.assertEqual(len(si["entropy_list"][3]), 2) - # [10,1,1] [1,10,1] [1,1,10] all symmetric => same entropy - self.assertAlmostEqual(si["entropy_list"][0][0], si["entropy_list"][0][1], places=6) - self.assertAlmostEqual(si["entropy_list"][0][0], si["entropy_list"][2][0], places=6) - - def test_zero_accepted_flushes_stop(self): - si = _make_share_inputs(2, [1, 1], [0, 0], [10, 10], [True, False], ["a", "b"], [0, 0]) - si["entropy_list"][0] = [1.0, 2.0, 3.0] - si["entropy_list"][1] = [4.0, 5.0] - speculate_calculate_logits_entropy_fd( - paddle.zeros([2, 3], dtype="float32"), si, paddle.ones([2], dtype="float32") - ) - self.assertEqual(si["entropy_list"][0], []) - self.assertEqual(si["entropy_list"][1], [4.0, 5.0]) - - def test_multi_step_and_stop(self): - si = _make_share_inputs(2, [2, 2], [0, 0], [10, 10], [False, False], ["a", "b"], [2, 1]) - logits = paddle.to_tensor( - [[10.0, 1.0, 1.0], [1.0, 10.0, 1.0], [1.0, 1.0, 10.0], [5.0, 5.0, 5.0]], dtype="float32" - ) - speculate_calculate_logits_entropy_fd(logits, si, paddle.ones([2], dtype="float32")) - self.assertEqual(len(si["entropy_list"][0]), 2) - self.assertEqual(len(si["entropy_list"][1]), 1) - - # Step 2: slot 1 stops - si["accept_num"] = paddle.to_tensor([1, 1], dtype="int32") - si["stop_flags"] = paddle.to_tensor([False, True], dtype="bool") - logits2 = paddle.to_tensor( - [[10.0, 1.0, 1.0], [5.0, 5.0, 5.0], [10.0, 1.0, 1.0], [5.0, 5.0, 5.0]], dtype="float32" - ) - speculate_calculate_logits_entropy_fd(logits2, si, paddle.ones([2], dtype="float32")) - self.assertEqual(len(si["entropy_list"][0]), 3) - self.assertEqual(si["entropy_list"][1], []) - - -class TestFlushEntropyOnStop(unittest.TestCase): - def test_flush(self): - si = _make_share_inputs(3, [1, 1, 1], [0, 0, 0], [10, 10, 10], [True, False, True], ["a", "b", "c"]) - si["entropy_list"][0] = [1.0, 2.0] - si["entropy_list"][2] = [3.0] - flush_entropy_on_stop(si) - self.assertEqual(si["entropy_list"][0], []) - self.assertEqual(si["entropy_list"][1], []) - self.assertEqual(si["entropy_list"][2], []) - - -if __name__ == "__main__": - unittest.main()