From 35869030404beb51896ee1ef81070cb8cf158062 Mon Sep 17 00:00:00 2001
From: Song Yuxing <34940506+rain7996@users.noreply.github.com>
Date: Wed, 3 Jun 2026 20:00:25 +0800
Subject: [PATCH] Revert "[Feature] Support computing entropy with fastdeploy
 runner (#7954)"

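This reverts commit 529ec9ef60fa7effb5bd5ad54d180952eeaa97a4.

Summary of what the revert undoes:
- entropy_utils.py: drop the EB5_ENABLE_FD_RUNNER dispatch, the
  *_fd entropy functions, flush_entropy_on_stop and the _log_entropy
  helper; the *_ori functions get their original names back and the
  log line reports only the mean entropy again.
- pre_and_post_process.py: post_process_speculate no longer calls
  flush_entropy_on_stop.
- gpu_model_runner.py: _dummy_prefill_inputs assigns the whole
  seq_lens_this_time_buffer instead of the [:batch_size] slice.
- tests: delete test_entropy_utils_fd_runner_mtp.py.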
---
 fastdeploy/model_executor/entropy_utils.py    | 168 ++----------------
 .../model_executor/pre_and_post_process.py    |   4 -
 fastdeploy/worker/gpu_model_runner.py         |   3 +-
 .../test_entropy_utils_fd_runner_mtp.py       | 167 -----------------
 4 files changed, 11 insertions(+), 331 deletions(-)
 delete mode 100644 tests/model_executor/test_entropy_utils_fd_runner_mtp.py

diff --git a/fastdeploy/model_executor/entropy_utils.py b/fastdeploy/model_executor/entropy_utils.py
index d61e4afb486..21d1b3421e9 100644
--- a/fastdeploy/model_executor/entropy_utils.py
+++ b/fastdeploy/model_executor/entropy_utils.py
@@ -14,14 +14,13 @@
 # limitations under the License.
 """
 
-import os
-
 import paddle
 
 from fastdeploy.utils import data_processor_logger
 
 
 def get_entropy(logits):
+    # Check for -inf values in logits
     if paddle.any(paddle.isinf(logits) & (logits < 0)):
         data_processor_logger.debug("Detected -inf values in logits, clipping to minimum value")
         logits = paddle.clip(logits, min=1e-9)
@@ -33,36 +32,7 @@ def get_entropy(logits):
     return paddle.sum(p0 * (paddle.log(z0) - a0), axis=-1)
 
 
-def _log_entropy(share_inputs, i):
-    elist = share_inputs["entropy_list"][i]
-    data_processor_logger.info(
-        f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(elist)/len(elist)}, steps: {len(elist)}, all_values: {elist}"
-    )
-    share_inputs["entropy_list"][i] = []
-
-
-# ==============================================================================
-# ernie5_model_runner path (original logic from commit 361a310)
-# ==============================================================================
-
-
 def calculate_logits_entropy(logits, share_inputs, temperature):
-    use_fd_runner = os.environ.get("EB5_ENABLE_FD_RUNNER", "0") == "1"
-    if use_fd_runner:
-        calculate_logits_entropy_fd(logits, share_inputs, temperature)
-    else:
-        calculate_logits_entropy_ori(logits, share_inputs, temperature)
-
-
-def speculate_calculate_logits_entropy(logits, share_inputs, temperature):
-    use_fd_runner = os.environ.get("EB5_ENABLE_FD_RUNNER", "0") == "1"
-    if use_fd_runner:
-        speculate_calculate_logits_entropy_fd(logits, share_inputs, temperature)
-    else:
-        speculate_calculate_logits_entropy_ori(logits, share_inputs, temperature)
-
-
-def calculate_logits_entropy_ori(logits, share_inputs, temperature):
     real_bsz = share_inputs["seq_lens_this_time"].shape[0]
     real_seq_lens = paddle.where(
         share_inputs["seq_lens_encoder"][:real_bsz].squeeze(1) != 0,
@@ -87,10 +57,13 @@ def calculate_logits_entropy_ori(logits, share_inputs, temperature):
             and share_inputs["seq_lens_decoder"][i] != 0
             and len(share_inputs["entropy_list"][i]) != 0
         ):
-            _log_entropy(share_inputs, i)
+            data_processor_logger.info(
+                f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(share_inputs['entropy_list'][i])/len(share_inputs['entropy_list'][i])}"
+            )
+            share_inputs["entropy_list"][i] = []
 
 
-def speculate_calculate_logits_entropy_ori(logits, share_inputs, temperature):
+def speculate_calculate_logits_entropy(logits, share_inputs, temperature):
     # get accepted logits
     real_bsz = share_inputs["seq_lens_this_time"].shape[0]
     total_accepted_num = paddle.sum(share_inputs["accept_num"])
@@ -127,128 +100,7 @@ def speculate_calculate_logits_entropy_ori(logits, share_inputs, temperature):
             and share_inputs["seq_lens_decoder"][i] != 0
             and len(share_inputs["entropy_list"][i]) != 0
         ):
-            _log_entropy(share_inputs, i)
-
-
-# ==============================================================================
-# gpu_model_runner (FD runner) path
-# ==============================================================================
-
-
-def calculate_logits_entropy_fd(logits, share_inputs, temperature):
-    real_bsz = share_inputs["seq_lens_this_time"].shape[0]
-    seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz]
-    seq_lens_this_time = share_inputs["seq_lens_this_time"]
-    if seq_lens_encoder.ndim == 2:
-        seq_lens_encoder = seq_lens_encoder.squeeze(1)
-    if seq_lens_this_time.ndim == 2:
-        seq_lens_this_time = seq_lens_this_time.squeeze(1)
-    real_seq_lens = paddle.where(
-        seq_lens_encoder != 0,
-        paddle.ones([1], dtype="int32"),
-        seq_lens_this_time,
-    )
-
-    for i in range(real_bsz):
-        if int(real_seq_lens[i]) == 0:
-            continue
-        t = temperature[i]
-        if t > 0 and t != 1.0:
-            logits[i] = logits[i].scale_(1 / t)
-
-    entropy_tensor = get_entropy(logits[:real_bsz])
-
-    for i in range(real_bsz):
-        if int(real_seq_lens[i]) == 0:
-            continue
-        entropy_val = float(entropy_tensor[i])
-        share_inputs["entropy_list"][i].append(entropy_val)
-        if (
-            share_inputs["stop_flags"][i]
-            and share_inputs["seq_lens_decoder"][i] != 0
-            and len(share_inputs["entropy_list"][i]) != 0
-        ):
-            _log_entropy(share_inputs, i)
-
-
-def speculate_calculate_logits_entropy_fd(logits, share_inputs, temperature):
-    real_bsz = share_inputs["seq_lens_this_time"].shape[0]
-    total_accepted_num = int(paddle.sum(share_inputs["accept_num"][:real_bsz]))
-
-    if total_accepted_num == 0:
-        for i in range(real_bsz):
-            if (
-                share_inputs["stop_flags"][i]
-                and share_inputs["seq_lens_decoder"][i] != 0
-                and len(share_inputs["entropy_list"][i]) != 0
-            ):
-                _log_entropy(share_inputs, i)
-        return
-
-    seq_lens_encoder = share_inputs["seq_lens_encoder"][:real_bsz]
-    seq_lens_this_time = share_inputs["seq_lens_this_time"]
-    if seq_lens_encoder.ndim == 2:
-        seq_lens_encoder = seq_lens_encoder.squeeze(1)
-    if seq_lens_this_time.ndim == 2:
-        seq_lens_this_time = seq_lens_this_time.squeeze(1)
-    real_seq_lens = paddle.where(
-        seq_lens_encoder != 0,
-        paddle.ones([1], dtype="int32"),
-        seq_lens_this_time,
-    )
-    seq_start_idx = paddle.concat([paddle.zeros([1], dtype="int32"), paddle.cumsum(real_seq_lens, dtype="int32")])
-    repeated_starts = paddle.repeat_interleave(seq_start_idx[:-1], share_inputs["accept_num"][:real_bsz])
-    offsets = paddle.concat([paddle.arange(share_inputs["accept_num"][i].item()) for i in range(real_bsz)]).astype(
-        "int32"
-    )
-    accepted_idx = repeated_starts + offsets
-
-    accepted_logits = paddle.index_select(logits, accepted_idx, axis=0)
-
-    batch_indices = paddle.arange(real_bsz, dtype="int32")
-    batch_id_per_token = paddle.repeat_interleave(batch_indices, share_inputs["accept_num"][:real_bsz])
-    temp_per_token = temperature[batch_id_per_token].flatten()
-    scale = paddle.where(
-        temp_per_token > 0,
-        1.0 / temp_per_token,
-        paddle.ones_like(temp_per_token),
-    )
-    accepted_logits = accepted_logits * scale.unsqueeze(-1)
-
-    entropy_tensor = get_entropy(accepted_logits)
-    entropy = entropy_tensor.tolist()
-
-    idx = 0
-    for i in range(real_bsz):
-        accept_count = int(share_inputs["accept_num"][i])
-        if accept_count > 0:
-            for _ in range(accept_count):
-                e_val = entropy[idx]
-                share_inputs["entropy_list"][i].append(e_val)
-                idx += 1
-        if (
-            share_inputs["stop_flags"][i]
-            and share_inputs["seq_lens_decoder"][i] != 0
-            and len(share_inputs["entropy_list"][i]) != 0
-        ):
-            _log_entropy(share_inputs, i)
-
-
-# ==============================================================================
-# Common utility
-# ==============================================================================
-
-
-def flush_entropy_on_stop(share_inputs):
-    """
-    Flush entropy for requests whose stop_flags became True after entropy calculation.
-    Called after unified_update_model_status which sets stop_flags for max_dec_len.
-    """
-    real_bsz = share_inputs["seq_lens_this_time"].shape[0]
-    for i in range(real_bsz):
-        if (
-            share_inputs["stop_flags"][i]
-            and share_inputs["seq_lens_decoder"][i] != 0
-            and len(share_inputs["entropy_list"][i]) != 0
-        ):
-            _log_entropy(share_inputs, i)
+            data_processor_logger.info(
+                f"req_id: {share_inputs['req_ids'][i]}, entropy: {sum(share_inputs['entropy_list'][i])/len(share_inputs['entropy_list'][i])}"
+            )
+            share_inputs["entropy_list"][i] = []
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index 10675b3f36f..2cd085e37c3 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -107,7 +107,6 @@
 
 from fastdeploy.model_executor.entropy_utils import (
     calculate_logits_entropy,
-    flush_entropy_on_stop,
     speculate_calculate_logits_entropy,
 )
 from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
@@ -521,9 +520,6 @@ def post_process_speculate(
         model_output.max_dec_len,  # max_dec_len
     )
 
-    if enable_entropy:
-        flush_entropy_on_stop(share_inputs)
-
 
 def save_output_speculate(
     sampler_output: SamplerOutput,
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 4bcdff61e8b..b787ee54505 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1254,7 +1254,7 @@ def _dummy_prefill_inputs(self, input_length_list: List[int], max_dec_len_list:
             self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
                 idx * block_num, (idx + 1) * block_num, 1
             )
-        self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:batch_size]
+        self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"]
 
     def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_profile_run=False) -> None:
         """Prepare the model inputs"""
@@ -2416,7 +2416,6 @@ def execute_model_overlap(
         model_inputs, p_done_idxs, token_num_event = self._preprocess(
             model_forward_batch, num_running_requests, self._cached_launch_token_num, self._cached_real_bsz
         )
-
         model_output = self._execute(model_inputs)
         # save output (last batch)
         if self._cached_model_output_data is not None:
diff --git a/tests/model_executor/test_entropy_utils_fd_runner_mtp.py b/tests/model_executor/test_entropy_utils_fd_runner_mtp.py
deleted file mode 100644
index 9c9685d48f4..00000000000
--- a/tests/model_executor/test_entropy_utils_fd_runner_mtp.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Unit tests for entropy_utils FD runner path.
-
-Run:  python tests/model_executor/test_entropy_utils_fd_runner_mtp.py
-"""
-
-import math
-import unittest
-
-import paddle
-
-from fastdeploy.model_executor.entropy_utils import (
-    calculate_logits_entropy_fd,
-    flush_entropy_on_stop,
-    get_entropy,
-    speculate_calculate_logits_entropy_fd,
-)
-
-
-def _make_share_inputs(
-    bsz, seq_lens_this_time, seq_lens_encoder, seq_lens_decoder, stop_flags, req_ids, accept_num=None
-):
-    d = {
-        "seq_lens_this_time": paddle.to_tensor(seq_lens_this_time, dtype="int32"),
-        "seq_lens_encoder": paddle.to_tensor(seq_lens_encoder, dtype="int32"),
-        "seq_lens_decoder": paddle.to_tensor(seq_lens_decoder, dtype="int32"),
-        "entropy_list": [[] for _ in range(bsz)],
-        "stop_flags": paddle.to_tensor(stop_flags, dtype="bool"),
-        "req_ids": req_ids,
-    }
-    if accept_num is not None:
-        d["accept_num"] = paddle.to_tensor(accept_num, dtype="int32")
-    return d
-
-
-class TestGetEntropy(unittest.TestCase):
-    def test_uniform_and_deterministic(self):
-        logits = paddle.zeros([1, 4], dtype="float32")
-        self.assertAlmostEqual(float(get_entropy(logits)[0]), math.log(4), places=5)
-
-        logits = paddle.to_tensor([[100.0, 0.0, 0.0, 0.0]], dtype="float32")
-        self.assertAlmostEqual(float(get_entropy(logits)[0]), 0.0, places=5)
-
-    def test_negative_inf(self):
-        logits = paddle.to_tensor([[10.0, -float("inf"), -float("inf")]], dtype="float32")
-        self.assertGreaterEqual(float(get_entropy(logits)[0]), 0.0)
-
-
-class TestNonMTP(unittest.TestCase):
-    def test_accumulation_and_skip_zero(self):
-        si = _make_share_inputs(3, [1, 1, 0], [0, 0, 0], [10, 10, 0], [False, False, False], ["a", "b", "c"])
-        logits = paddle.to_tensor([[10.0, 1.0, 1.0], [1.0, 1.0, 10.0], [5.0, 5.0, 5.0]], dtype="float32")
-        calculate_logits_entropy_fd(logits, si, paddle.ones([3], dtype="float32"))
-
-        self.assertEqual(len(si["entropy_list"][0]), 1)
-        self.assertEqual(len(si["entropy_list"][1]), 1)
-        self.assertEqual(len(si["entropy_list"][2]), 0)
-        # [10,1,1] and [1,1,10] symmetric => same entropy
-        self.assertAlmostEqual(si["entropy_list"][0][0], si["entropy_list"][1][0], places=6)
-
-    def test_stop_flags_and_temperature(self):
-        # stop_flags clears entropy
-        si = _make_share_inputs(2, [1, 1], [0, 0], [10, 10], [True, False], ["a", "b"])
-        logits = paddle.to_tensor([[10.0, 1.0, 1.0], [10.0, 1.0, 1.0]], dtype="float32")
-        calculate_logits_entropy_fd(logits, si, paddle.ones([2], dtype="float32"))
-        self.assertEqual(si["entropy_list"][0], [])
-        self.assertEqual(len(si["entropy_list"][1]), 1)
-
-        # temperature scaling: lower temp => lower entropy
-        si = _make_share_inputs(2, [1, 1], [0, 0], [10, 10], [False, False], ["a", "b"])
-        logits = paddle.to_tensor([[10.0, 1.0, 1.0], [10.0, 1.0, 1.0]], dtype="float32")
-        calculate_logits_entropy_fd(logits, si, paddle.to_tensor([1.0, 0.5], dtype="float32"))
-        self.assertGreater(si["entropy_list"][0][0], si["entropy_list"][1][0])
-
-
-class TestMTP(unittest.TestCase):
-    def test_accepted_idx_and_partial(self):
-        """accept_num=[2, 0, 1, 2], verifies correct row extraction and per-slot counts."""
-        si = _make_share_inputs(
-            4,
-            [2, 2, 2, 2],
-            [0, 0, 0, 0],
-            [10, 10, 10, 10],
-            [False, False, False, False],
-            ["a", "b", "c", "d"],
-            [2, 0, 1, 2],
-        )
-        logits = paddle.to_tensor(
-            [
-                [10.0, 1.0, 1.0],
-                [1.0, 10.0, 1.0],  # slot 0 (both accepted)
-                [5.0, 5.0, 5.0],
-                [5.0, 5.0, 5.0],  # slot 1 (none accepted)
-                [1.0, 1.0, 10.0],
-                [5.0, 5.0, 5.0],  # slot 2 (first accepted)
-                [10.0, 10.0, 1.0],
-                [1.0, 10.0, 10.0],  # slot 3 (both accepted)
-            ],
-            dtype="float32",
-        )
-        speculate_calculate_logits_entropy_fd(logits, si, paddle.ones([4], dtype="float32"))
-
-        self.assertEqual(len(si["entropy_list"][0]), 2)
-        self.assertEqual(len(si["entropy_list"][1]), 0)
-        self.assertEqual(len(si["entropy_list"][2]), 1)
-        self.assertEqual(len(si["entropy_list"][3]), 2)
-        # [10,1,1] [1,10,1] [1,1,10] all symmetric => same entropy
-        self.assertAlmostEqual(si["entropy_list"][0][0], si["entropy_list"][0][1], places=6)
-        self.assertAlmostEqual(si["entropy_list"][0][0], si["entropy_list"][2][0], places=6)
-
-    def test_zero_accepted_flushes_stop(self):
-        si = _make_share_inputs(2, [1, 1], [0, 0], [10, 10], [True, False], ["a", "b"], [0, 0])
-        si["entropy_list"][0] = [1.0, 2.0, 3.0]
-        si["entropy_list"][1] = [4.0, 5.0]
-        speculate_calculate_logits_entropy_fd(
-            paddle.zeros([2, 3], dtype="float32"), si, paddle.ones([2], dtype="float32")
-        )
-        self.assertEqual(si["entropy_list"][0], [])
-        self.assertEqual(si["entropy_list"][1], [4.0, 5.0])
-
-    def test_multi_step_and_stop(self):
-        si = _make_share_inputs(2, [2, 2], [0, 0], [10, 10], [False, False], ["a", "b"], [2, 1])
-        logits = paddle.to_tensor(
-            [[10.0, 1.0, 1.0], [1.0, 10.0, 1.0], [1.0, 1.0, 10.0], [5.0, 5.0, 5.0]], dtype="float32"
-        )
-        speculate_calculate_logits_entropy_fd(logits, si, paddle.ones([2], dtype="float32"))
-        self.assertEqual(len(si["entropy_list"][0]), 2)
-        self.assertEqual(len(si["entropy_list"][1]), 1)
-
-        # Step 2: slot 1 stops
-        si["accept_num"] = paddle.to_tensor([1, 1], dtype="int32")
-        si["stop_flags"] = paddle.to_tensor([False, True], dtype="bool")
-        logits2 = paddle.to_tensor(
-            [[10.0, 1.0, 1.0], [5.0, 5.0, 5.0], [10.0, 1.0, 1.0], [5.0, 5.0, 5.0]], dtype="float32"
-        )
-        speculate_calculate_logits_entropy_fd(logits2, si, paddle.ones([2], dtype="float32"))
-        self.assertEqual(len(si["entropy_list"][0]), 3)
-        self.assertEqual(si["entropy_list"][1], [])
-
-
-class TestFlushEntropyOnStop(unittest.TestCase):
-    def test_flush(self):
-        si = _make_share_inputs(3, [1, 1, 1], [0, 0, 0], [10, 10, 10], [True, False, True], ["a", "b", "c"])
-        si["entropy_list"][0] = [1.0, 2.0]
-        si["entropy_list"][2] = [3.0]
-        flush_entropy_on_stop(si)
-        self.assertEqual(si["entropy_list"][0], [])
-        self.assertEqual(si["entropy_list"][1], [])
-        self.assertEqual(si["entropy_list"][2], [])
-
-
-if __name__ == "__main__":
-    unittest.main()