Add logits statistics computation (min/max/mean/std) to FastDeploy (FD)

liuruyan · liuruyan · commit 85e722a1f26e · 2026-03-26T14:50:45.000+08:00
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
@@ -2029,6 +2029,7 @@ def _start_worker_service(self):
             "use_internode_ll_two_stage": self.cfg.parallel_config.use_internode_ll_two_stage,
             "disable_sequence_parallel_moe": self.cfg.parallel_config.disable_sequence_parallel_moe,
             "enable_logprob": self.cfg.model_config.enable_logprob,
+            "compute_logits_stats": self.cfg.model_config.compute_logits_stats,
             "lm_head_fp32": self.cfg.model_config.lm_head_fp32,
             "enable_entropy": self.cfg.model_config.enable_entropy,
             "enable_overlap_schedule": self.cfg.scheduler_config.enable_overlap_schedule,
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -364,6 +364,7 @@ def __init__(self, fd_config: FDConfig = None, logprobs_mode: str = "raw_logprob
 
         self.guided_decoding = GuidedDecoding(fd_config)
         self.logprobs_mode = fd_config.model_config.logprobs_mode if fd_config is not None else logprobs_mode
+        self.compute_logits_stats = fd_config.model_config.compute_logits_stats if fd_config is not None else False
         # Can only be created when fd_config.early_stopper_config.enable_early_stop = True
         if (
             fd_config is not None
@@ -507,6 +508,19 @@ def forward_cuda(
             elif self.logprobs_mode == "raw_logits":
                 raw_logprobs = logits.clone()
 
+        # Compute logits statistics (min/max/mean/std) per sequence before penalties
+        logits_min = None
+        logits_max = None
+        logits_mean = None
+        logits_std = None
+        if num_logprobs is not None and self.compute_logits_stats:
+            with paddle.no_grad():
+                # logits shape: [batch_size, vocab_size], compute stats per sequence (reduce over vocab dimension)
+                logits_min = paddle.min(logits, axis=1)  # [batch_size]
+                logits_max = paddle.max(logits, axis=1)  # [batch_size]
+                logits_mean = paddle.mean(logits, axis=1)  # [batch_size]
+                logits_std = paddle.std(logits, axis=1)  # [batch_size]
+
         for proc in sampling_metadata.logits_processors or []:
             logits = proc.apply(logits)
 
@@ -546,6 +560,33 @@ def forward_cuda(
         logprobs_tensors = (
             None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
         )
+
+        # Pack logits stats into LogprobsTensors
+        if logprobs_tensors is not None and logits_min is not None:
+            if current_platform.is_cuda():
+                logits_min_cpu = paddle.empty_like(logits_min, device="cpu").pin_memory()
+                logits_max_cpu = paddle.empty_like(logits_max, device="cpu").pin_memory()
+                logits_mean_cpu = paddle.empty_like(logits_mean, device="cpu").pin_memory()
+                logits_std_cpu = paddle.empty_like(logits_std, device="cpu").pin_memory()
+                logits_min_cpu.copy_(logits_min, False)
+                logits_max_cpu.copy_(logits_max, False)
+                logits_mean_cpu.copy_(logits_mean, False)
+                logits_std_cpu.copy_(logits_std, False)
+            else:
+                logits_min_cpu = logits_min.cpu()
+                logits_max_cpu = logits_max.cpu()
+                logits_mean_cpu = logits_mean.cpu()
+                logits_std_cpu = logits_std.cpu()
+            logprobs_tensors = LogprobsTensors(
+                logprob_token_ids=logprobs_tensors.logprob_token_ids,
+                logprobs=logprobs_tensors.logprobs,
+                selected_token_ranks=logprobs_tensors.selected_token_ranks,
+                logits_min=logits_min_cpu,
+                logits_max=logits_max_cpu,
+                logits_mean=logits_mean_cpu,
+                logits_std=logits_std_cpu,
+            )
+
         if sampling_metadata.enable_early_stop:
             # will set the stop batch in stop_flags
             assert sampling_metadata.stop_flags is not None, "need stop_flags for early stop"
@@ -621,6 +662,7 @@ def __init__(self, fd_config: FDConfig):
         else:
             raise NotImplementedError
         self.logprobs_mode = fd_config.model_config.logprobs_mode
+        self.compute_logits_stats = fd_config.model_config.compute_logits_stats
         self.speculative_verify_window = fd_config.speculative_config.verify_window
         self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
         self.speculative_benchmark_mode = fd_config.speculative_config.benchmark_mode
@@ -872,12 +914,36 @@ def forward_cuda(
 
         logprobs_tensors = None
         if num_logprobs is not None:
+            # Compute logits statistics on target_logits for accepted tokens
+            logits_min = None
+            logits_max = None
+            logits_mean = None
+            logits_std = None
+            if self.compute_logits_stats:
+                with paddle.no_grad():
+                    logits_min = paddle.min(target_logits, axis=1)
+                    logits_max = paddle.max(target_logits, axis=1)
+                    logits_mean = paddle.mean(target_logits, axis=1)
+                    logits_std = paddle.std(target_logits, axis=1)
+
             token_ids = share_inputs["accept_tokens"]
             idx = paddle.arange(share_inputs["accept_tokens"].shape[1], dtype="int32")
             mask = idx < share_inputs["accept_num"].unsqueeze(1)
             token_ids = paddle.masked_select(share_inputs["accept_tokens"], mask)
             logprobs_tensors = self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=token_ids)
 
+            # Pack logits stats into LogprobsTensors
+            if logits_min is not None:
+                logprobs_tensors = LogprobsTensors(
+                    logprob_token_ids=logprobs_tensors.logprob_token_ids,
+                    logprobs=logprobs_tensors.logprobs,
+                    selected_token_ranks=logprobs_tensors.selected_token_ranks,
+                    logits_min=logits_min,
+                    logits_max=logits_max,
+                    logits_mean=logits_mean,
+                    logits_std=logits_std,
+                )
+
         sampler_output = SamplerOutput(
             sampled_token_ids=share_inputs["accept_tokens"],
             logprobs_tensors=logprobs_tensors,
@@ -987,6 +1053,7 @@ def __init__(self, fd_config: FDConfig):
         else:
             raise NotImplementedError
         self.logprobs_mode = fd_config.model_config.logprobs_mode
+        self.compute_logits_stats = fd_config.model_config.compute_logits_stats
         self.enable_draft_logprob = fd_config.speculative_config.enable_draft_logprob
 
     def pre_process(self, skip_idx_list: List[int] = []):
@@ -1167,6 +1234,24 @@ def forward_cuda(
 
             logprobs_tensors = self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=token_ids)
 
+            # Compute logits statistics on draft_logits for MTP tokens
+            if self.compute_logits_stats:
+                draft_logits_for_stats = share_inputs["draft_logits"][:real_token_num, :]
+                with paddle.no_grad():
+                    logits_min = paddle.min(draft_logits_for_stats, axis=1)
+                    logits_max = paddle.max(draft_logits_for_stats, axis=1)
+                    logits_mean = paddle.mean(draft_logits_for_stats, axis=1)
+                    logits_std = paddle.std(draft_logits_for_stats, axis=1)
+                logprobs_tensors = LogprobsTensors(
+                    logprob_token_ids=logprobs_tensors.logprob_token_ids,
+                    logprobs=logprobs_tensors.logprobs,
+                    selected_token_ranks=logprobs_tensors.selected_token_ranks,
+                    logits_min=logits_min,
+                    logits_max=logits_max,
+                    logits_mean=logits_mean,
+                    logits_std=logits_std,
+                )
+
         sampler_output = SamplerOutput(
             sampled_token_ids=token_ids,
             logprobs_tensors=logprobs_tensors,
diff --git a/fastdeploy/worker/input_batch.py b/fastdeploy/worker/input_batch.py
@@ -21,6 +21,7 @@
 from fastdeploy.model_executor.layers.rotary_embedding import get_rope
 from fastdeploy.model_executor.logits_processor import build_logits_processors
 from fastdeploy.platforms import current_platform
+from fastdeploy.worker.output import LogprobsTensors
 
 
 class InputBatch:
@@ -1127,9 +1128,26 @@ def recover_batch_index_for_sampler_output(sampler_output, index_to_batch_id, en
         real_logprob_token_ids = _recover_tensor(logprob_token_ids, src_order)
         real_logprobs = _recover_tensor(logprobs, src_order)
         real_selected_token_ranks = _recover_tensor(selected_token_ranks, src_order)
-        sampler_output.logprobs_tensors.logprob_token_ids = real_logprob_token_ids
-        sampler_output.logprobs_tensors.logprobs = real_logprobs
-        sampler_output.logprobs_tensors.sampled_token_ranks = real_selected_token_ranks
+
+        real_logits_min = None
+        real_logits_max = None
+        real_logits_mean = None
+        real_logits_std = None
+        if sampler_output.logprobs_tensors.logits_min is not None:
+            real_logits_min = _recover_tensor(sampler_output.logprobs_tensors.logits_min, src_order)
+            real_logits_max = _recover_tensor(sampler_output.logprobs_tensors.logits_max, src_order)
+            real_logits_mean = _recover_tensor(sampler_output.logprobs_tensors.logits_mean, src_order)
+            real_logits_std = _recover_tensor(sampler_output.logprobs_tensors.logits_std, src_order)
+
+        sampler_output.logprobs_tensors = LogprobsTensors(
+            logprob_token_ids=real_logprob_token_ids,
+            logprobs=real_logprobs,
+            selected_token_ranks=real_selected_token_ranks,
+            logits_min=real_logits_min,
+            logits_max=real_logits_max,
+            logits_mean=real_logits_mean,
+            logits_std=real_logits_std,
+        )
 
     if sampler_output.token_num_per_batch is not None:
         token_num_per_batch = sampler_output.token_num_per_batch