style: apply black formatting and drop unused `device` var for pre-commit

sufubao · sufubao · commit 0752b1d2e1c4 · 2026-06-05T14:42:28.000+08:00
diff --git a/lightllm/common/basemodel/attention/fa3/fp.py b/lightllm/common/basemodel/attention/fa3/fp.py
@@ -105,7 +105,7 @@ def _nomarl_prefill_att(
 
         k_descale, v_descale = None, None  # disable quantization
         Lq = q.shape[-1]
-        sm_scale = 1.0 / (Lq**0.5)
+        sm_scale = 1.0 / (Lq ** 0.5)
         o = flash_attn_with_kvcache(
             q=q,
             k_cache=k.view(k.shape[0], 1, k.shape[1], k.shape[2]),
@@ -237,7 +237,7 @@ def _normal_decode_att(
 
         k_descale, v_descale = None, None  # disable quantization
         Lq = q.shape[-1]
-        sm_scale = 1.0 / (Lq**0.5)
+        sm_scale = 1.0 / (Lq ** 0.5)
         o = flash_attn_with_kvcache(
             q=q,
             k_cache=k.view(k.shape[0], 1, k.shape[1], k.shape[2]),
diff --git a/lightllm/common/basemodel/attention/fa3/fp8.py b/lightllm/common/basemodel/attention/fa3/fp8.py
@@ -44,9 +44,12 @@ def init_state(self):
             torch.arange(batch_size, device=device), self.infer_state.b_q_seq_len
         )
         # 为了减少推理计算量，在推理外部初始化k_descale和v_descale
-        self.k_descale = offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-        self.v_descale = offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-
+        self.k_descale = (
+            offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        )
+        self.v_descale = (
+            offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        )
 
     def prefill_att(
         self,
@@ -115,16 +118,19 @@ def init_state(self):
         super().init_state()
         self.backend: Fp8Fa3AttBackend = self.backend
 
-        device = self.infer_state.input_ids.device
         batch_size = self.b_att_seq_len.shape[0]
         mem_manager = self.backend.model.mem_manager
 
         offline_scales: torch.Tensor = mem_manager.scales
         head_num = mem_manager.head_num
 
         # 为了减少推理计算量，在推理外部初始化k_descale和v_descale
-        self.k_descale = offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-        self.v_descale = offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        self.k_descale = (
+            offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        )
+        self.v_descale = (
+            offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        )
 
         return
 
diff --git a/lightllm/models/qwen3_5_mtp/layer_infer/pre_layer_infer.py b/lightllm/models/qwen3_5_mtp/layer_infer/pre_layer_infer.py
@@ -6,7 +6,6 @@
 
 
 class Qwen3_5MTPPreLayerInfer(Qwen3VLMultimodalPreLayerInfer):
-
     def __init__(self, network_config):
         super().__init__(network_config)
         self.eps_ = network_config["rms_norm_eps"]
diff --git a/lightllm/models/qwen3_5_mtp/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_5_mtp/layer_weights/pre_and_post_layer_weight.py
@@ -9,7 +9,6 @@
 
 
 class Qwen3_5MTPPreAndPostLayerWeight(PreAndPostLayerWeight):
-
     def __init__(self, data_type, network_config, quant_cfg: Quantcfg):
         super().__init__(data_type, network_config)
         self.quant_cfg: Quantcfg = quant_cfg
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
@@ -115,11 +115,7 @@ def prefill_normal(
         model_input, run_reqs = prepare_prefill_inputs(prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
-            (
-                _,
-                next_token_ids_cpu,
-                next_token_logprobs_cpu,
-            ) = self._sample_and_scatter_token(
+            (_, next_token_ids_cpu, next_token_logprobs_cpu,) = self._sample_and_scatter_token(
                 logits=model_output.logits,
                 b_req_idx=model_input.b_req_idx,
                 b_mtp_index=model_input.b_mtp_index,
@@ -162,11 +158,7 @@ def decode_normal(
         model_input, run_reqs = prepare_decode_inputs(decode_reqs)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
-            (
-                _,
-                next_token_ids_cpu,
-                next_token_logprobs_cpu,
-            ) = self._sample_and_scatter_token(
+            (_, next_token_ids_cpu, next_token_logprobs_cpu,) = self._sample_and_scatter_token(
                 logits=model_output.logits,
                 b_req_idx=model_input.b_req_idx,
                 b_mtp_index=model_input.b_mtp_index,
@@ -204,11 +196,7 @@ def prefill_mtp(
         model_input, run_reqs = prepare_prefill_inputs(prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
-            (
-                next_token_ids,
-                next_token_ids_cpu,
-                next_token_logprobs_cpu,
-            ) = self._sample_and_scatter_token(
+            (next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu,) = self._sample_and_scatter_token(
                 logits=model_output.logits,
                 b_req_idx=model_input.b_req_idx,
                 b_mtp_index=model_input.b_mtp_index,
@@ -490,11 +478,7 @@ def _draft_decode_eagle(
         g_infer_state_lock.release()
         eagle_mem_indexes = eagle_mem_indexes_cpu.cuda(non_blocking=True)
 
-        (
-            draft_model_input,
-            draft_next_token_ids,
-            accepted_req_idx,
-        ) = self._build_eagle_accepted_draft_input(
+        (draft_model_input, draft_next_token_ids, accepted_req_idx,) = self._build_eagle_accepted_draft_input(
             main_model_input=main_model_input,
             main_model_output=main_model_output,
             next_token_ids=next_token_ids,
diff --git a/unit_tests/common/basemodel/test_mtp_decode_cuda_graph.py b/unit_tests/common/basemodel/test_mtp_decode_cuda_graph.py
@@ -286,11 +286,7 @@ def test_build_eagle_accepted_draft_input_narrows_to_accepted_rows():
     b_req_mtp_start_loc = torch.tensor([0, 3], dtype=torch.int32)
     mtp_accept_len = torch.tensor([2, 3], dtype=torch.int32)
 
-    (
-        draft_input,
-        accepted_next_tokens,
-        accepted_req_idx,
-    ) = backend._build_eagle_accepted_draft_input(
+    (draft_input, accepted_next_tokens, accepted_req_idx,) = backend._build_eagle_accepted_draft_input(
         main_model_input=main_input,
         main_model_output=main_output,
         next_token_ids=next_token_ids,