Skip to content

Commit bc7152b

Browse files
committed
support xpu w4a8c8+skip quant
1 parent bc31f10 commit bc7152b

File tree

8 files changed

+183
-24
lines changed

8 files changed

+183
-24
lines changed

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,11 @@ def allocated_slots(self, request: Request):
235235
return len(request.block_tables) * self.config.cache_config.block_size
236236

237237
def get_new_block_nums(self, request: Request, num_new_tokens: int):
238+
# Account for preallocated blocks that haven't been added to block_tables yet
239+
preallocated_count = len(getattr(request, 'preallocated_blocks', []))
238240
block_num = (
239241
request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1
240-
) // self.config.cache_config.block_size - len(request.block_tables)
242+
) // self.config.cache_config.block_size - len(request.block_tables) - preallocated_count
241243

242244
if self.config.speculative_config.method is not None:
243245
block_num = min(block_num + 1, self.config.cache_config.max_block_num_per_seq)
@@ -800,8 +802,14 @@ def get_enough_request(request, scheduled_reqs):
800802
self.allocated_slots(request) - request.num_total_tokens
801803
<= self.config.cache_config.prealloc_dec_block_slot_num_threshold
802804
):
805+
# First, consume any preallocated blocks before allocating new ones
806+
preallocated = getattr(request, 'preallocated_blocks', [])
807+
if preallocated:
808+
request.block_tables.extend(preallocated)
809+
request.preallocated_blocks = []
810+
scheduled_reqs.append(self._prepare_decode_task(request))
803811
# Allocation for next decoding blocks
804-
if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
812+
elif self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
805813
llm_logger.debug(
806814
f"schedule decoding task: {request} request.num_total_tokens {request.num_total_tokens} request.num_computed_tokens {request.num_computed_tokens}"
807815
)
@@ -911,6 +919,12 @@ def _allocate_decode_and_extend():
911919
request.block_tables.extend(
912920
self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
913921
)
922+
# Merge preallocated blocks (from PD disaggregation) into block_tables
923+
# so the attention kernel can access all reserved blocks.
924+
preallocated = getattr(request, 'preallocated_blocks', [])
925+
if preallocated:
926+
request.block_tables.extend(preallocated)
927+
request.preallocated_blocks = []
914928
# Prepare prefill task
915929
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
916930
else: # Not enough blocks to allocate, trigger preemption
@@ -920,6 +934,11 @@ def _allocate_decode_and_extend():
920934
request.block_tables.extend(
921935
self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
922936
)
937+
# Merge preallocated blocks (from PD disaggregation) into block_tables
938+
preallocated = getattr(request, 'preallocated_blocks', [])
939+
if preallocated:
940+
request.block_tables.extend(preallocated)
941+
request.preallocated_blocks = []
923942
# Prepare prefill task
924943
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
925944
token_budget -= num_new_tokens
@@ -1403,9 +1422,10 @@ def preallocate_resource_in_d(self, request: Request):
14031422
"""
14041423
assert self.config.scheduler_config.splitwise_role == "decode", "Only D instance can call this method"
14051424
request.need_prefill_tokens = len(request.prompt_token_ids)
1406-
need_prealloc_prefill_blocks = (
1425+
actual_prefill_blocks = (
14071426
request.need_prefill_tokens + self.config.cache_config.block_size - 1
1408-
) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num
1427+
) // self.config.cache_config.block_size
1428+
need_prealloc_prefill_blocks = actual_prefill_blocks + self.config.cache_config.enc_dec_block_num
14091429

14101430
with self.lock:
14111431
if len(self.waiting) > 0:
@@ -1416,11 +1436,16 @@ def preallocate_resource_in_d(self, request: Request):
14161436
if not self.cache_manager.can_allocate_gpu_blocks(total_need_blocks):
14171437
return False
14181438

1419-
request.block_tables = self.cache_manager.allocate_gpu_blocks(
1439+
all_blocks = self.cache_manager.allocate_gpu_blocks(
14201440
need_prealloc_prefill_blocks, request.request_id
14211441
)
1442+
# Only put the blocks that will actually contain prefilled KV data into block_tables.
1443+
# The extra enc_dec_block_num blocks are pre-reserved for future decode tokens and
1444+
# stored separately to avoid the attention kernel reading uninitialized KV cache data.
1445+
request.block_tables = all_blocks[:actual_prefill_blocks]
1446+
request.preallocated_blocks = all_blocks[actual_prefill_blocks:]
14221447
request.num_computed_tokens = request.need_prefill_tokens
1423-
request.disaggregate_info["block_tables"] = request.block_tables
1448+
request.disaggregate_info["block_tables"] = all_blocks
14241449
allocated_position = self.get_available_position()
14251450
request.idx = allocated_position
14261451
self.tasks_list[request.idx] = request
@@ -1470,6 +1495,12 @@ def add_prefilled_request(self, request_output: RequestOutput):
14701495
self.running.append(request)
14711496

14721497
def _free_blocks(self, request: Request):
1498+
# Also free any preallocated blocks that haven't been consumed yet
1499+
preallocated = getattr(request, 'preallocated_blocks', [])
1500+
if preallocated:
1501+
self.cache_manager.recycle_gpu_blocks(preallocated, request.request_id)
1502+
request.preallocated_blocks = []
1503+
14731504
if self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode":
14741505
self.cache_manager.release_block_ids(request)
14751506
self.cache_manager.recycle_gpu_blocks(

fastdeploy/model_executor/layers/backends/xpu/attention.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,9 @@ def forward_mixed(
181181
cache_v_scale = getattr(layer, "cache_v_scale", None)
182182
cache_k_out_scale = getattr(layer, "cache_k_out_scale", None)
183183
cache_v_out_scale = getattr(layer, "cache_v_out_scale", None)
184-
cache_k_zp = getattr(self, "cache_k_zp", None)
185-
cache_v_zp = getattr(self, "cache_v_zp", None)
184+
# todo: there are accuracy bugs in block_attn with zero_point
185+
cache_k_zp = getattr(layer, "cache_k_zp", None)
186+
cache_v_zp = getattr(layer, "cache_v_zp", None)
186187

187188
if layer.use_qk_norm:
188189
q_norm_weight = layer.q_norm_weight
@@ -220,8 +221,8 @@ def forward_mixed(
220221
cache_v_scale,
221222
cache_k_out_scale,
222223
cache_v_out_scale,
223-
cache_k_zp,
224-
cache_v_zp,
224+
cache_k_zp.astype("bfloat16") if cache_k_zp is not None else None, # for C8 with zero_point
225+
cache_v_zp.astype("bfloat16") if cache_v_zp is not None else None, # for C8 with zero_point
225226
None, # shift
226227
None, # smooth
227228
q_norm_weight,

fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,12 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
268268
default_initializer=paddle.nn.initializer.Constant(0),
269269
),
270270
)
271+
set_weight_attrs(
272+
getattr(layer, self.added_scale_attrs[0]),
273+
{
274+
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
275+
},
276+
)
271277
setattr(
272278
layer,
273279
self.added_scale_attrs[1],
@@ -277,6 +283,26 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
277283
default_initializer=paddle.nn.initializer.Constant(0),
278284
),
279285
)
286+
set_weight_attrs(
287+
getattr(layer, self.added_scale_attrs[1]),
288+
{
289+
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
290+
},
291+
)
292+
293+
set_weight_attrs(
294+
layer.up_gate_proj_weight,
295+
{
296+
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
297+
},
298+
)
299+
set_weight_attrs(
300+
layer.down_proj_weight,
301+
{
302+
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
303+
},
304+
)
305+
280306

281307
if self.moe_quant_type in ["w8a8", "w4a8"]:
282308
for in_scale_name in self.added_in_scale_attrs:
@@ -289,6 +315,21 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
289315
default_initializer=paddle.nn.initializer.Constant(0),
290316
),
291317
)
318+
set_weight_attrs(
319+
layer.down_proj_in_scale,
320+
{
321+
"SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None},
322+
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
323+
},
324+
)
325+
326+
set_weight_attrs(
327+
layer.up_gate_proj_in_scale,
328+
{
329+
"SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None},
330+
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
331+
},
332+
)
292333

293334
def process_loaded_weights(self, layer: nn.Layer, state_dict):
294335
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)

fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from typing import Optional
1818

19+
import os
1920
import paddle
2021
from paddle import nn
2122

@@ -42,6 +43,7 @@ def __init__(self, kv_cache_quant_type: str, is_channel_wise: bool, has_zero_poi
4243
super().__init__()
4344
self.kv_cache_quant_type = kv_cache_quant_type
4445
self.is_channel_wise = is_channel_wise
46+
self.has_zero_point = has_zero_point
4547

4648
try:
4749
self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type)
@@ -139,6 +141,48 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
139141
scale_shape = [layer.fd_config.model_config.num_key_value_heads]
140142
if self.cache_quant_config.is_channel_wise:
141143
scale_shape = [layer.kv_num_heads * layer.head_dim]
144+
# Custom weight_loader for C8+TP: the safetensors scale/zp shape is
145+
# [1, num_kv_heads, 1, head_dim]. We must split along the kv_heads
146+
# dimension (dim=1), not the last dimension. The default_weight_loader
147+
# treats output_dim as boolean and always splits along dim=-1, which
148+
# is incorrect for 4D tensors where we need to split along dim=1.
149+
fd_config = layer.fd_config
150+
total_kv_heads = fd_config.model_config.num_key_value_heads
151+
tp_size = fd_config.parallel_config.tensor_parallel_size
152+
tp_rank = fd_config.parallel_config.tensor_parallel_rank
153+
def _kv_scale_weight_loader(param, loaded_weight, shard_id=None,
154+
_total_kv_heads=total_kv_heads,
155+
_tp_size=tp_size, _tp_rank=tp_rank):
156+
loaded_weight = get_tensor(loaded_weight).cast("float32")
157+
# TP split along kv_heads dimension
158+
if _tp_size > 1 and not fd_config.load_config.is_pre_sharded:
159+
head_dim = loaded_weight.numel() // _total_kv_heads
160+
loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim])
161+
kv_heads_per_rank = _total_kv_heads // _tp_size
162+
start = _tp_rank * kv_heads_per_rank
163+
end = start + kv_heads_per_rank
164+
loaded_weight = loaded_weight[start:end, :]
165+
loaded_weight = (127/loaded_weight).reshape(param.shape).cast(param.dtype)
166+
param.copy_(loaded_weight, False)
167+
def _kv_zp_weight_loader(param, loaded_weight, shard_id=None,
168+
_total_kv_heads=total_kv_heads,
169+
_tp_size=tp_size, _tp_rank=tp_rank):
170+
loaded_weight = get_tensor(loaded_weight).cast(param.dtype)
171+
# TP split along kv_heads dimension
172+
if _tp_size > 1 and not fd_config.load_config.is_pre_sharded:
173+
head_dim = loaded_weight.numel() // _total_kv_heads
174+
loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim])
175+
kv_heads_per_rank = _total_kv_heads // _tp_size
176+
start = _tp_rank * kv_heads_per_rank
177+
end = start + kv_heads_per_rank
178+
loaded_weight = loaded_weight[start:end, :]
179+
loaded_weight = loaded_weight.reshape(param.shape)
180+
param.copy_(loaded_weight, False)
181+
scale_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_scale_weight_loader}
182+
zp_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_zp_weight_loader}
183+
else:
184+
scale_weight_attrs = extra_weight_attrs
185+
zp_weight_attrs = extra_weight_attrs
142186

143187
layer.cache_k_scale = layer.create_parameter(
144188
shape=scale_shape,
@@ -154,13 +198,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
154198
set_weight_attrs(
155199
layer.cache_k_scale,
156200
{
157-
**extra_weight_attrs,
201+
**scale_weight_attrs,
158202
},
159203
)
160204
set_weight_attrs(
161205
layer.cache_v_scale,
162206
{
163-
**extra_weight_attrs,
207+
**scale_weight_attrs,
164208
},
165209
)
166210

@@ -189,13 +233,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
189233
set_weight_attrs(
190234
layer.cache_k_zp,
191235
{
192-
**extra_weight_attrs,
236+
**zp_weight_attrs,
193237
},
194238
)
195239
set_weight_attrs(
196240
layer.cache_v_zp,
197241
{
198-
**extra_weight_attrs,
242+
**zp_weight_attrs,
199243
},
200244
)
201245

@@ -218,11 +262,18 @@ def process_weights_after_loading(self, layer: nn.Layer):
218262
"""
219263
use for loader v1
220264
"""
221-
# cache_k_out_scale is the reciprocal of cache_k_scale
222-
if layer.cache_k_scale._is_initialized():
223-
layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) # cache_k_out_scale
224-
if layer.cache_v_scale._is_initialized():
225-
layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale)
265+
use_c8 = os.getenv("FD_XPU_YIYAN_MODEL", "0") == "1"
266+
if use_c8:
267+
if layer.cache_k_scale._is_initialized():
268+
layer.cache_k_out_scale.set_value(self.cache_quant_config.max_bound / layer.cache_k_scale.cast("float32").reshape_([-1])) # cache_k_out_scale
269+
if layer.cache_v_scale._is_initialized():
270+
layer.cache_v_out_scale.set_value(self.cache_quant_config.max_bound / layer.cache_v_scale.cast("float32").reshape_([-1]))
271+
else:
272+
# cache_k_out_scale is the reciprocal of cache_k_scale
273+
if layer.cache_k_scale._is_initialized():
274+
layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) # cache_k_out_scale
275+
if layer.cache_v_scale._is_initialized():
276+
layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale)
226277

227278
def apply(self, layer):
228279
"""

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,19 @@ def __init__(
260260
tp_size={self.tp_size}."
261261
)
262262

263+
def _load_in_scale_weight(self, param, expert_id, loaded_weight):
264+
# only support ernie now
265+
expert_param = param[expert_id - self.expert_id_offset]
266+
loaded_weight = get_tensor(loaded_weight)
267+
if len(expert_param.shape) != len(loaded_weight.shape):
268+
loaded_weight = loaded_weight.reshape(expert_param.shape)
269+
assert expert_param.shape == loaded_weight.shape, (
270+
f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
271+
)
272+
if expert_param.dtype != loaded_weight.dtype:
273+
loaded_weight = loaded_weight.cast(expert_param.dtype)
274+
param[expert_id - self.expert_id_offset].copy_(loaded_weight, False)
275+
263276
def weight_loader(
264277
self,
265278
param,
@@ -292,9 +305,15 @@ def weight_loader(
292305
if weight_need_transpose:
293306
loaded_weight = loaded_weight.transpose([1, 0])
294307

308+
if SHARD_ID_TO_SHARDED_DIM["gate"] is None and SHARD_ID_TO_SHARDED_DIM["up"] is None:
309+
# in scale
310+
self._load_in_scale_weight(param, expert_id, loaded_weight)
311+
return
312+
295313
if shard_id is None:
296314
# 1.gate up fused in disk
297-
output_size = param[expert_id - self.expert_id_offset].shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
315+
shard_param = param[expert_id - self.expert_id_offset]
316+
output_size = shard_param.shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
298317
shard_offsets = [
299318
# (shard_id, shard_offset, shard_size)
300319
("gate", 0, output_size // 2 * self.tp_size),

fastdeploy/model_executor/layers/quantization/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
173173
from .weight_only import WeightOnlyConfig, WINT4Config, WINT8Config
174174
from .wfp8afp8 import WFP8AFP8Config
175175
from .wint2 import WINT2Config
176+
from fastdeploy.platforms import current_platform
176177

177178
if envs.FD_MOE_MXFP4_BACKEND is not None:
178179
from .mxfp4 import MXFP4Config
@@ -196,4 +197,9 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
196197
if quantization == "modelopt_fp4":
197198
method_to_config["modelopt_fp4"] = ModelOptNvFp4Config
198199

200+
# For XPU platform, use XPUKvCacheQuantConfig instead of KvCacheQuantConfig
201+
if quantization == "kvcache" and current_platform.is_xpu():
202+
from ..backends.xpu.quantization.kv_cache import XPUKvCacheQuantConfig
203+
method_to_config["kvcache"] = XPUKvCacheQuantConfig
204+
199205
return method_to_config[quantization]

fastdeploy/model_executor/models/ernie4_5_moe.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,7 @@ def load_weights(self, weights_iterator) -> None:
597597
("attn.cache_k_scale", "cachek_matmul.in_scale", None, None),
598598
("attn.cache_v_scale", "cachev_matmul.in_scale", None, None),
599599
("up_gate_proj_in_scale", "up_gate_proj.in_scale", None, None),
600+
("down_proj_in_scale", "down_proj.in_scale", None, None),
600601
]
601602

602603
expert_params_mapping = []

0 commit comments

Comments
 (0)