BingooYang
diff --git a/‎.github/workflows/_gpu_4cards_case_test.yml‎
Lines changed: 11 additions & 1 deletion b/‎.github/workflows/_gpu_4cards_case_test.yml‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/benchmark_serving.py‎
Lines changed: 5 additions & 3 deletions b/‎benchmarks/benchmark_serving.py‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu‎
Lines changed: 4 additions & 4 deletions b/‎custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh‎
Lines changed: 4 additions & 4 deletions b/‎custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu‎
Lines changed: 26 additions & 24 deletions b/‎custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu‎
Lines changed: 26 additions & 24 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu‎
Lines changed: 5 additions & 4 deletions b/‎custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎custom_ops/gpu_ops/fused_rotary_position_encoding.cu‎
Lines changed: 3 additions & 6 deletions b/‎custom_ops/gpu_ops/fused_rotary_position_encoding.cu‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎custom_ops/gpu_ops/get_attn_mask_q.cu‎
Lines changed: 10 additions & 10 deletions b/‎custom_ops/gpu_ops/get_attn_mask_q.cu‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎custom_ops/gpu_ops/merge_prefill_decode_output.cu‎
Lines changed: 46 additions & 31 deletions b/‎custom_ops/gpu_ops/merge_prefill_decode_output.cu‎
Lines changed: 46 additions & 31 deletions
@@ -181,11 +181,17 @@ jobs:
             docker rm -f ${runner_name} || true
           fi
 
+          export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')
+
           docker run --rm --net=host \
-          --shm-size=64g \
           --sysctl kernel.msgmax=1048576 \
           --sysctl kernel.msgmnb=268435456 \
           --name ${runner_name} \
+          --cap-add=SYS_PTRACE --cap-add=IPC_LOCK \
+          --shm-size=64G \
+          ${RDMA_DEVICES} \
+          --device=/dev/infiniband/rdma_cm \
+          --ulimit memlock=-1:-1 \
           -v $(pwd):/workspace -w /workspace \
           -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
           -v "${CACHE_DIR}/.cache:/root/.cache" \
@@ -197,6 +203,10 @@ jobs:
           -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
           -e "FLASK_PORT=${FLASK_PORT}" \
           -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
+          -e "FD_ROUTER_PORT=${FD_ROUTER_PORT}" \
+          -e "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}" \
+          -e "FD_RDMA_PORT=${FD_RDMA_PORT}" \
+          -e "CLEAN_CUDA=1" \
           -e TZ="Asia/Shanghai" \
           -e "fd_wheel_url=${fd_wheel_url}" \
           -e "BASE_REF=${BASE_REF}" \
 
@@ -173,6 +173,7 @@ custom_ops/tmp*
 build
 
 .ccls-cache
+.claude
 
 third_party
 
 
@@ -1123,8 +1123,10 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str
 def main(args: argparse.Namespace):
     """Main entry point"""
     print(args)
-    random.seed(args.seed)
-    np.random.seed(args.seed)
+    if args.seed is not None:
+        print(f"Using random seed: {args.seed}")
+        random.seed(args.seed)
+        np.random.seed(args.seed)
 
     backend = args.backend
     # 支持多轮对话方式请求，仅支持chat接口
@@ -1431,7 +1433,7 @@ def main(args: argparse.Namespace):
         "bursty requests. A higher burstiness value (burstiness > 1) "
         "results in a more uniform arrival of requests.",
     )
-    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--seed", type=int, default=None)
     parser.add_argument(
         "--shuffle",
         action="store_true",
 
@@ -146,10 +146,10 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
           rope_3d);
     } else {
       if (rotary_dim < dim_head) {
-        auto* kernelFn =
-            append_decode_cache_T_neox_partial_rope_kernel<T,
-                                                           PackSize,
-                                                           EnforceFmulRN>;
+        auto* kernelFn = append_decode_cache_T_neox_partial_rope_kernel<
+            T,
+            PackSize,
+            false>;  // GLM use EnforceFmulRN=false
         launchWithPdlWhenEnabled(kernelFn,
                                  grid_size,
                                  blocksize,
 
@@ -2543,10 +2543,10 @@ void gqa_rotary_qk_variable(
         }
         const int pack_num_new = elem_nums / PackSize;
         GetNumBlocks<128>(pack_num_new, &grid_size);
-        auto *kernelFn =
-            GQANeoxVariableLengthPartialRotaryKernel<T,
-                                                     PackSize,
-                                                     EnforceFmulRN>;
+        auto *kernelFn = GQANeoxVariableLengthPartialRotaryKernel<
+            T,
+            PackSize,
+            false>;  // GLM use EnforceFmulRN=false
         launchWithPdlWhenEnabled(kernelFn,
                                  grid_size,
                                  blocksize,
 
@@ -387,30 +387,32 @@ void gqa_neox_partial_rotary_qk_split_variable(
 
   const float *cos_emb = rotary_emb;
   const float *sin_emb = rotary_emb + max_model_len * rotary_dim / 2;
-  launchWithPdlWhenEnabled(
-      GQAVariableLengthNeoxPartialRotarySplitKernel<T, PackSize, EnforceFmulRN>,
-      grid_size,
-      block_size,
-      0,
-      stream,
-      qkv_input,
-      cos_emb,
-      sin_emb,
-      batch_id_per_token,
-      cu_seqlens_q,
-      seq_lens_encoder,
-      seq_lens_decoder,
-      cu_seqlens_k,
-      qkv_out,
-      q,
-      k,
-      v,
-      elem_nums,
-      num_heads,
-      kv_num_heads,
-      max_model_len,
-      head_dim,
-      rotary_dim);
+  launchWithPdlWhenEnabled(GQAVariableLengthNeoxPartialRotarySplitKernel<
+                               T,
+                               PackSize,
+                               false>,  // GLM use EnforceFmulRN=false
+                           grid_size,
+                           block_size,
+                           0,
+                           stream,
+                           qkv_input,
+                           cos_emb,
+                           sin_emb,
+                           batch_id_per_token,
+                           cu_seqlens_q,
+                           seq_lens_encoder,
+                           seq_lens_decoder,
+                           cu_seqlens_k,
+                           qkv_out,
+                           q,
+                           k,
+                           v,
+                           elem_nums,
+                           num_heads,
+                           kv_num_heads,
+                           max_model_len,
+                           head_dim,
+                           rotary_dim);
 }
 
 template <typename T,
 
@@ -130,10 +130,11 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
   GetNumBlocks(pack_num, &grid_size);
   if (use_neox_style) {
     if (rotary_dim < dim_head) {
-      append_speculate_cache_neox_partial_rope_kernel<T,
-                                                      PackSize,
-                                                      QKV_TYPE,
-                                                      EnforceFmulRN>
+      append_speculate_cache_neox_partial_rope_kernel<
+          T,
+          PackSize,
+          QKV_TYPE,
+          false>  // GLM use EnforceFmulRN=false
           <<<grid_size, threads_per_block, 0, stream>>>(
               qkv,  // [token_num, num_heads + 2 * gqa_group_size, head_size]
               key_cache,
 
@@ -54,8 +54,8 @@ __global__ void apply_rotary_embedding_kernel(
     const int num_heads,
     const int num_kv_heads,
     const int head_size) {
-  // Each thread block is responsible for one token.
   const int token_idx = blockIdx.x;
+
   int pos = position_ids[token_idx];
   const T* cache_ptr = cos_sin_cache + pos * rot_dim;
 
@@ -99,13 +99,10 @@ void FusedRotaryPositionEncoding(
   int64_t query_stride = num_heads * head_size;
   int64_t key_stride = num_kv_heads * head_size;
 
-  if (num_tokens > 65535) {
-    PD_THROW(
-        "apply_rotary_embedding_kernel launch failed when num_tokens > 65535.");
-  }
-
+  // 1D grid: gridDim.x may be as large as 2^31-1, far more than any realistic token count
   dim3 grid(num_tokens);
   dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
+
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       query.dtype(), "apply_rotary_embedding_kernel", [&] {
         if (is_neox) {
 
@@ -24,7 +24,7 @@ __global__ void get_attn_mask_q_kernel(
     const int max_batch_size) {
   constexpr int VecSize = 4;
   const uint32_t tid = threadIdx.x, bid = blockIdx.x;
-  int startend_row_vec[4];
+  int startend_row_vec[2];
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   cudaGridDependencySynchronize();
 #endif
@@ -49,9 +49,9 @@ __global__ void get_attn_mask_q_kernel(
     const uint32_t cache_k_idx = cu_seqlens_k_idx - kv_start;
 
     startend_row_vec[0] = this_batch_q_end;
-    startend_row_vec[1] = cu_seqlens_q[max_batch_size];
-    startend_row_vec[2] = 0;
-    startend_row_vec[3] = this_batch_q_end;
+    // startend_row_vec[1] = cu_seqlens_q[max_batch_size];
+    // startend_row_vec[2] = 0;
+    startend_row_vec[1] = this_batch_q_end;
     for (int this_batch_q_idx = this_batch_q_start;
          this_batch_q_idx < this_batch_q_end;
          ++this_batch_q_idx) {
@@ -62,14 +62,14 @@ __global__ void get_attn_mask_q_kernel(
                            : this_batch_q_idx - this_batch_q_start + kv_len -
                                  (this_batch_q_len);
       if (cache_k_idx <= append_mask_k_end) {
-        startend_row_vec[3] = min(startend_row_vec[3], this_batch_q_idx);
+        startend_row_vec[1] = min(startend_row_vec[1], this_batch_q_idx);
         // 可提前跳出循环
         break;
       }
     }
-    reinterpret_cast<int4*>(startend_row_indices_ptr +
-                            cu_seqlens_k_idx * 4)[0] =
-        reinterpret_cast<int4*>(startend_row_vec)[0];
+    reinterpret_cast<int2*>(startend_row_indices_ptr +
+                            cu_seqlens_k_idx * 2)[0] =
+        reinterpret_cast<int2*>(startend_row_vec)[0];
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   cudaTriggerProgrammaticLaunchCompletion();
@@ -82,7 +82,7 @@ std::vector<paddle::Tensor> get_attn_mask_q(
     const paddle::optional<paddle::Tensor>& attn_mask_kv,
     const int kv_token_num) {
   paddle::Tensor attn_mask_startend_row_indices = GetEmptyTensor(
-      {1, 1, kv_token_num, 4}, paddle::DataType::INT32, cu_seqlens_k.place());
+      {1, 1, kv_token_num, 2}, paddle::DataType::INT32, cu_seqlens_k.place());
   const int max_batch_size = cu_seqlens_k.dims()[0] - 1;
   constexpr int block_size = 512;
   int grid_size = div_up(kv_token_num, block_size);
@@ -123,7 +123,7 @@ std::vector<std::vector<int64_t>> GetAttnMaskQInferShape(
     const std::vector<int64_t>& cu_seqlens_k_shape,
     const paddle::optional<std::vector<int64_t>>& attn_mask_kv_shape,
     const int kv_token_num) {
-  return {{1, 1, kv_token_num, 4}};
+  return {{1, 1, kv_token_num, 2}};
 }
 
 PD_BUILD_STATIC_OP(get_attn_mask_q)
 
@@ -44,13 +44,49 @@ __global__ void FillEncoderDecoderResKernel(T *encoder_res_data,
     return;
   }
 
-  const int load_idx =
-      ((cu_seq_q[bidb] + token_id) * head_num + bidh) * head_dim + land_id * 4;
+  const int base_idx =
+      ((cu_seq_q[bidb] + token_id) * head_num + bidh) * head_dim;
 
-  *reinterpret_cast<float2 *>(encoder_res_data + load_idx) =
-      *reinterpret_cast<float2 *>(decoder_res_data + load_idx);
+  if (head_dim == 128) {
+    const int load_idx = base_idx + land_id * 4;
+    *reinterpret_cast<float2 *>(encoder_res_data + load_idx) =
+        *reinterpret_cast<float2 *>(decoder_res_data + load_idx);
+  } else if (head_dim == 192) {
+    const int load_idx = base_idx + land_id * 4;
+    *reinterpret_cast<float2 *>(encoder_res_data + load_idx) =
+        *reinterpret_cast<float2 *>(decoder_res_data + load_idx);
+    if (land_id < 16) {
+      *reinterpret_cast<float2 *>(encoder_res_data + load_idx + 128) =
+          *reinterpret_cast<float2 *>(decoder_res_data + load_idx + 128);
+    }
+  } else if (head_dim == 256) {
+    // float4 = a single LDG.128 instruction, which gives the best performance
+    const int load_idx = base_idx + land_id * 8;
+    *reinterpret_cast<float4 *>(encoder_res_data + load_idx) =
+        *reinterpret_cast<float4 *>(decoder_res_data + load_idx);
+  }
 }
 
+#define LAUNCH_KERNEL(T, WARPS)                           \
+  FillEncoderDecoderResKernel<WARPS>                      \
+      <<<grid_dims, head_dim, 0, encoder_res.stream()>>>( \
+          const_cast<T *>(encoder_res.data<T>()),         \
+          const_cast<T *>(decoder_res.data<T>()),         \
+          seq_lens_encoder.data<int>(),                   \
+          seq_lens_decoder.data<int>(),                   \
+          seq_lens_this_time.data<int>(),                 \
+          cu_seq_q.data<int>(),                           \
+          head_num,                                       \
+          head_dim)
+
+#define LAUNCH_KERNEL_BY_HEAD_DIM(T) \
+  if (head_dim == 128)               \
+    LAUNCH_KERNEL(T, 4);             \
+  else if (head_dim == 192)          \
+    LAUNCH_KERNEL(T, 6);             \
+  else if (head_dim == 256)          \
+  LAUNCH_KERNEL(T, 8)
+
 void MergePrefillDecodeOutput(const paddle::Tensor &encoder_res,
                               const paddle::Tensor &decoder_res,
                               const paddle::Tensor &seq_lens_encoder,
@@ -60,41 +96,20 @@ void MergePrefillDecodeOutput(const paddle::Tensor &encoder_res,
                               const int head_num,
                               const int head_dim,
                               const int max_token) {
-  if (head_dim != 128) {
-    PD_THROW("Only supported head_dim = 128");
+  if (head_dim != 128 && head_dim != 192 && head_dim != 256) {
+    PD_THROW("Only supported head_dim = 128, 192 or 256");
   }
   const int batch_size = seq_lens_encoder.shape()[0];
-  constexpr int warps = 4;
+  const int warps = head_dim / 32;
   const int tokens_block = (max_token + warps - 1) / warps;
-  dim3 grid_dims;
-  grid_dims.x = batch_size;
-  grid_dims.y = head_num;
-  grid_dims.z = tokens_block;
+  dim3 grid_dims(batch_size, head_num, tokens_block);
 
   if (encoder_res.dtype() == paddle::DataType::FLOAT16) {
     using T = phi::dtype::float16;
-    FillEncoderDecoderResKernel<warps>
-        <<<grid_dims, 128, 0, encoder_res.stream()>>>(
-            const_cast<T *>(encoder_res.data<T>()),
-            const_cast<T *>(decoder_res.data<T>()),
-            seq_lens_encoder.data<int>(),
-            seq_lens_decoder.data<int>(),
-            seq_lens_this_time.data<int>(),
-            cu_seq_q.data<int>(),
-            head_num,
-            head_dim);
+    LAUNCH_KERNEL_BY_HEAD_DIM(T);
   } else if (encoder_res.dtype() == paddle::DataType::BFLOAT16) {
     using T = phi::dtype::bfloat16;
-    FillEncoderDecoderResKernel<warps>
-        <<<grid_dims, 128, 0, encoder_res.stream()>>>(
-            const_cast<T *>(encoder_res.data<T>()),
-            const_cast<T *>(decoder_res.data<T>()),
-            seq_lens_encoder.data<int>(),
-            seq_lens_decoder.data<int>(),
-            seq_lens_this_time.data<int>(),
-            cu_seq_q.data<int>(),
-            head_num,
-            head_dim);
+    LAUNCH_KERNEL_BY_HEAD_DIM(T);
   }
 }