Skip to content

Commit 93fad2c

Browse files
authored
Merge branch 'develop' into trtllm_allreduce
2 parents dcda582 + 26c47c2 commit 93fad2c

75 files changed

Lines changed: 1043 additions & 756 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/_gpu_4cards_case_test.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,17 @@ jobs:
181181
docker rm -f ${runner_name} || true
182182
fi
183183
184+
export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')
185+
184186
docker run --rm --net=host \
185-
--shm-size=64g \
186187
--sysctl kernel.msgmax=1048576 \
187188
--sysctl kernel.msgmnb=268435456 \
188189
--name ${runner_name} \
190+
--cap-add=SYS_PTRACE --cap-add=IPC_LOCK \
191+
--shm-size=64G \
192+
${RDMA_DEVICES} \
193+
--device=/dev/infiniband/rdma_cm \
194+
--ulimit memlock=-1:-1 \
189195
-v $(pwd):/workspace -w /workspace \
190196
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
191197
-v "${CACHE_DIR}/.cache:/root/.cache" \
@@ -197,6 +203,10 @@ jobs:
197203
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
198204
-e "FLASK_PORT=${FLASK_PORT}" \
199205
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
206+
-e "FD_ROUTER_PORT=${FD_ROUTER_PORT}" \
207+
-e "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}" \
208+
-e "FD_RDMA_PORT=${FD_RDMA_PORT}" \
209+
-e "CLEAN_CUDA=1" \
200210
-e TZ="Asia/Shanghai" \
201211
-e "fd_wheel_url=${fd_wheel_url}" \
202212
-e "BASE_REF=${BASE_REF}" \

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ custom_ops/tmp*
173173
build
174174

175175
.ccls-cache
176+
.claude
176177

177178
third_party
178179

benchmarks/benchmark_serving.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,8 +1123,10 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str
11231123
def main(args: argparse.Namespace):
11241124
"""Main entry point"""
11251125
print(args)
1126-
random.seed(args.seed)
1127-
np.random.seed(args.seed)
1126+
if args.seed is not None:
1127+
print(f"Using random seed: {args.seed}")
1128+
random.seed(args.seed)
1129+
np.random.seed(args.seed)
11281130

11291131
backend = args.backend
11301132
# 支持多轮对话方式请求,仅支持chat接口
@@ -1431,7 +1433,7 @@ def main(args: argparse.Namespace):
14311433
"bursty requests. A higher burstiness value (burstiness > 1) "
14321434
"results in a more uniform arrival of requests.",
14331435
)
1434-
parser.add_argument("--seed", type=int, default=0)
1436+
parser.add_argument("--seed", type=int, default=None)
14351437
parser.add_argument(
14361438
"--shuffle",
14371439
action="store_true",

custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,10 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
146146
rope_3d);
147147
} else {
148148
if (rotary_dim < dim_head) {
149-
auto* kernelFn =
150-
append_decode_cache_T_neox_partial_rope_kernel<T,
151-
PackSize,
152-
EnforceFmulRN>;
149+
auto* kernelFn = append_decode_cache_T_neox_partial_rope_kernel<
150+
T,
151+
PackSize,
152+
false>; // GLM use EnforceFmulRN=false
153153
launchWithPdlWhenEnabled(kernelFn,
154154
grid_size,
155155
blocksize,

custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2543,10 +2543,10 @@ void gqa_rotary_qk_variable(
25432543
}
25442544
const int pack_num_new = elem_nums / PackSize;
25452545
GetNumBlocks<128>(pack_num_new, &grid_size);
2546-
auto *kernelFn =
2547-
GQANeoxVariableLengthPartialRotaryKernel<T,
2548-
PackSize,
2549-
EnforceFmulRN>;
2546+
auto *kernelFn = GQANeoxVariableLengthPartialRotaryKernel<
2547+
T,
2548+
PackSize,
2549+
false>; // GLM use EnforceFmulRN=false
25502550
launchWithPdlWhenEnabled(kernelFn,
25512551
grid_size,
25522552
blocksize,

custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -387,30 +387,32 @@ void gqa_neox_partial_rotary_qk_split_variable(
387387

388388
const float *cos_emb = rotary_emb;
389389
const float *sin_emb = rotary_emb + max_model_len * rotary_dim / 2;
390-
launchWithPdlWhenEnabled(
391-
GQAVariableLengthNeoxPartialRotarySplitKernel<T, PackSize, EnforceFmulRN>,
392-
grid_size,
393-
block_size,
394-
0,
395-
stream,
396-
qkv_input,
397-
cos_emb,
398-
sin_emb,
399-
batch_id_per_token,
400-
cu_seqlens_q,
401-
seq_lens_encoder,
402-
seq_lens_decoder,
403-
cu_seqlens_k,
404-
qkv_out,
405-
q,
406-
k,
407-
v,
408-
elem_nums,
409-
num_heads,
410-
kv_num_heads,
411-
max_model_len,
412-
head_dim,
413-
rotary_dim);
390+
launchWithPdlWhenEnabled(GQAVariableLengthNeoxPartialRotarySplitKernel<
391+
T,
392+
PackSize,
393+
false>, // GLM use EnforceFmulRN=false
394+
grid_size,
395+
block_size,
396+
0,
397+
stream,
398+
qkv_input,
399+
cos_emb,
400+
sin_emb,
401+
batch_id_per_token,
402+
cu_seqlens_q,
403+
seq_lens_encoder,
404+
seq_lens_decoder,
405+
cu_seqlens_k,
406+
qkv_out,
407+
q,
408+
k,
409+
v,
410+
elem_nums,
411+
num_heads,
412+
kv_num_heads,
413+
max_model_len,
414+
head_dim,
415+
rotary_dim);
414416
}
415417

416418
template <typename T,

custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,11 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
130130
GetNumBlocks(pack_num, &grid_size);
131131
if (use_neox_style) {
132132
if (rotary_dim < dim_head) {
133-
append_speculate_cache_neox_partial_rope_kernel<T,
134-
PackSize,
135-
QKV_TYPE,
136-
EnforceFmulRN>
133+
append_speculate_cache_neox_partial_rope_kernel<
134+
T,
135+
PackSize,
136+
QKV_TYPE,
137+
false> // GLM use EnforceFmulRN=false
137138
<<<grid_size, threads_per_block, 0, stream>>>(
138139
qkv, // [token_num, num_heads + 2 * gqa_group_size, head_size]
139140
key_cache,

custom_ops/gpu_ops/fused_rotary_position_encoding.cu

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ __global__ void apply_rotary_embedding_kernel(
5454
const int num_heads,
5555
const int num_kv_heads,
5656
const int head_size) {
57-
// Each thread block is responsible for one token.
5857
const int token_idx = blockIdx.x;
58+
5959
int pos = position_ids[token_idx];
6060
const T* cache_ptr = cos_sin_cache + pos * rot_dim;
6161

@@ -99,13 +99,10 @@ void FusedRotaryPositionEncoding(
9999
int64_t query_stride = num_heads * head_size;
100100
int64_t key_stride = num_kv_heads * head_size;
101101

102-
if (num_tokens > 65535) {
103-
PD_THROW(
104-
"apply_rotary_embedding_kernel launch failed when num_tokens > 65535.");
105-
}
106-
102+
// 1D grid:gridDim.x 最大 2^31-1,远超实际 token 数
107103
dim3 grid(num_tokens);
108104
dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
105+
109106
PD_DISPATCH_FLOATING_AND_HALF_TYPES(
110107
query.dtype(), "apply_rotary_embedding_kernel", [&] {
111108
if (is_neox) {

custom_ops/gpu_ops/get_attn_mask_q.cu

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ __global__ void get_attn_mask_q_kernel(
2424
const int max_batch_size) {
2525
constexpr int VecSize = 4;
2626
const uint32_t tid = threadIdx.x, bid = blockIdx.x;
27-
int startend_row_vec[4];
27+
int startend_row_vec[2];
2828
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
2929
cudaGridDependencySynchronize();
3030
#endif
@@ -49,9 +49,9 @@ __global__ void get_attn_mask_q_kernel(
4949
const uint32_t cache_k_idx = cu_seqlens_k_idx - kv_start;
5050

5151
startend_row_vec[0] = this_batch_q_end;
52-
startend_row_vec[1] = cu_seqlens_q[max_batch_size];
53-
startend_row_vec[2] = 0;
54-
startend_row_vec[3] = this_batch_q_end;
52+
// startend_row_vec[1] = cu_seqlens_q[max_batch_size];
53+
// startend_row_vec[2] = 0;
54+
startend_row_vec[1] = this_batch_q_end;
5555
for (int this_batch_q_idx = this_batch_q_start;
5656
this_batch_q_idx < this_batch_q_end;
5757
++this_batch_q_idx) {
@@ -62,14 +62,14 @@ __global__ void get_attn_mask_q_kernel(
6262
: this_batch_q_idx - this_batch_q_start + kv_len -
6363
(this_batch_q_len);
6464
if (cache_k_idx <= append_mask_k_end) {
65-
startend_row_vec[3] = min(startend_row_vec[3], this_batch_q_idx);
65+
startend_row_vec[1] = min(startend_row_vec[1], this_batch_q_idx);
6666
// 可提前跳出循环
6767
break;
6868
}
6969
}
70-
reinterpret_cast<int4*>(startend_row_indices_ptr +
71-
cu_seqlens_k_idx * 4)[0] =
72-
reinterpret_cast<int4*>(startend_row_vec)[0];
70+
reinterpret_cast<int2*>(startend_row_indices_ptr +
71+
cu_seqlens_k_idx * 2)[0] =
72+
reinterpret_cast<int2*>(startend_row_vec)[0];
7373
}
7474
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
7575
cudaTriggerProgrammaticLaunchCompletion();
@@ -82,7 +82,7 @@ std::vector<paddle::Tensor> get_attn_mask_q(
8282
const paddle::optional<paddle::Tensor>& attn_mask_kv,
8383
const int kv_token_num) {
8484
paddle::Tensor attn_mask_startend_row_indices = GetEmptyTensor(
85-
{1, 1, kv_token_num, 4}, paddle::DataType::INT32, cu_seqlens_k.place());
85+
{1, 1, kv_token_num, 2}, paddle::DataType::INT32, cu_seqlens_k.place());
8686
const int max_batch_size = cu_seqlens_k.dims()[0] - 1;
8787
constexpr int block_size = 512;
8888
int grid_size = div_up(kv_token_num, block_size);
@@ -123,7 +123,7 @@ std::vector<std::vector<int64_t>> GetAttnMaskQInferShape(
123123
const std::vector<int64_t>& cu_seqlens_k_shape,
124124
const paddle::optional<std::vector<int64_t>>& attn_mask_kv_shape,
125125
const int kv_token_num) {
126-
return {{1, 1, kv_token_num, 4}};
126+
return {{1, 1, kv_token_num, 2}};
127127
}
128128

129129
PD_BUILD_STATIC_OP(get_attn_mask_q)

custom_ops/gpu_ops/merge_prefill_decode_output.cu

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,49 @@ __global__ void FillEncoderDecoderResKernel(T *encoder_res_data,
4444
return;
4545
}
4646

47-
const int load_idx =
48-
((cu_seq_q[bidb] + token_id) * head_num + bidh) * head_dim + land_id * 4;
47+
const int base_idx =
48+
((cu_seq_q[bidb] + token_id) * head_num + bidh) * head_dim;
4949

50-
*reinterpret_cast<float2 *>(encoder_res_data + load_idx) =
51-
*reinterpret_cast<float2 *>(decoder_res_data + load_idx);
50+
if (head_dim == 128) {
51+
const int load_idx = base_idx + land_id * 4;
52+
*reinterpret_cast<float2 *>(encoder_res_data + load_idx) =
53+
*reinterpret_cast<float2 *>(decoder_res_data + load_idx);
54+
} else if (head_dim == 192) {
55+
const int load_idx = base_idx + land_id * 4;
56+
*reinterpret_cast<float2 *>(encoder_res_data + load_idx) =
57+
*reinterpret_cast<float2 *>(decoder_res_data + load_idx);
58+
if (land_id < 16) {
59+
*reinterpret_cast<float2 *>(encoder_res_data + load_idx + 128) =
60+
*reinterpret_cast<float2 *>(decoder_res_data + load_idx + 128);
61+
}
62+
} else if (head_dim == 256) {
63+
// float4 = 单条LDG.128,性能最优
64+
const int load_idx = base_idx + land_id * 8;
65+
*reinterpret_cast<float4 *>(encoder_res_data + load_idx) =
66+
*reinterpret_cast<float4 *>(decoder_res_data + load_idx);
67+
}
5268
}
5369

70+
#define LAUNCH_KERNEL(T, WARPS) \
71+
FillEncoderDecoderResKernel<WARPS> \
72+
<<<grid_dims, head_dim, 0, encoder_res.stream()>>>( \
73+
const_cast<T *>(encoder_res.data<T>()), \
74+
const_cast<T *>(decoder_res.data<T>()), \
75+
seq_lens_encoder.data<int>(), \
76+
seq_lens_decoder.data<int>(), \
77+
seq_lens_this_time.data<int>(), \
78+
cu_seq_q.data<int>(), \
79+
head_num, \
80+
head_dim)
81+
82+
#define LAUNCH_KERNEL_BY_HEAD_DIM(T) \
83+
if (head_dim == 128) \
84+
LAUNCH_KERNEL(T, 4); \
85+
else if (head_dim == 192) \
86+
LAUNCH_KERNEL(T, 6); \
87+
else if (head_dim == 256) \
88+
LAUNCH_KERNEL(T, 8)
89+
5490
void MergePrefillDecodeOutput(const paddle::Tensor &encoder_res,
5591
const paddle::Tensor &decoder_res,
5692
const paddle::Tensor &seq_lens_encoder,
@@ -60,41 +96,20 @@ void MergePrefillDecodeOutput(const paddle::Tensor &encoder_res,
6096
const int head_num,
6197
const int head_dim,
6298
const int max_token) {
63-
if (head_dim != 128) {
64-
PD_THROW("Only supported head_dim = 128");
99+
if (head_dim != 128 && head_dim != 192 && head_dim != 256) {
100+
PD_THROW("Only supported head_dim = 128, 192 or 256");
65101
}
66102
const int batch_size = seq_lens_encoder.shape()[0];
67-
constexpr int warps = 4;
103+
const int warps = head_dim / 32;
68104
const int tokens_block = (max_token + warps - 1) / warps;
69-
dim3 grid_dims;
70-
grid_dims.x = batch_size;
71-
grid_dims.y = head_num;
72-
grid_dims.z = tokens_block;
105+
dim3 grid_dims(batch_size, head_num, tokens_block);
73106

74107
if (encoder_res.dtype() == paddle::DataType::FLOAT16) {
75108
using T = phi::dtype::float16;
76-
FillEncoderDecoderResKernel<warps>
77-
<<<grid_dims, 128, 0, encoder_res.stream()>>>(
78-
const_cast<T *>(encoder_res.data<T>()),
79-
const_cast<T *>(decoder_res.data<T>()),
80-
seq_lens_encoder.data<int>(),
81-
seq_lens_decoder.data<int>(),
82-
seq_lens_this_time.data<int>(),
83-
cu_seq_q.data<int>(),
84-
head_num,
85-
head_dim);
109+
LAUNCH_KERNEL_BY_HEAD_DIM(T);
86110
} else if (encoder_res.dtype() == paddle::DataType::BFLOAT16) {
87111
using T = phi::dtype::bfloat16;
88-
FillEncoderDecoderResKernel<warps>
89-
<<<grid_dims, 128, 0, encoder_res.stream()>>>(
90-
const_cast<T *>(encoder_res.data<T>()),
91-
const_cast<T *>(decoder_res.data<T>()),
92-
seq_lens_encoder.data<int>(),
93-
seq_lens_decoder.data<int>(),
94-
seq_lens_this_time.data<int>(),
95-
cu_seq_q.data<int>(),
96-
head_num,
97-
head_dim);
112+
LAUNCH_KERNEL_BY_HEAD_DIM(T);
98113
}
99114
}
100115

0 commit comments

Comments
 (0)