@@ -53,10 +53,10 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
5353 const int64_t nmat = ne / (ne00 * ne01);
5454 const int64_t n = ne00 * ne01;
5555
56- const int x = blockIdx .x * CUDA_CPY_TILE_DIM_2D + threadIdx .x ;
57- const int y = blockIdx .y * CUDA_CPY_TILE_DIM_2D + threadIdx .y ;
58- const int tx = blockIdx .y * CUDA_CPY_TILE_DIM_2D + threadIdx .x ; // transpose block offset
59- const int ty = blockIdx .x * CUDA_CPY_TILE_DIM_2D + threadIdx .y ;
56+ const int64_t x = ( int64_t ) blockIdx .x * CUDA_CPY_TILE_DIM_2D + threadIdx .x ;
57+ const int64_t y = ( int64_t ) blockIdx .y * CUDA_CPY_TILE_DIM_2D + threadIdx .y ;
58+ const int64_t tx = ( int64_t ) blockIdx .y * CUDA_CPY_TILE_DIM_2D + threadIdx .x ; // transpose block offset
59+ const int64_t ty = ( int64_t ) blockIdx .x * CUDA_CPY_TILE_DIM_2D + threadIdx .y ;
6060
6161 __shared__ float tile[2 ][CUDA_CPY_TILE_DIM_2D ][CUDA_CPY_TILE_DIM_2D +1 ];
6262 int cur_tile_buf = 0 ;
@@ -197,7 +197,7 @@ static void ggml_cpy_scalar_contiguous_cuda(
197197cudaStream_t stream) {
198198
199199 const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1 ) / CUDA_CPY_BLOCK_SIZE ;
200- GGML_ASSERT (num_blocks < UINT_MAX );
200+ GGML_ASSERT (num_blocks <= INT_MAX );
201201 const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params ((dim3 )num_blocks, CUDA_CPY_BLOCK_SIZE , 0 , stream);
202202 ggml_cuda_kernel_launch (cpy_scalar_contiguous<src_t , dst_t >, launch_params, cx, cdst, ne);
203203}
@@ -208,6 +208,14 @@ static void ggml_cpy_scalar_cuda(
208208 const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
209209 const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
210210
// Shared launcher for the cpy_scalar<cpy_1_scalar<src_t, dst_t>> kernel.
// Used by the non-transposed path and as the fallback of the transposed path
// when grid_y or grid_z would exceed USHRT_MAX. Captures the enclosing
// function's arguments by reference and takes no parameters of its own.
211+ const auto launch_scalar_generic = [&]() {
212+ const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1 ) / CUDA_CPY_BLOCK_SIZE ; // ceil(ne / CUDA_CPY_BLOCK_SIZE)
213+ GGML_ASSERT (num_blocks <= INT_MAX ); // NOTE(review): presumably the grid x-dimension limit (2^31-1 blocks) - confirm against the CUDA spec
214+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params ((dim3 )num_blocks, CUDA_CPY_BLOCK_SIZE , 0 , stream); // only the x grid dimension is set from num_blocks
215+ ggml_cuda_kernel_launch (cpy_scalar<cpy_1_scalar<src_t , dst_t >>, launch_params,
216+ cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); // passes the caller's ne00/ne01/ne02, not the ne00n/ne01n/ne02n values used by the transpose launch
217+ };
218+
211219 if (transposed) {
212220 GGML_ASSERT (ne == ne00*ne01*ne02); // ne[3] is 1 assumed
213221 int64_t ne00n, ne01n, ne02n;
@@ -224,20 +232,18 @@ static void ggml_cpy_scalar_cuda(
224232 int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1 ) / CUDA_CPY_TILE_DIM_2D ;
225233 int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1 ) / CUDA_CPY_TILE_DIM_2D ;
226234 int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1 ) / CUDA_CPY_BLOCK_NM ;
227- GGML_ASSERT (grid_x < UINT_MAX );
228- GGML_ASSERT (grid_y < USHRT_MAX );
229- GGML_ASSERT (grid_z < USHRT_MAX );
230- dim3 dimGrid (grid_x, grid_y, grid_z);
231- dim3 dimBlock (CUDA_CPY_TILE_DIM_2D , CUDA_CPY_BLOCK_ROWS , 1 );
232- const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params (dimGrid, dimBlock, 0 , stream);
233- ggml_cuda_kernel_launch (cpy_scalar_transpose<dst_t >, launch_params,
234- cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
235+ GGML_ASSERT (grid_x <= INT_MAX );
236+ if (grid_y > USHRT_MAX || grid_z > USHRT_MAX ) {
237+ launch_scalar_generic ();
238+ } else {
239+ dim3 dimGrid (grid_x, grid_y, grid_z);
240+ dim3 dimBlock (CUDA_CPY_TILE_DIM_2D , CUDA_CPY_BLOCK_ROWS , 1 );
241+ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params (dimGrid, dimBlock, 0 , stream);
242+ ggml_cuda_kernel_launch (cpy_scalar_transpose<dst_t >, launch_params,
243+ cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
244+ }
235245 } else {
236- const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1 ) / CUDA_CPY_BLOCK_SIZE ;
237- GGML_ASSERT (num_blocks < UINT_MAX );
238- const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params ((dim3 )num_blocks, CUDA_CPY_BLOCK_SIZE , 0 , stream);
239- ggml_cuda_kernel_launch (cpy_scalar<cpy_1_scalar<src_t , dst_t >>, launch_params,
240- cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
246+ launch_scalar_generic ();
241247 }
242248}
243249
@@ -248,7 +254,7 @@ static void ggml_cpy_f32_q8_0_cuda(
248254
249255 GGML_ASSERT (ne % QK8_0 == 0 );
250256 const int64_t num_blocks = ne / QK8_0 ;
251- GGML_ASSERT (num_blocks < UINT_MAX );
257+ GGML_ASSERT (num_blocks <= INT_MAX );
252258 cpy_f32_q<cpy_blck_f32_q8_0, QK8_0 ><<<num_blocks, 1 , 0 , stream>>>
253259 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
254260}
@@ -259,7 +265,7 @@ static void ggml_cpy_q8_0_f32_cuda(
259265 const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
260266
261267 const int64_t num_blocks = ne;
262- GGML_ASSERT (num_blocks < UINT_MAX );
268+ GGML_ASSERT (num_blocks <= INT_MAX );
263269 cpy_q_f32<cpy_blck_q8_0_f32, QK8_0 ><<<num_blocks, 1 , 0 , stream>>>
264270 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
265271}
@@ -271,7 +277,7 @@ static void ggml_cpy_f32_q4_0_cuda(
271277
272278 GGML_ASSERT (ne % QK4_0 == 0 );
273279 const int64_t num_blocks = ne / QK4_0 ;
274- GGML_ASSERT (num_blocks < UINT_MAX );
280+ GGML_ASSERT (num_blocks <= INT_MAX );
275281 cpy_f32_q<cpy_blck_f32_q4_0, QK4_0 ><<<num_blocks, 1 , 0 , stream>>>
276282 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
277283}
@@ -284,7 +290,7 @@ static void ggml_cpy_q4_0_f32_cuda(
284290 const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
285291 cudaStream_t stream) {
286292 const int64_t num_blocks = ne;
287- GGML_ASSERT (num_blocks < UINT_MAX );
293+ GGML_ASSERT (num_blocks <= INT_MAX );
288294 cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0 >, QK4_0 ><<<num_blocks, 1 , 0 , stream>>> (
289295 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
290296 ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -297,7 +303,7 @@ static void ggml_cpy_f32_q4_1_cuda(
297303
298304 GGML_ASSERT (ne % QK4_1 == 0 );
299305 const int64_t num_blocks = ne / QK4_1 ;
300- GGML_ASSERT (num_blocks < UINT_MAX );
306+ GGML_ASSERT (num_blocks <= INT_MAX );
301307 cpy_f32_q<cpy_blck_f32_q4_1, QK4_1 ><<<num_blocks, 1 , 0 , stream>>>
302308 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
303309}
@@ -310,7 +316,7 @@ static void ggml_cpy_q4_1_f32_cuda(
310316 const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
311317 cudaStream_t stream) {
312318 const int64_t num_blocks = ne;
313- GGML_ASSERT (num_blocks < UINT_MAX );
319+ GGML_ASSERT (num_blocks <= INT_MAX );
314320 cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1 >, QK4_1 ><<<num_blocks, 1 , 0 , stream>>> (
315321 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
316322 ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -323,7 +329,7 @@ static void ggml_cpy_f32_q5_0_cuda(
323329
324330 GGML_ASSERT (ne % QK5_0 == 0 );
325331 const int64_t num_blocks = ne / QK5_0 ;
326- GGML_ASSERT (num_blocks < UINT_MAX );
332+ GGML_ASSERT (num_blocks <= INT_MAX );
327333 cpy_f32_q<cpy_blck_f32_q5_0, QK5_0 ><<<num_blocks, 1 , 0 , stream>>>
328334 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
329335}
@@ -336,7 +342,7 @@ static void ggml_cpy_q5_0_f32_cuda(
336342 const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
337343 cudaStream_t stream) {
338344 const int64_t num_blocks = ne;
339- GGML_ASSERT (num_blocks < UINT_MAX );
345+ GGML_ASSERT (num_blocks <= INT_MAX );
340346 cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0 >, QK5_0 ><<<num_blocks, 1 , 0 , stream>>> (
341347 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
342348 ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -349,7 +355,7 @@ static void ggml_cpy_f32_q5_1_cuda(
349355
350356 GGML_ASSERT (ne % QK5_1 == 0 );
351357 const int64_t num_blocks = ne / QK5_1 ;
352- GGML_ASSERT (num_blocks < UINT_MAX );
358+ GGML_ASSERT (num_blocks <= INT_MAX );
353359 cpy_f32_q<cpy_blck_f32_q5_1, QK5_1 ><<<num_blocks, 1 , 0 , stream>>>
354360 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
355361}
@@ -362,7 +368,7 @@ static void ggml_cpy_q5_1_f32_cuda(
362368 const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
363369 cudaStream_t stream) {
364370 const int64_t num_blocks = ne;
365- GGML_ASSERT (num_blocks < UINT_MAX );
371+ GGML_ASSERT (num_blocks <= INT_MAX );
366372 cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1 >, QK5_1 ><<<num_blocks, 1 , 0 , stream>>> (
367373 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
368374 ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -375,7 +381,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(
375381
376382 GGML_ASSERT (ne % QK4_NL == 0 );
377383 const int64_t num_blocks = ne / QK4_NL ;
378- GGML_ASSERT (num_blocks < UINT_MAX );
384+ GGML_ASSERT (num_blocks <= INT_MAX );
379385 cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL ><<<num_blocks, 1 , 0 , stream>>>
380386 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
381387}
0 commit comments