-
Notifications
You must be signed in to change notification settings - Fork 92
Expand file tree
/
Copy pathmatmul_raw.cu
More file actions
119 lines (101 loc) · 2.73 KB
/
matmul_raw.cu
File metadata and controls
119 lines (101 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#include <stdio.h>
#include <cuda_runtime.h>
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
void sgemm_naive_cpu(float *A, float *B, float *C, int M, int N, int K)
{
for (int x = 0; x < M; x++)
{
for (int y = 0; y < N; y++)
{
float sum = 0.0f;
for (int i = 0; i < K; i++)
{
sum += A[x * K + i] * B[i * N + y];
}
C[x * N + y] = sum;
}
}
}
__global__ void sgemm_naive_kernel(float *A, float *B, float *C, int M, int N, int K)
{
const uint x = blockIdx.x * blockDim.x + threadIdx.x;
const uint y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < M && y < N)
{
float sum = 0.0f;
for (int i = 0; i < K; i++)
{
sum += A[x * K + i] * B[i * N + y];
}
C[x * N + y] = sum;
}
}
void run_sgemm_naive(float *A, float *B, float *C, int m, int n, int k)
{
dim3 block_size(32, 32);
dim3 grid_size(CEIL_DIV(m, 32), CEIL_DIV(n, 32));
sgemm_naive_kernel<<<grid_size, block_size>>>(A, B, C, m, n, k);
}
void randomize_matrix(float *mat, int N)
{
for (int i = 0; i < N; i++)
{
mat[i] = rand() % 100;
}
}
int main()
{
int m = 256;
int n = 256;
int k = 256;
// Allocate memory for matrices
float *A, *B, *C, *C_ref;
float *d_A, *d_B, *d_C;
A = new float[m * k];
B = new float[k * n];
C = new float[m * n];
// save reference result
C_ref = new float[m * n];
// Initialize matrices
randomize_matrix(A, m * k);
randomize_matrix(B, k * n);
// Allocate device memory
cudaMalloc((void **)&d_A, m * k * sizeof(float));
cudaMalloc((void **)&d_B, k * n * sizeof(float));
cudaMalloc((void **)&d_C, m * n * sizeof(float));
// Copy matrices to device
cudaMemcpy(d_A, A, m * k * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, k * n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, m * n * sizeof(float), cudaMemcpyHostToDevice);
// Run naive sgemm
run_sgemm_naive(d_A, d_B, d_C, m, n, k);
// Copy result to host
cudaMemcpy(C, d_C, m * n * sizeof(float), cudaMemcpyDeviceToHost);
// Run reference sgemm
sgemm_naive_cpu(A, B, C_ref, m, n, k);
// Verify result
for (int i = 0; i < m * n; i++)
{
if (C[i] != C_ref[i])
{
printf("Error: mismatch at index %d, expected %f, got %f\n", i, C_ref[i], C[i]);
return 1;
}
}
free(A);
free(B);
free(C);
free(C_ref);
A = nullptr;
B = nullptr;
C = nullptr;
C_ref = nullptr;
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
d_A = nullptr;
d_B = nullptr;
d_C = nullptr;
printf("Success!\n");
return 0;
}