-
Notifications
You must be signed in to change notification settings - Fork 230
Expand file tree
/
Copy pathlib.rs
More file actions
80 lines (67 loc) · 3.07 KB
/
lib.rs
File metadata and controls
80 lines (67 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
use core::mem::MaybeUninit;
use cuda_std::*;
#[kernel]
#[allow(improper_ctypes_definitions)]
#[allow(clippy::needless_range_loop)]
/// # Safety
///
/// This function is unsafe because it dereferences raw pointers.
pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) {
let bx: usize = cuda_std::thread::block_idx().x as usize;
let by: usize = cuda_std::thread::block_idx().y as usize;
let tx: usize = cuda_std::thread::thread_idx().x as usize;
let ty: usize = cuda_std::thread::thread_idx().y as usize;
const BLOCK_SIZE: usize = 32;
let a_begin = wa * BLOCK_SIZE * by;
let a_end = a_begin + wa - 1;
let a_step = BLOCK_SIZE;
let b_begin = BLOCK_SIZE * bx;
let b_step = BLOCK_SIZE * wb;
let mut c_sub: f32 = 0.0;
let mut kahan_correction_factor = 0.0f32;
let mut bi = b_begin;
for ai in (a_begin..=a_end).step_by(a_step) {
// The equivalent Cuda C++ code for the below is:
// ```
// __shared__ float A_MATRIX[BLOCK_SIZE][BLOCK_SIZE];
// ```
// This memory space is shared between threads of the same block
#[address_space(shared)]
static mut A_MATRIX: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
[[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
#[address_space(shared)]
static mut B_MATRIX: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
[[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
// Load A and B matrices into shared memory
// A.add(index) returns the pointer to the index-th element of A
// Hence a dereference is needed to get the value at that index
unsafe {
A_MATRIX[ty][tx].write(a[ai + wa * ty + tx]);
B_MATRIX[ty][tx].write(b[bi + wb * ty + tx]);
}
// Synchronize to make sure the matrices are loaded
cuda_std::thread::sync_threads();
for k in 0..BLOCK_SIZE {
// Typically, this would be a simple calculation:
// ```
// c_sub += A_MATRIX[ty][k] * B_MATRIX[k][tx];
// ```
// However, to improve numerical stability, we use Kahan summation here so that the error can be isolated
// and not allow it to accumulate in c_sub
let input = unsafe { A_MATRIX[ty][k].assume_init() * B_MATRIX[k][tx].assume_init() };
let y = input - kahan_correction_factor;
let sum = c_sub + y;
// This seems like the correction factor would yield zero, however due to f32 precision limitations,
// it helps to isolate the small errors that would otherwise accumulate in c_sub
kahan_correction_factor = (sum - c_sub) - y;
c_sub = sum;
}
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
cuda_std::thread::sync_threads();
bi += b_step;
}
let ci = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
unsafe {
*c.add((ci + wb * ty + tx) as usize) = c_sub;
}
}