-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel.py
More file actions
87 lines (71 loc) · 2.93 KB
/
kernel.py
File metadata and controls
87 lines (71 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
from pycuda import gpuarray
# CUDA kernel computing the pairwise L1 (Manhattan) distance between two sets
# of equal-length series. Thread (tx, ty) of a block produces one output cell:
# the y dimension selects a series from data_1 (matrix row) and the x dimension
# selects a series from data_2 (matrix column). The launch must provide
# (blockDim.x + blockDim.y) * series_length floats of dynamic shared memory.
mod = SourceModule("""
__global__ void l1_distance_matrix(float *distance_matrix,
                                   float **data_1,
                                   float **data_2,
                                   int num_series_1,
                                   int num_series_2,
                                   int series_length) {
    // Dynamic shared memory layout: blockDim.y staged rows of data_1,
    // followed by blockDim.x staged rows of data_2.
    extern __shared__ float smem[];
    float *s_data1 = smem;
    // s_data1 is indexed by ty, so it occupies blockDim.y rows; the data_2
    // staging area must start after those (not after blockDim.x rows).
    float *s_data2 = &smem[series_length * blockDim.y];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int ts_data_1_id = blockIdx.y * blockDim.y + ty;
    int ts_data_2_id = blockIdx.x * blockDim.x + tx;
    bool row_valid = ts_data_1_id < num_series_1;
    bool col_valid = ts_data_2_id < num_series_2;

    // Deliberately no early return for out-of-range threads here: every
    // thread of the block has to reach __syncthreads() below, otherwise the
    // barrier is executed in divergent code, which is undefined behaviour.

    // The first thread of each block row stages that row's data_1 series.
    if (tx == 0 && row_valid) {
        const float *src = data_1[ts_data_1_id];
        for (int i = 0; i < series_length; ++i) {
            s_data1[ty * series_length + i] = src[i];
        }
    }
    // The first thread of each block column stages that column's data_2 series.
    if (ty == 0 && col_valid) {
        const float *src = data_2[ts_data_2_id];
        for (int i = 0; i < series_length; ++i) {
            s_data2[tx * series_length + i] = src[i];
        }
    }
    __syncthreads();

    // Past the barrier it is safe to drop threads without an output cell.
    if (!row_valid || !col_valid) return;

    float sum = 0.0f;
    for (int i = 0; i < series_length; ++i) {
        float diff = s_data1[ty * series_length + i] - s_data2[tx * series_length + i];
        sum += fabsf(diff);
    }
    // Index in size_t so large matrices do not overflow 32-bit int arithmetic.
    distance_matrix[(size_t)ts_data_1_id * num_series_2 + ts_data_2_id] = sum;
}
""")
def _upload_rows(matrix):
    """Upload a C-contiguous 2-D float32 array and build its device row-pointer table.

    Returns a ``(matrix_gpu, row_ptrs_gpu)`` pair. ``row_ptrs_gpu`` holds one
    device address per row of ``matrix_gpu``; the caller must keep
    ``matrix_gpu`` alive for as long as those pointers are in use.
    """
    matrix_gpu = gpuarray.to_gpu(matrix)
    base = int(matrix_gpu.gpudata)
    row_nbytes = matrix.shape[1] * matrix.itemsize
    row_ptrs = np.array(
        [base + i * row_nbytes for i in range(matrix.shape[0])],
        dtype=np.uintp,
    )
    return matrix_gpu, gpuarray.to_gpu(row_ptrs)


def compute_l1_matrix(data_1, data_2, block_size=6):
    """Compute the pairwise L1 (Manhattan) distance matrix on the GPU.

    Args:
        data_1: 2-D array-like of shape ``(num_series_1, series_length)``.
        data_2: 2-D array-like of shape ``(num_series_2, series_length)``.
        block_size: Edge length of the square CUDA thread block. Defaults to 6.

    Returns:
        A float32 ``numpy.ndarray`` of shape ``(num_series_1, num_series_2)``
        whose entry ``[i, j]`` is ``sum(abs(data_1[i] - data_2[j]))``.

    Raises:
        ValueError: If an input is not 2-D, the series lengths differ, or
            ``block_size`` is not positive.

    NOTE: the kernel stages ``2 * block_size`` whole series in shared memory,
    so the requested shared-memory size grows linearly with ``series_length``;
    very long series may exceed the device's per-block limit.
    """
    # The kernel reads raw ``float*`` rows, so force float32 and a contiguous
    # layout; any other dtype would be reinterpreted bit-wise on the device.
    data_1 = np.ascontiguousarray(data_1, dtype=np.float32)
    data_2 = np.ascontiguousarray(data_2, dtype=np.float32)
    if data_1.ndim != 2 or data_2.ndim != 2:
        raise ValueError("data_1 and data_2 must be 2-D arrays")
    if data_1.shape[1] != data_2.shape[1]:
        raise ValueError("data_1 and data_2 must have the same series length")
    if block_size < 1:
        raise ValueError("block_size must be positive")

    num_series_1, series_length = data_1.shape
    num_series_2 = data_2.shape[0]

    # Degenerate inputs: a zero-sized grid is not a valid launch, and with
    # zero-length series every distance is an empty sum.
    if num_series_1 == 0 or num_series_2 == 0:
        return np.empty((num_series_1, num_series_2), dtype=np.float32)
    if series_length == 0:
        return np.zeros((num_series_1, num_series_2), dtype=np.float32)

    # One upload per matrix; the kernel still receives a table of row
    # pointers. The *_gpu matrices are held in locals so the device memory
    # behind those pointers stays allocated until the result is fetched.
    data_1_gpu, data_1_gpu_ptrs = _upload_rows(data_1)
    data_2_gpu, data_2_gpu_ptrs = _upload_rows(data_2)
    distance_matrix_gpu = gpuarray.empty(
        (num_series_1, num_series_2), dtype=np.float32
    )

    l1_kernel = mod.get_function("l1_distance_matrix")

    # The kernel maps blockIdx.x to data_2 (matrix columns) and blockIdx.y to
    # data_1 (matrix rows), so the grid's x extent must cover num_series_2
    # and its y extent num_series_1.
    grid_size = (
        (num_series_2 + block_size - 1) // block_size,
        (num_series_1 + block_size - 1) // block_size,
    )
    # block_size rows of data_1 plus block_size rows of data_2, in float32.
    shared_mem_size = 2 * block_size * series_length * np.dtype(np.float32).itemsize

    l1_kernel(
        distance_matrix_gpu, data_1_gpu_ptrs, data_2_gpu_ptrs,
        np.int32(num_series_1), np.int32(num_series_2), np.int32(series_length),
        block=(block_size, block_size, 1), grid=grid_size, shared=shared_mem_size,
    )

    # Copy the result back to the host.
    distance_matrix = distance_matrix_gpu.get()
    return distance_matrix