forked from parallelcomputingabo/Homework-2
-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathmain.cpp
More file actions
237 lines (210 loc) · 7.37 KB
/
Copy pathmain.cpp
File metadata and controls
237 lines (210 loc) · 7.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <omp.h>
#include <pthread.h>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
// Reference triple-loop matrix product: C += A * B.
//
// All three matrices are dense and row-major: A is m x n, B is n x p and
// C is m x p. The product is accumulated onto whatever C already holds, so
// the caller must zero-initialise C to obtain a plain product.
//
// Parameters:
//   C  accumulator/output, m * p floats
//   A  left operand, m * n floats (only read)
//   B  right operand, n * p floats (only read)
//   m  rows of A and C
//   n  columns of A, rows of B
//   p  columns of B and C
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
                  uint32_t p) {
  // Widen the dimensions once so that every flat index below is computed in
  // std::size_t. With 32-bit operands, i * p + j silently wraps as soon as a
  // matrix has more than 2^32 elements, and a signed int counter cannot even
  // reach row counts above INT_MAX.
  const std::size_t rows = m;
  const std::size_t inner = n;
  const std::size_t cols = p;

  for (std::size_t i = 0; i < rows; ++i) {
    for (std::size_t j = 0; j < cols; ++j) {
      for (std::size_t k = 0; k < inner; ++k) {
        C[i * cols + j] += A[i * inner + k] * B[k * cols + j];
      }
    }
  }
}
// Cache-blocked (tiled) matrix product: C += A * B.
//
// Same contract as the plain triple loop: A is m x n, B is n x p, C is
// m x p, all row-major, and the product is accumulated onto the existing
// contents of C. The iteration space is cut into tiles of at most
// block_size x block_size x block_size; edge tiles are clipped to the matrix
// bounds, so the dimensions need not be multiples of block_size.
//
// For a fixed (i, j) the k index is still visited in ascending order across
// tiles, so each element of C receives its terms in the same order as the
// unblocked loop.
//
// Parameters:
//   C           accumulator/output, m * p floats
//   A           left operand, m * n floats (only read)
//   B           right operand, n * p floats (only read)
//   m, n, p     matrix dimensions as described above
//   block_size  tile edge length; 0 means "no tiling" (one tile covering the
//               whole problem) instead of looping forever
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
                    uint32_t p, uint32_t block_size) {
  // All index arithmetic is done in std::size_t so flat offsets cannot wrap
  // at 32 bits.
  const std::size_t rows = m;
  const std::size_t inner = n;
  const std::size_t cols = p;

  // A zero tile width would never advance the tile loops. Interpret it as a
  // single tile spanning the largest dimension.
  const std::size_t tile = block_size != 0
                               ? static_cast<std::size_t>(block_size)
                               : std::max(rows, std::max(inner, cols));
  if (tile == 0) {
    return;  // every dimension is zero: nothing to compute
  }

  // Each tile loop jumps straight to the end of the tile it just processed.
  // The end is computed as start + min(tile, remaining), which cannot
  // overflow because start < bound.
  for (std::size_t ii = 0; ii < rows;) {
    const std::size_t i_end = ii + std::min(tile, rows - ii);
    for (std::size_t jj = 0; jj < cols;) {
      const std::size_t j_end = jj + std::min(tile, cols - jj);
      for (std::size_t kk = 0; kk < inner;) {
        const std::size_t k_end = kk + std::min(tile, inner - kk);
        // C[ii:i_end, jj:j_end] += A[ii:i_end, kk:k_end] * B[kk:k_end, jj:j_end]
        for (std::size_t i = ii; i < i_end; ++i) {
          for (std::size_t j = jj; j < j_end; ++j) {
            for (std::size_t k = kk; k < k_end; ++k) {
              C[i * cols + j] += A[i * inner + k] * B[k * cols + j];
            }
          }
        }
        kk = k_end;
      }
      jj = j_end;
    }
    ii = i_end;
  }
}
// OpenMP-parallel matrix product: C += A * B.
//
// A is m x n, B is n x p, C is m x p, all dense row-major. The product is
// accumulated onto the existing contents of C. The outermost (row) loop
// carries the `omp parallel for` directive; every iteration writes only to
// row i of C, so iterations do not write to the same elements.
// When the file is compiled without OpenMP support the pragma is ignored and
// the function runs as an ordinary serial triple loop.
//
// Parameters:
//   C  accumulator/output, m * p floats
//   A  left operand, m * n floats (only read)
//   B  right operand, n * p floats (only read)
//   m  rows of A and C
//   n  columns of A, rows of B
//   p  columns of B and C
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
                     uint32_t p) {
  const std::size_t inner = n;
  const std::size_t cols = p;
  // The work-shared loop uses a signed 64-bit counter: it can represent every
  // uint32_t row count (an int cannot) and is an integer type the OpenMP
  // canonical loop form accepts.
  const std::int64_t rows = static_cast<std::int64_t>(m);

#pragma omp parallel for
  for (std::int64_t i = 0; i < rows; ++i) {
    // Flat offsets are computed in std::size_t so they cannot wrap at 32 bits.
    const std::size_t row = static_cast<std::size_t>(i);
    for (std::size_t j = 0; j < cols; ++j) {
      for (std::size_t k = 0; k < inner; ++k) {
        C[row * cols + j] += A[row * inner + k] * B[k * cols + j];
      }
    }
  }
}
// Compares a result matrix file against a reference matrix file.
//
// Both files are read as text: two integers (rows, columns) followed by
// rows * columns floating-point values separated by whitespace. Values are
// compared for exact equality after parsing; no tolerance is applied.
//
// Returns true only if both files could be opened, both headers parsed, the
// dimensions are equal and non-negative, every expected element could be read
// from both files, and all elements compare equal. Any I/O or parse problem
// is reported on std::cerr and yields false, so a missing or truncated file
// can never be mistaken for a successful validation. The first differing
// element is reported on std::cout.
//
// Parameters:
//   result_file     path of the file produced by this program
//   reference_file  path of the expected output
bool validate_result(const std::string &result_file,
                     const std::string &reference_file) {
  std::ifstream res_file(result_file);
  std::ifstream ref_file(reference_file);
  if (!res_file.is_open()) {
    std::cerr << "Could not open result file " << result_file << std::endl;
    return false;
  }
  if (!ref_file.is_open()) {
    std::cerr << "Could not open reference file " << reference_file
              << std::endl;
    return false;
  }

  int res_dimsM = 0, res_dimsN = 0;
  int ref_dimsM = 0, ref_dimsN = 0;
  // A failed extraction stores 0, so without this check two unreadable
  // headers would look like two matching 0 x 0 matrices.
  if (!(res_file >> res_dimsM >> res_dimsN)) {
    std::cerr << "Could not read dimensions from " << result_file << std::endl;
    return false;
  }
  if (!(ref_file >> ref_dimsM >> ref_dimsN)) {
    std::cerr << "Could not read dimensions from " << reference_file
              << std::endl;
    return false;
  }
  if (res_dimsM != ref_dimsM || res_dimsN != ref_dimsN) {
    return false;
  }
  if (res_dimsM < 0 || res_dimsN < 0) {
    std::cerr << "Negative matrix dimensions in " << result_file << std::endl;
    return false;
  }

  // The element count is computed in 64 bits; int * int could overflow.
  const long long total =
      static_cast<long long>(res_dimsM) * static_cast<long long>(res_dimsN);
  for (long long i = 0; i < total; ++i) {
    float curr_res = 0.0f;
    float curr_ref = 0.0f;
    if (!(res_file >> curr_res)) {
      std::cerr << "Could not read element " << i << " from " << result_file
                << std::endl;
      return false;
    }
    if (!(ref_file >> curr_ref)) {
      std::cerr << "Could not read element " << i << " from "
                << reference_file << std::endl;
      return false;
    }
    if (curr_res != curr_ref) {
      std::cout << "Got: " << curr_res << " Expected:" << curr_ref
                << " at index = " << i << std::endl;
      return false;
    }
  }
  return true;
}
// Writes a row-major float matrix to a text file.
//
// Output format: a header line "M N", then one line per row with the values
// of that row separated by single spaces. Values are printed in fixed
// notation with two decimals.
//
// Parameters:
//   filename  path of the file to create or overwrite
//   data      M * N floats, row-major
//   M         number of rows
//   N         number of columns
//
// Throws std::runtime_error if the file cannot be opened or if writing
// fails, so that a stale or partial result file is not silently left behind.
void write_float_matrix(const std::string filename, const float *data, int M,
                        int N) {
  std::ofstream file(filename);
  if (!file) {
    throw std::runtime_error("Could not open " + filename + " for writing");
  }

  file << M << " " << N << "\n";

  // The float formatting flags are sticky, so they are set once here rather
  // than before every element.
  file << std::fixed << std::setprecision(2);

  // Non-positive dimensions produce no elements. Indices are std::size_t so
  // that row * cols + col cannot overflow int.
  const std::size_t rows = M > 0 ? static_cast<std::size_t>(M) : 0;
  const std::size_t cols = N > 0 ? static_cast<std::size_t>(N) : 0;
  for (std::size_t row = 0; row < rows; ++row) {
    for (std::size_t col = 0; col < cols; ++col) {
      file << data[row * cols + col];
      // Separator between values, but not after the last one of the row.
      if (col + 1 < cols) {
        file << " ";
      }
    }
    file << "\n";
  }

  file.close();
  if (file.fail()) {
    throw std::runtime_error("Failed while writing " + filename);
  }
}
// Reads a float matrix from a text file into a freshly allocated buffer.
//
// Expected format: two integers M and N (rows, columns) followed by M * N
// floating-point values, all separated by arbitrary whitespace.
//
// Parameters:
//   filename  path of the file to read
//   data_ptr  out-parameter; on success receives a calloc-allocated,
//             row-major buffer of M * N floats that the caller must release
//             with free(). It is left untouched if the function throws.
//
// Returns {M, N}.
//
// Throws std::runtime_error if the file cannot be opened, the header is
// missing or malformed, a dimension is negative, the element count does not
// fit in memory, or fewer than M * N values can be parsed. The buffer is
// released before any exception leaves the function.
std::pair<int, int> read_float_matrix(const std::string filename,
                                      float **data_ptr) {
  std::ifstream file(filename);
  if (!file) {
    throw std::runtime_error("Could not open matrix file " + filename);
  }

  int M = 0;
  int N = 0;
  // Without this check a missing or malformed header would be read as 0 x 0
  // and reported as a successful, empty matrix.
  if (!(file >> M >> N)) {
    throw std::runtime_error("Could not read matrix dimensions from " +
                             filename);
  }
  if (M < 0 || N < 0) {
    throw std::runtime_error("Negative matrix dimensions in " + filename);
  }

  const std::size_t rows = static_cast<std::size_t>(M);
  const std::size_t cols = static_cast<std::size_t>(N);
  // Guard the rows * cols product (and the byte count derived from it)
  // against wrapping on platforms with a narrow std::size_t.
  if (cols != 0 &&
      rows > std::numeric_limits<std::size_t>::max() / sizeof(float) / cols) {
    throw std::runtime_error("Matrix in " + filename + " is too large");
  }
  const std::size_t total = rows * cols;

  float *data = static_cast<float *>(std::calloc(total, sizeof(float)));
  // calloc may legitimately return a null pointer for a zero-sized request.
  if (data == nullptr && total != 0) {
    throw std::runtime_error("Out of memory while reading " + filename);
  }

  for (std::size_t i = 0; i < total; ++i) {
    // Formatted extraction skips leading whitespace (including newlines) and
    // converts the next token to float.
    if (!(file >> data[i])) {
      std::free(data);  // do not leak the buffer on the error path
      throw std::runtime_error("Parse error or unexpected EOF at element " +
                               std::to_string(i));
    }
  }

  *data_ptr = data;
  return {M, N};
}
// Benchmark driver.
//
// Usage: <program> <case_number>, with case_number in [0, 9].
//
// Reads data/<case>/input0.raw (A) and data/<case>/input1.raw (B), multiplies
// them with the naive, blocked (block size 32) and OpenMP-parallel kernels,
// and after each kernel writes the product to data/<case>/result.raw and
// validates it against data/<case>/output.raw. Timings and speedups are
// printed to stdout and one "naive;blocked;parallel" line is appended to
// timings.csv in the working directory.
//
// Returns 1 on a usage error, unreadable or incompatible inputs, allocation
// failure, or an exception during processing; 0 otherwise. A failed
// validation is reported on stderr but does not change the exit code.
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
    return 1;
  }

  // strtol with an end pointer rejects non-numeric arguments and trailing
  // garbage, which atoi would silently turn into a valid-looking number.
  char *parse_end = nullptr;
  const long parsed_case = std::strtol(argv[1], &parse_end, 10);
  if (parse_end == argv[1] || *parse_end != '\0' || parsed_case < 0 ||
      parsed_case > 9) {
    std::cerr << "Case number must be between 0 and 9" << std::endl;
    return 1;
  }
  const int case_number = static_cast<int>(parsed_case);

  // Construct file paths
  const std::string folder = "data/" + std::to_string(case_number) + "/";
  const std::string input0_file = folder + "input0.raw";
  const std::string input1_file = folder + "input1.raw";
  const std::string result_file = folder + "result.raw";
  const std::string reference_file = folder + "output.raw";

  // Load both operands. The reader reports malformed input by throwing, so
  // the calls are guarded to give a readable message instead of terminating.
  float *A = nullptr;
  float *B = nullptr;
  std::pair<int, int> A_size;
  std::pair<int, int> B_size;
  try {
    A_size = read_float_matrix(input0_file, &A);
    B_size = read_float_matrix(input1_file, &B);
  } catch (const std::exception &err) {
    std::cerr << "Failed to read input matrices for case " << case_number
              << ": " << err.what() << std::endl;
    std::free(A);
    std::free(B);
    return 1;
  }

  const int m = A_size.first;
  const int n = A_size.second;
  const int p = B_size.second;

  // The kernels index B as an n x p matrix; a mismatching row count (or a
  // negative dimension) would make them read out of bounds.
  if (m < 0 || n < 0 || p < 0 || B_size.first != n) {
    std::cerr << "Incompatible matrix dimensions for case " << case_number
              << ": A is " << m << "x" << n << ", B is " << B_size.first
              << "x" << p << std::endl;
    std::free(A);
    std::free(B);
    return 1;
  }

  // Allocate memory for result matrices.
  // NOTE: calloc zero-initialises the buffers, which the kernels rely on
  // because they accumulate into C with +=. The element count is computed in
  // std::size_t so that m * p cannot overflow int.
  const std::size_t c_count =
      static_cast<std::size_t>(m) * static_cast<std::size_t>(p);
  float *C_naive = static_cast<float *>(std::calloc(c_count, sizeof(float)));
  float *C_blocked = static_cast<float *>(std::calloc(c_count, sizeof(float)));
  float *C_parallel =
      static_cast<float *>(std::calloc(c_count, sizeof(float)));
  if (c_count != 0 && (C_naive == nullptr || C_blocked == nullptr ||
                       C_parallel == nullptr)) {
    std::cerr << "Out of memory allocating result matrices for case "
              << case_number << std::endl;
    std::free(A);
    std::free(B);
    std::free(C_naive);
    std::free(C_blocked);
    std::free(C_parallel);
    return 1;
  }

  int exit_code = 0;
  try {
    // Measure performance of naive_matmul
    double start_time = omp_get_wtime();
    naive_matmul(C_naive, A, B, m, n, p);
    const double naive_time = omp_get_wtime() - start_time;
    write_float_matrix(result_file, C_naive, m, p);

    // Validate naive result
    if (!validate_result(result_file, reference_file)) {
      std::cerr << "Naive result validation failed for case " << case_number
                << std::endl;
    }

    // Measure performance of blocked_matmul (use block_size = 32 as default)
    start_time = omp_get_wtime();
    blocked_matmul(C_blocked, A, B, m, n, p, 32);
    const double blocked_time = omp_get_wtime() - start_time;
    write_float_matrix(result_file, C_blocked, m, p);

    // Validate blocked result
    if (!validate_result(result_file, reference_file)) {
      std::cerr << "Blocked result validation failed for case " << case_number
                << std::endl;
    }

    // Measure performance of parallel_matmul
    start_time = omp_get_wtime();
    parallel_matmul(C_parallel, A, B, m, n, p);
    const double parallel_time = omp_get_wtime() - start_time;
    write_float_matrix(result_file, C_parallel, m, p);

    // Validate parallel result
    if (!validate_result(result_file, reference_file)) {
      std::cerr << "Parallel result validation failed for case "
                << case_number << std::endl;
    }

    // Print performance results. std::fixed and the precision are sticky, so
    // setting them once formats every following floating-point value.
    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p
              << "):\n";
    std::cout << std::fixed << std::setprecision(8);
    std::cout << "Naive time: " << naive_time << " seconds\n";
    std::cout << "Blocked time: " << blocked_time << " seconds\n";
    std::cout << "Parallel time: " << parallel_time << " seconds\n";
    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
    std::cout << "Parallel speedup: " << (naive_time / parallel_time)
              << "x\n";

    // Append timing results to CSV
    std::ofstream csv_file("timings.csv", std::ios::app);
    csv_file << std::fixed << naive_time << ";" << blocked_time << ";"
             << parallel_time << "\n";
    csv_file.close();
  } catch (const std::exception &err) {
    std::cerr << "Error while processing case " << case_number << ": "
              << err.what() << std::endl;
    exit_code = 1;
  }

  // Clean up
  // NOTE: every buffer here comes from calloc, so it must be released with
  // free, never with delete.
  std::free(A);
  std::free(B);
  std::free(C_naive);
  std::free(C_blocked);
  std::free(C_parallel);
  return exit_code;
}