// Assignment-4-Optional/main.cpp at d4260e552e80c8d8b4395f349f1f48bb779dc2bc
// (AA-parallel-computing/Assignment-4-Optional on GitHub)
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <omp.h>
#include <pthread.h>
#include <stdexcept>
#include <string>

// Textbook triple-loop matrix multiply: C += A * B.
//
// All matrices are dense and indexed row-major:
//   A is m x n, B is n x p, C is m x p.
// The product is accumulated INTO C (note the +=), so C must be zeroed by the
// caller if the plain product is wanted.
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
                  uint32_t p) {
  // Widen the dimensions once. With 32-bit operands an index such as
  // i * p + j wraps around as soon as the matrix holds more than 2^32
  // elements; in 64-bit arithmetic the product of two 32-bit values always
  // fits.
  const std::uint64_t rows = m;
  const std::uint64_t inner = n;
  const std::uint64_t cols = p;

  for (std::uint64_t i = 0; i < rows; ++i) {
    for (std::uint64_t j = 0; j < cols; ++j) {
      for (std::uint64_t k = 0; k < inner; ++k) {
        C[i * cols + j] += A[i * inner + k] * B[k * cols + j];
      }
    }
  }
}

// Tiled matrix multiply: C += A * B, processed in block_size x block_size
// tiles.
//
// All matrices are dense and indexed row-major:
//   A is m x n, B is n x p, C is m x p.
// The product is accumulated INTO C (note the +=), so C must be zeroed by the
// caller if the plain product is wanted. Dimensions do not have to be
// multiples of block_size; the last tile in each direction is clipped.
//
// Throws std::invalid_argument if block_size is 0, because a zero step would
// never advance the tile loops.
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
                    uint32_t p, uint32_t block_size) {
  if (block_size == 0) {
    throw std::invalid_argument("blocked_matmul: block_size must be > 0");
  }

  // 64-bit counters: neither "tile start + block_size" nor the flattened
  // index i * p + j can wrap around, which they can in 32-bit arithmetic.
  const std::uint64_t rows = m;
  const std::uint64_t inner = n;
  const std::uint64_t cols = p;
  const std::uint64_t step = block_size;

  for (std::uint64_t ii = 0; ii < rows; ii += step) {
    const std::uint64_t i_end = ii + std::min(step, rows - ii);
    for (std::uint64_t jj = 0; jj < cols; jj += step) {
      const std::uint64_t j_end = jj + std::min(step, cols - jj);
      for (std::uint64_t kk = 0; kk < inner; kk += step) {
        const std::uint64_t k_end = kk + std::min(step, inner - kk);
        // Process one tile:
        //   C[ii:i_end, jj:j_end] += A[ii:i_end, kk:k_end] * B[kk:k_end, jj:j_end]
        for (std::uint64_t i = ii; i < i_end; ++i) {
          for (std::uint64_t j = jj; j < j_end; ++j) {
            for (std::uint64_t k = kk; k < k_end; ++k) {
              C[i * cols + j] += A[i * inner + k] * B[k * cols + j];
            }
          }
        }
      }
    }
  }
}

// Triple-loop matrix multiply with the outer (row) loop shared between
// OpenMP threads: C += A * B.
//
// All matrices are dense and indexed row-major:
//   A is m x n, B is n x p, C is m x p.
// The product is accumulated INTO C (note the +=), so C must be zeroed by the
// caller if the plain product is wanted. Iteration i of the outer loop only
// writes elements C[i * p + j], i.e. row i of C, so different iterations
// never write the same element.
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
                     uint32_t p) {
  // The loop counter is a signed 64-bit integer: it can hold every uint32_t
  // row count without overflow. The flattened indices are computed in
  // unsigned 64-bit arithmetic so that i * p + j cannot wrap around for
  // matrices with more than 2^32 elements.
  const std::int64_t rows = m;
  const std::uint64_t inner = n;
  const std::uint64_t cols = p;

#pragma omp parallel for
  for (std::int64_t i = 0; i < rows; ++i) {
    const std::uint64_t row = static_cast<std::uint64_t>(i);
    for (std::uint64_t j = 0; j < cols; ++j) {
      for (std::uint64_t k = 0; k < inner; ++k) {
        C[row * cols + j] += A[row * inner + k] * B[k * cols + j];
      }
    }
  }
}

// Compares a result matrix file against a reference matrix file.
//
// Both files are text: two integers (rows, columns) followed by rows * columns
// whitespace-separated floats. Returns true only if both headers match and
// every pair of parsed values compares equal with ==.
//
// Returns false (after a message on stderr) when a file cannot be opened, a
// header cannot be parsed, a dimension is negative, or either file runs out
// of parseable values early. The first mismatching element is reported on
// stdout.
bool validate_result(const std::string &result_file,
                     const std::string &reference_file) {
  std::ifstream res_file(result_file);
  std::ifstream ref_file(reference_file);
  if (!res_file.is_open() || !ref_file.is_open()) {
    std::cerr << "Could not open " << result_file << " and/or "
              << reference_file << " for validation" << std::endl;
    return false;
  }

  int res_dimsM = 0, ref_dimsM = 0;
  int res_dimsN = 0, ref_dimsN = 0;
  if (!(res_file >> res_dimsM >> res_dimsN) ||
      !(ref_file >> ref_dimsM >> ref_dimsN)) {
    std::cerr << "Could not read matrix dimensions for validation"
              << std::endl;
    return false;
  }
  if (res_dimsM != ref_dimsM || res_dimsN != ref_dimsN) {
    return false;
  }
  if (res_dimsM < 0 || res_dimsN < 0) {
    std::cerr << "Negative matrix dimension in validation input" << std::endl;
    return false;
  }

  // Element count in 64 bits: the product of two ints can overflow int.
  const long long total =
      static_cast<long long>(res_dimsM) * static_cast<long long>(res_dimsN);
  for (long long i = 0; i < total; ++i) {
    float curr_res = 0.0f;
    float curr_ref = 0.0f;
    // Once a stream has failed, later extractions leave the target untouched,
    // so without this check two truncated files would keep comparing stale
    // values and could be reported as equal.
    if (!(res_file >> curr_res) || !(ref_file >> curr_ref)) {
      std::cerr << "Parse error or unexpected EOF at index = " << i
                << std::endl;
      return false;
    }
    if (curr_res != curr_ref) {
      std::cout << "Got: " << curr_res << " Expected:" << curr_ref
                << " at index = " << i << std::endl;
      return false;
    }
  }
  return true;
}

// Writes an M x N row-major float matrix to a text file.
//
// Format: a header line "M N", then one line per row with the values
// separated by single spaces. Values are written in fixed notation with two
// decimals to match the input file format.
//
// Throws std::runtime_error if the file cannot be opened or if writing fails.
void write_float_matrix(const std::string filename, const float *data, int M,
                        int N) {
  std::ofstream file(filename);
  if (!file.is_open()) {
    throw std::runtime_error("Could not open " + filename + " for writing");
  }

  file << M << " " << N << "\n";
  // std::fixed and the precision stay in effect on the stream until changed,
  // so they are set once here instead of before every element.
  file << std::fixed << std::setprecision(2);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      // Flattened index computed in size_t so that i * N cannot overflow int.
      const std::size_t index =
          static_cast<std::size_t>(i) * static_cast<std::size_t>(N) +
          static_cast<std::size_t>(j);
      file << data[index];
      // Separate values with a space, but do not leave one trailing at the
      // end of the row.
      if (j < N - 1)
        file << " ";
    }
    file << "\n";
  }

  // close() flushes the buffer; a failed flush is reported through the
  // stream state.
  file.close();
  if (!file) {
    throw std::runtime_error("Failed while writing " + filename);
  }
}

// Reads a float matrix from a text file.
//
// Format: two integers M and N (rows, columns) followed by M * N
// whitespace-separated floats in row-major order.
//
// On success *data_ptr receives a calloc'ed buffer of M * N floats that the
// caller must release with free(), and {M, N} is returned. On failure
// *data_ptr is left untouched and nothing is leaked.
//
// Throws std::runtime_error if the file cannot be opened, the header is
// missing or negative, memory cannot be allocated, or fewer than M * N values
// can be parsed.
std::pair<int, int> read_float_matrix(const std::string filename,
                                      float **data_ptr) {
  std::ifstream file(filename);
  if (!file.is_open()) {
    throw std::runtime_error("Could not open " + filename);
  }

  int M = 0;
  int N = 0;
  if (!(file >> M >> N)) {
    throw std::runtime_error("Could not read matrix dimensions from " +
                             filename);
  }
  if (M < 0 || N < 0) {
    throw std::runtime_error("Negative matrix dimension in " + filename);
  }

  // Element count in size_t: the product of two ints can overflow int.
  const std::size_t total =
      static_cast<std::size_t>(M) * static_cast<std::size_t>(N);
  float *data = static_cast<float *>(std::calloc(total, sizeof(float)));
  // calloc may legitimately return a null pointer for a zero-sized request.
  if (data == nullptr && total != 0) {
    throw std::runtime_error("Could not allocate memory for " + filename);
  }

  for (std::size_t i = 0; i < total; ++i) {
    // operator>> skips any leading whitespace (spaces and newlines alike) and
    // converts the next token to float.
    if (!(file >> data[i])) {
      // Release the buffer before throwing; no one else owns it yet.
      std::free(data);
      throw std::runtime_error("Parse error or unexpected EOF at element " +
                               std::to_string(i));
    }
  }

  *data_ptr = data;
  return {M, N};
}

// Program entry point: runs the naive, blocked and parallel matrix
// multiplications on one test case, validates each result against the
// reference output and records the timings.
//
// Usage: <program> <case_number>, with case_number in 0..9. The case selects
// the directory data/<case_number>/, which must contain input0.raw (A),
// input1.raw (B) and output.raw (reference product). Each variant's product
// is written to result.raw in that directory, overwriting the previous one,
// and then compared with output.raw. Timings are printed to stdout and
// appended to timings.csv.
//
// Returns 1 on a usage error, unreadable input, incompatible operand shapes,
// allocation failure or any other exception; returns 0 otherwise. Validation
// failures are reported on stderr but do not change the exit code.
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
    return 1;
  }

  // strtol instead of atoi: atoi returns 0 for non-numeric text, which would
  // silently select case 0. Here the whole argument has to be an integer.
  char *parse_end = nullptr;
  const long parsed_case = std::strtol(argv[1], &parse_end, 10);
  const bool is_whole_number = parse_end != argv[1] && *parse_end == '\0';
  if (!is_whole_number || parsed_case < 0 || parsed_case > 9) {
    std::cerr << "Case number must be between 0 and 9" << std::endl;
    return 1;
  }
  const int case_number = static_cast<int>(parsed_case);

  // Construct file paths
  const std::string folder = "data/" + std::to_string(case_number) + "/";
  const std::string input0_file = folder + "input0.raw";
  const std::string input1_file = folder + "input1.raw";
  const std::string result_file = folder + "result.raw";
  const std::string reference_file = folder + "output.raw";

  // All buffers start out null so that release_all() is safe to call from
  // any error path: free(nullptr) is a no-op.
  float *A = nullptr;
  float *B = nullptr;
  float *C_naive = nullptr;
  float *C_blocked = nullptr;
  float *C_parallel = nullptr;
  // The buffers come from calloc, so they must be released with free (not
  // delete).
  const auto release_all = [&]() {
    std::free(A);
    std::free(B);
    std::free(C_naive);
    std::free(C_blocked);
    std::free(C_parallel);
  };

  try {
    const std::pair<int, int> A_size = read_float_matrix(input0_file, &A);
    const std::pair<int, int> B_size = read_float_matrix(input1_file, &B);
    const int m = A_size.first;
    const int n = A_size.second;
    const int p = B_size.second;

    if (m < 0 || n < 0 || B_size.first < 0 || p < 0) {
      std::cerr << "Matrix dimensions must be non-negative" << std::endl;
      release_all();
      return 1;
    }
    // The kernels index B as an n x p matrix; if B has a different number of
    // rows they would read past the end of its buffer.
    if (B_size.first != n) {
      std::cerr << "Dimension mismatch: A is " << m << "x" << n
                << " but B is " << B_size.first << "x" << p << std::endl;
      release_all();
      return 1;
    }

    // Allocate memory for result matrices
    // NOTE: calloc zero-initialises the memory, which the kernels rely on
    // because they accumulate into C with +=.
    const std::size_t result_count =
        static_cast<std::size_t>(m) * static_cast<std::size_t>(p);
    C_naive = static_cast<float *>(std::calloc(result_count, sizeof(float)));
    C_blocked = static_cast<float *>(std::calloc(result_count, sizeof(float)));
    C_parallel =
        static_cast<float *>(std::calloc(result_count, sizeof(float)));
    if (result_count != 0 &&
        (C_naive == nullptr || C_blocked == nullptr ||
         C_parallel == nullptr)) {
      std::cerr << "Could not allocate memory for the result matrices"
                << std::endl;
      release_all();
      return 1;
    }

    // Writes one result to result_file and compares it with the reference.
    const auto check_result = [&](const float *C, const char *label) {
      write_float_matrix(result_file, C, m, p);
      if (!validate_result(result_file, reference_file)) {
        std::cerr << label << " result validation failed for case "
                  << case_number << std::endl;
      }
    };

    // Measure performance of naive_matmul
    double start_time = omp_get_wtime();
    naive_matmul(C_naive, A, B, m, n, p);
    const double naive_time = omp_get_wtime() - start_time;
    check_result(C_naive, "Naive");

    // Measure performance of blocked_matmul (use block_size = 32 as default)
    start_time = omp_get_wtime();
    blocked_matmul(C_blocked, A, B, m, n, p, 32);
    const double blocked_time = omp_get_wtime() - start_time;
    check_result(C_blocked, "Blocked");

    // Measure performance of parallel_matmul
    start_time = omp_get_wtime();
    parallel_matmul(C_parallel, A, B, m, n, p);
    const double parallel_time = omp_get_wtime() - start_time;
    check_result(C_parallel, "Parallel");

    // Print performance results
    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p
              << "):\n";
    // The format flags stay in effect on std::cout, so set them once.
    std::cout << std::fixed << std::setprecision(8);
    std::cout << "Naive time: " << naive_time << " seconds\n";
    std::cout << "Blocked time: " << blocked_time << " seconds\n";
    std::cout << "Parallel time: " << parallel_time << " seconds\n";
    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
    std::cout << "Parallel speedup: " << (naive_time / parallel_time)
              << "x\n";

    // Append timing results to CSV
    std::ofstream csv_file("timings.csv", std::ios::app);
    csv_file << std::fixed << naive_time << ";" << blocked_time << ";"
             << parallel_time << "\n";
    csv_file.close();
  } catch (const std::exception &error) {
    // E.g. read_float_matrix throws std::runtime_error on a parse error.
    std::cerr << "Error: " << error.what() << std::endl;
    release_all();
    return 1;
  }

  // Clean up
  release_all();
  return 0;
}