diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3a1ad18
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/
+data/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b04fd0..156eaeb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,10 +2,12 @@ cmake_minimum_required(VERSION 3.10)
 
 project(MatrixMultiplication)
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 23)
 
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 find_package(OpenMP REQUIRED)
 
 if(OpenMP_CXX_FOUND)
@@ -17,9 +19,10 @@ if(APPLE)
 endif()
 
 
-add_executable(matmul main_ans.cpp)
+add_executable(matmul main.cpp)
+target_compile_options(matmul PRIVATE -g -O3)
 
 
 if(OpenMP_CXX_FOUND)
     target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
-endif()
\ No newline at end of file
+endif()
diff --git a/README.md b/README.md
index 51c7f2a..b2432bd 100644
--- a/README.md
+++ b/README.md
@@ -235,3 +235,11 @@ git push origin student-name
     - Use small test cases to debug your blocked and parallel implementations.
 
 Good luck, and enjoy optimizing your matrix multiplication!
+
+## Report
+
+The table of results is in `timings.csv` and also `report.pdf`,
+please read report.pdf, it has a table and graph of the results.
+
+`report.typ` has the plaintext report before compilation. It depends on
+my personal assignment template so it is not of much use.
diff --git a/main.cpp b/main.cpp
index 65bf108..c92e7a8 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,114 +1,237 @@
-#include <iostream>
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
 #include <fstream>
-#include <string>
+#include <iomanip>
+#include <iostream>
 #include <omp.h>
-#include <cmath>
-
-void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
-}
-
-void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
-    // A is m x n, B is n x p, C is m x p
-    // Use block_size to divide matrices into submatrices
-}
-
-void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
-}
+#include <pthread.h>
+#include <stdexcept>
+#include <string>
 
-bool validate_result(const std::string &result_file, const std::string &reference_file) {
-   //TODO : Implement result validation
+void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
+                  uint32_t p) {
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < p; j++) {
+      for (int k = 0; k < n; k++) {
+        C[i * p + j] += A[i * n + k] * B[k * p + j];
+      }
+    }
+  }
 }
 
-int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
-        return 1;
+void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
+                    uint32_t p, uint32_t block_size) {
+  for (int ii = 0; ii < m; ii += block_size) {
+    for (int jj = 0; jj < p; jj += block_size) {
+      for (int kk = 0; kk < n; kk += block_size) {
+        // Process block: C[ii:ii+block_size, jj:jj+block_size] +=
+        // A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size,
+        // jj:jj+block_size]
+        for (int i = ii; i < std::min(ii + block_size, m); i++) {
+          for (int j = jj; j < std::min(jj + block_size, p); j++) {
+            for (int k = kk; k < std::min(kk + block_size, n); k++) {
+              C[i * p + j] += A[i * n + k] * B[k * p + j];
+            }
+          }
+        }
+      }
     }
+  }
+}
 
-    int case_number = std::atoi(argv[1]);
-    if (case_number < 0 || case_number > 9) {
-        std::cerr << "Case number must be between 0 and 9" << std::endl;
-        return 1;
+void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
+                     uint32_t p) {
+#pragma omp parallel for
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < p; j++) {
+      for (int k = 0; k < n; k++) {
+        C[i * p + j] += A[i * n + k] * B[k * p + j];
+      }
     }
+  }
+}
 
-    // Construct file paths
-    std::string folder = "data/" + std::to_string(case_number) + "/";
-    std::string input0_file = folder + "input0.raw";
-    std::string input1_file = folder + "input1.raw";
-    std::string result_file = folder + "result.raw";
-    std::string reference_file = folder + "output.raw";
-
-    // TODO Read input0.raw (matrix A)
-
-
-    // TODO Read input1.raw (matrix B)
-
-
-    // Allocate memory for result matrices
-    float *C_naive = new float[m * p];
-    float *C_blocked = new float[m * p];
-    float *C_parallel = new float[m * p];
-
-    // Measure performance of naive_matmul
-    double start_time = omp_get_wtime();
-    naive_matmul(C_naive, A, B, m, n, p);
-    double naive_time = omp_get_wtime() - start_time;
-
-    // TODO Write naive result to file
-
-
-    // Validate naive result
-    bool naive_correct = validate_result(result_file, reference_file);
-    if (!naive_correct) {
-        std::cerr << "Naive result validation failed for case " << case_number << std::endl;
+bool validate_result(const std::string &result_file,
+                     const std::string &reference_file) {
+  std::ifstream res_file(result_file);
+  std::ifstream ref_file(reference_file);
+
+  int res_dimsM, ref_dimsM;
+  int res_dimsN, ref_dimsN;
+  float curr_res, curr_ref;
+
+  res_file >> res_dimsM;
+  ref_file >> ref_dimsM;
+  res_file >> res_dimsN;
+  ref_file >> ref_dimsN;
+  if (res_dimsM != ref_dimsM || res_dimsN != ref_dimsN) {
+    return false;
+  }
+  int M = res_dimsM;
+  int N = res_dimsN;
+  const int total = M * N;
+  for (int i = 0; i < total; ++i) {
+    res_file >> curr_res;
+    ref_file >> curr_ref;
+    if (curr_res != curr_ref) {
+      std::cout << "Got: " << curr_res << " Expected:" << curr_ref
+                << " at index = " << i << std::endl;
+      return false;
     }
+  }
+  return true;
+}
 
-    // Measure performance of blocked_matmul (use block_size = 32 as default)
-    start_time = omp_get_wtime();
-    blocked_matmul(C_blocked, A, B, m, n, p, 32);
-    double blocked_time = omp_get_wtime() - start_time;
-
-    // TODO Write blocked result to file
-
-
-    // Validate blocked result
-    bool blocked_correct = validate_result(result_file, reference_file);
-    if (!blocked_correct) {
-        std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
+void write_float_matrix(const std::string filename, const float *data, int M,
+                        int N) {
+  std::ofstream file(filename);
+  std::string result;
+  file << M << " " << N << "\n";
+  std::stringstream ss;
+  for (int i = 0; i < M; ++i) {
+    for (int j = 0; j < N; ++j) {
+      // setting the float precision explicitly to be 2 decimals to match input
+      // file format
+      file << std::fixed << std::setprecision(2) << data[i * N + j];
+      // Add whitespaces as long as we have not hit a new row of the matrix
+      if (j < N - 1)
+        file << " ";
     }
+    file << "\n";
+  }
+  file.close();
+}
 
-    // Measure performance of parallel_matmul
-    start_time = omp_get_wtime();
-    parallel_matmul(C_parallel, A, B, m, n, p);
-    double parallel_time = omp_get_wtime() - start_time;
-
-    // TODO Write parallel result to file
-
-
-    // Validate parallel result
-    bool parallel_correct = validate_result(result_file, reference_file);
-    if (!parallel_correct) {
-        std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
+std::pair<int, int> read_float_matrix(const std::string filename,
+                                      float **data_ptr) {
+  std::ifstream file(filename);
+  int M, N;
+  file >> M;
+  file >> N;
+  float *data = (float *)calloc(M * N, sizeof(float));
+
+  const int total = M * N;
+  for (int i = 0; i < total; ++i) {
+    // the insanity of c++...
+    // the overloaded >> operator implicitly skips all types of whitespace.
+    // Oh, and since the type of data is float* it correctly converts to float
+    if (!(file >> data[i])) {
+      throw std::runtime_error("Parse error or unexpected EOF at element " +
+                               std::to_string(i));
     }
+  }
+  *data_ptr = data;
+  file.close();
+  return {M, N};
+}
 
-    // Print performance results
-    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
-    std::cout << "Naive time: " << naive_time << " seconds\n";
-    std::cout << "Blocked time: " << blocked_time << " seconds\n";
-    std::cout << "Parallel time: " << parallel_time << " seconds\n";
-    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
-    std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
-
-    // Clean up
-    delete[] A;
-    delete[] B;
-    delete[] C_naive;
-    delete[] C_blocked;
-    delete[] C_parallel;
-
-    return 0;
-}
\ No newline at end of file
+int main(int argc, char *argv[]) {
+  if (argc != 2) {
+    std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
+    return 1;
+  }
+
+  int case_number = std::atoi(argv[1]);
+  if (case_number < 0 || case_number > 9) {
+    std::cerr << "Case number must be between 0 and 9" << std::endl;
+    return 1;
+  }
+
+  // Construct file paths
+  std::string folder = "data/" + std::to_string(case_number) + "/";
+  std::string input0_file = folder + "input0.raw";
+  std::string input1_file = folder + "input1.raw";
+  std::string result_file = folder + "result.raw";
+  std::string reference_file = folder + "output.raw";
+
+  float *data_ptr;
+  auto A_size = read_float_matrix(input0_file, &data_ptr);
+  float *A = data_ptr;
+  auto B_size = read_float_matrix(input1_file, &data_ptr);
+  float *B = data_ptr;
+  int m = A_size.first;
+  int n = A_size.second;
+  int p = B_size.second;
+
+  // Allocate memory for result matrices
+  // NOTE: Use calloc to zero init memory and eliminate any problems that comes from
+  // using += in the matmul implementation
+  float *C_naive = (float *)calloc(m * p, sizeof(float));
+  float *C_blocked = (float *)calloc(m * p, sizeof(float));
+  float *C_parallel = (float *)calloc(m * p, sizeof(float));
+
+  // Measure performance of naive_matmul
+  double start_time = omp_get_wtime();
+  naive_matmul(C_naive, A, B, m, n, p);
+  double naive_time = omp_get_wtime() - start_time;
+
+  write_float_matrix(result_file, C_naive, m, p);
+
+  // Validate naive result
+  bool naive_correct = validate_result(result_file, reference_file);
+  if (!naive_correct) {
+    std::cerr << "Naive result validation failed for case " << case_number
+              << std::endl;
+  }
+
+  // Measure performance of blocked_matmul (use block_size = 32 as default)
+  start_time = omp_get_wtime();
+  blocked_matmul(C_blocked, A, B, m, n, p, 32);
+  double blocked_time = omp_get_wtime() - start_time;
+
+  write_float_matrix(result_file, C_blocked, m, p);
+
+  // Validate blocked result
+  bool blocked_correct = validate_result(result_file, reference_file);
+  if (!blocked_correct) {
+    std::cerr << "Blocked result validation failed for case " << case_number
+              << std::endl;
+  }
+
+  // Measure performance of parallel_matmul
+  start_time = omp_get_wtime();
+  parallel_matmul(C_parallel, A, B, m, n, p);
+  double parallel_time = omp_get_wtime() - start_time;
+
+  // TODO Write parallel result to file
+  write_float_matrix(result_file, C_parallel, m, p);
+
+  // Validate parallel result
+  bool parallel_correct = validate_result(result_file, reference_file);
+  if (!parallel_correct) {
+    std::cerr << "Parallel result validation failed for case " << case_number
+              << std::endl;
+  }
+
+  // Print performance results
+  std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p
+            << "):\n";
+  std::cout << std::fixed << std::setprecision(8)
+            << "Naive time: " << naive_time << " seconds\n";
+  std::cout << std::fixed << std::setprecision(8)
+            << "Blocked time: " << blocked_time << " seconds\n";
+  std::cout << std::fixed << std::setprecision(8)
+            << "Parallel time: " << parallel_time << " seconds\n";
+  std::cout << std::fixed << std::setprecision(8)
+            << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
+  std::cout << std::fixed << std::setprecision(8)
+            << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
+
+  // Append timing results to CSV
+  std::ofstream csv_file("timings.csv", std::ios::app);
+  csv_file << std::fixed << naive_time << ";" << blocked_time << ";"
+           << parallel_time << "\n";
+  csv_file.close();
+
+  // Clean up
+  // NOTE: I use calloc so free must be used, I think it is
+  // UB to use delete on malloced memory
+  free(A);
+  free(B);
+  free(C_naive);
+  free(C_blocked);
+  free(C_parallel);
+
+  return 0;
+}
diff --git a/report.pdf b/report.pdf
new file mode 100644
index 0000000..c63a1e6
Binary files /dev/null and b/report.pdf differ
diff --git a/report.typ b/report.typ
new file mode 100644
index 0000000..1c2c791
--- /dev/null
+++ b/report.typ
@@ -0,0 +1,35 @@
+#import "@local/assignment-template:0.1.0": conf
+
+#show link: set text(font: "Iosevka Term", fallback: false)
+
+#show: doc => conf(
+    title: [Parallel Programming - Assignment 4],
+    authors: "Mirek Kudinoff (2102812)",
+    doc,
+  )
+
+== Results
+
+#figure(
+  image("graph1.png", width: 80%),
+  caption: [
+    Graph plotting case number vs time taken to run each variant of matmul
+  ],
+)
+
+
+== Results as a table (times are in seconds)
+
+#let results = csv("timings.csv", delimiter: ";")
+
+#table(
+columns: 4,
+..results.flatten())
+
+CSV with results obtained by running `for num in $(seq 9); ./build/matmul $num end` in fish shell,
+the added code at the end appends results to a file called timings.csv. It is then used to create the
+graph and the table above.
+
+I think the reason that blocked implementation does not consistently show an improvement has to
+do with cache size and the fact that the smaller matrix inputs might fit into cache in such a way
+that the naive implementation is not negatively affected.
diff --git a/timings.csv b/timings.csv
new file mode 100644
index 0000000..a92803b
--- /dev/null
+++ b/timings.csv
@@ -0,0 +1,19 @@
+Input Size;Naive;Blocked;Parallel
+128x64x128;0.000663;0.000619;0.002903
+100x128x56;0.000668;0.000542;0.002990
+128x64x128;0.000685;0.000632;0.002343
+32x128x32;0.000183;0.000146;0.002642
+200x100x256;0.005250;0.004530;0.001293
+256x256x256;0.017633;0.016888;0.003996
+256x300x256;0.022895;0.019400;0.004624
+64x128x64;0.000612;0.000294;0.000343
+256x256x257;0.012911;0.009770;0.002916
+
+
+
+
+
+
+
+
+