diff --git a/README.md b/README.md
index 51c7f2a..f634165 100644
--- a/README.md
+++ b/README.md
@@ -109,22 +109,37 @@ threads (e.g., 2, 4, 8) by setting the environment variable `OMP_NUM_THREADS`.
 For each test case (0 through 9 in the `data` folder):
 
 - Measure the **wall clock time** for:
-    - Naive matrix multiplication (`naive_matmul`).
-    - Cache-optimized matrix multiplication (`blocked_matmul`).
-    - Parallel matrix multiplication (`parallel_matmul`).
+  - Naive matrix multiplication (`naive_matmul`).
+  - Cache-optimized matrix multiplication (`blocked_matmul`).
+  - Parallel matrix multiplication (`parallel_matmul`).
 - Use `omp_get_wtime()` for timing, as it provides high-resolution wall clock time.
 - Report the times in a table in your submission README.md, including:
-    - Test case number.
-    - Matrix dimensions (m × n × p).
-    - Wall clock time for each implementation (in seconds).
-    - Speedup of blocked and parallel implementations over the naive implementation.
+  - Test case number.
+  - Matrix dimensions (m × n × p).
+  - Wall clock time for each implementation (in seconds).
+  - Speedup of blocked and parallel implementations over the naive implementation.
 
 Example table format:
 
 | Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
-|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
+| --------- | ---------------------- | -------------- | ---------------- | ----------------- | --------------- | ---------------- |
 | 0         | 512 × 512 × 512        | 2.345          | 0.987            | 0.543             | 2.38×           | 4.32×            |
 
+## Results - Group H
+
+| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
+| --------- | ---------------------- | -------------- | ---------------- | ----------------- | --------------- | ---------------- |
+| 0         | 64 x 64 x 64           | 0.000397436    | 0.000262500      | 0.000198718       | 1.51x           | 2.00x            |
+| 1         | 128 x 64 x 128         | 0.001657894    | 0.000984128      | 0.000712641       | 1.68x           | 2.33x            |
+| 2         | 100 x 128 x 56         | 0.001347827    | 0.000594339      | 0.000440559       | 2.27x           | 3.06x            |
+| 3         | 128 x 64 x 128         | 0.001729733    | 0.001550001      | 0.000840000       | 1.12x           | 2.06x            |
+| 4         | 32 x 128 x 32          | 0.000229629    | 0.000165354      | 0.000174157       | 1.39x           | 1.32x            |
+| 5         | 200 x 100 x 256        | 0.007874995    | 0.004499997      | 0.002863635       | 1.75x           | 2.75x            |
+| 6         | 256 x 256 x 256        | 0.052999973    | 0.013999999      | 0.011199999       | 3.79x           | 4.73x            |
+| 7         | 256 x 300 x 256        | 0.034000039    | 0.019500017      | 0.013499975       | 1.74x           | 2.52x            |
+| 8         | 64 x 128 x 64          | 0.000819675    | 0.000439656      | 0.000398438       | 1.86x           | 2.06x            |
+| 9         | 256 x 256 x 257        | 0.026499987    | 0.019500017      | 0.007750005       | 1.36x           | 3.42x            |
+
 ---
 
 #### Matrix Storage and Memory Management
@@ -138,9 +153,9 @@ Example table format:
 #### Input/Output and Validation
 
 - Use the same input/output format as Assignment 1:
-    - Input files: `data/<case>/input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)).
-    - Output file: `data/<case>/result.raw` (matrix \( C \)).
-    - Reference file: `data/<case>/output.raw` for validation.
+  - Input files: `data/<case>/input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)).
+  - Output file: `data/<case>/result.raw` (matrix \( C \)).
+  - Reference file: `data/<case>/output.raw` for validation.
 - The executable accepts a case number (0–9) as a command-line argument.
 - Validate correctness by comparing `result.raw` with `output.raw` for each implementation.
 
@@ -150,22 +165,22 @@ Example table format:
 
 - Use the provided `CMakeLists.txt` to build the project.
 - **Additional Requirements**:
-    - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC).
-    - The provided CMake file includes OpenMP support.
+  - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC).
+  - The provided CMake file includes OpenMP support.
 - **Windows Users**:
-    - Use CLion or Visual Studio with CMake.
-    - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`.
+  - Use CLion or Visual Studio with CMake.
+  - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`.
 - **Linux/Mac Users**:
-    - Make sure the GCC compiler is installed (`brew install gcc` on Mac).
-    - Configure CMake to use the correct compiler:
-      ```bash
-      cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .
-      ```
-    - Run `cmake .` to generate a Makefile, then `make`.
+  - Make sure the GCC compiler is installed (`brew install gcc` on Mac).
+  - Configure CMake to use the correct compiler:
+    ```bash
+    cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .
+    ```
+  - Run `cmake .` to generate a Makefile, then `make`.
 - **Testing OpenMP**:
-    - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on
-      Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows).
-    - Test with different thread counts to find the best performance.
+  - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on
+    Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows).
+  - Test with different thread counts to find the best performance.
 
 ---
 
@@ -209,7 +224,7 @@ git push origin student-name
 ### Grading (100 Points Total)
 
 | Subtask                                     | Points |
-|---------------------------------------------|--------|
+| ------------------------------------------- | ------ |
 | Correct implementation of `blocked_matmul`  | 30     |
 | Correct implementation of `parallel_matmul` | 30     |
 | Accurate performance measurements           | 20     |
@@ -222,16 +237,16 @@ git push origin student-name
 ### Tips for Success
 
 - **Cache Optimization**:
-    - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64).
-    - Use a block size that balances cache usage without excessive overhead.
+  - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64).
+  - Use a block size that balances cache usage without excessive overhead.
 - **OpenMP**:
-    - Test with different thread counts to find the optimal number for your system.
-    - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues).
+  - Test with different thread counts to find the optimal number for your system.
+  - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues).
 - **Performance Measurement**:
-    - Run multiple iterations for each test case and report the average time to reduce variability.
-    - Ensure no other heavy processes are running during measurements.
+  - Run multiple iterations for each test case and report the average time to reduce variability.
+  - Ensure no other heavy processes are running during measurements.
 - **Debugging**:
-    - Validate each implementation against `output.raw` to ensure correctness before optimizing.
-    - Use small test cases to debug your blocked and parallel implementations.
+  - Validate each implementation against `output.raw` to ensure correctness before optimizing.
+  - Use small test cases to debug your blocked and parallel implementations.
 
 Good luck, and enjoy optimizing your matrix multiplication!
diff --git a/main.cpp b/main.cpp
index 65bf108..78cc3f7 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,34 +3,190 @@
 #include <string>
 #include <omp.h>
 #include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <iomanip>
 
-void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
+const double MIN_TIMING_SECONDS = 0.05;
+
+bool read_matrix(const std::string &file_name, float *&matrix, uint32_t &rows, uint32_t &cols) // Parses a text matrix: "rows cols" first, then rows * cols whitespace-separated values.
+{ // Returns true on success, leaving a new[]-allocated buffer in `matrix` that the caller must delete[].
+    std::ifstream file(file_name);
+    if (!file)
+    {
+        std::cerr << "Failed to open " << file_name << std::endl;
+        return false; // `matrix` has not been assigned on this path.
+    }
+
+    if (!(file >> rows >> cols)) // Header: row count followed by column count.
+    {
+        std::cerr << "Failed to read matrix dimensions from " << file_name << std::endl;
+        return false; // Still nothing allocated, so there is nothing to release.
+    }
+
+    matrix = new float[rows * cols]; // NOTE(review): rows * cols is computed from two uint32_t values and may wrap for very large dimensions - confirm inputs stay small.
+    for (uint32_t i = 0; i < rows * cols; i++) // Values are stored in the order they appear in the file.
+    {
+        if (!(file >> matrix[i])) // Extraction fails on a malformed token or when the file ends early.
+        {
+            std::cerr << "Failed to read matrix value " << i << " from " << file_name << std::endl;
+            delete[] matrix; // Release the partially filled buffer...
+            matrix = nullptr; // ...and null the out-parameter so a later delete[] by the caller is harmless.
+            return false;
+        }
+    }
+
+    return true;
 }
 
-void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
-    // A is m x n, B is n x p, C is m x p
-    // Use block_size to divide matrices into submatrices
+bool write_matrix(const std::string &file_name, const float *matrix, uint32_t rows, uint32_t cols) // Writes the matrix as text: a "rows cols" header line, then one line per matrix row.
+{ // Returns false only when the file cannot be opened for writing.
+    std::ofstream file(file_name);
+    if (!file)
+    {
+        std::cerr << "Failed to open " << file_name << " for writing" << std::endl;
+        return false;
+    }
+
+    file << rows << " " << cols << "\n"; // Header line, written before the float formatting flags are set.
+    file << std::fixed << std::setprecision(2); // Every value is emitted in fixed notation with two digits after the decimal point.
+    for (uint32_t i = 0; i < rows; i++)
+    {
+        for (uint32_t j = 0; j < cols; j++)
+        {
+            if (j > 0) // Single space between values, none before the first column.
+            {
+                file << " ";
+            }
+            file << matrix[i * cols + j]; // Row-major indexing into the flat buffer.
+        }
+        file << "\n";
+    }
+
+    return true; // NOTE(review): the stream state is not checked after the writes, so a failed write still reports success.
 }
 
-void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
+void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) // Computes C = A * B with a plain i-j-k triple loop over flat row-major buffers.
+{ // Indexing below treats A as m x n, B as n x p and C as m x p.
+    std::fill(C, C + m * p, 0.0f); // Clear C first because the loop accumulates into it with +=.
+
+    for (uint32_t i = 0; i < m; i++) // Row of A and C.
+    {
+        for (uint32_t j = 0; j < p; j++) // Column of B and C.
+        {
+            for (uint32_t k = 0; k < n; k++) // Shared inner dimension.
+            {
+                C[i * p + j] += A[i * n + k] * B[k * p + j]; // B is read with stride p here (one element per row of B).
+            }
+        }
+    }
 }
 
-bool validate_result(const std::string &result_file, const std::string &reference_file) {
-   //TODO : Implement result validation
+void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) // Computes C = A * B by iterating over tiles of at most block_size elements per dimension.
+{ // NOTE(review): block_size == 0 would never advance the tile loops - callers are presumably expected to pass a positive value.
+    std::fill(C, C + m * p, 0.0f); // Clear C first; each tile pass below adds partial sums into it.
+
+    for (uint32_t ii = 0; ii < m; ii += block_size) // Tile start along the rows of A and C.
+    {
+        for (uint32_t kk = 0; kk < n; kk += block_size) // Tile start along the shared inner dimension.
+        {
+            for (uint32_t jj = 0; jj < p; jj += block_size) // Tile start along the columns of B and C.
+            {
+                const uint32_t i_end = std::min(ii + block_size, m); // Clamp so partial tiles at the matrix edge stay in range.
+                const uint32_t k_end = std::min(kk + block_size, n);
+                const uint32_t j_end = std::min(jj + block_size, p);
+
+                for (uint32_t i = ii; i < i_end; i++)
+                {
+                    for (uint32_t k = kk; k < k_end; k++)
+                    {
+                        const float a = A[i * n + k]; // A[i][k] does not depend on j, so it is loaded once per (i, k).
+                        for (uint32_t j = jj; j < j_end; j++) // Innermost loop walks consecutive elements of row k of B and row i of C.
+                        {
+                            C[i * p + j] += a * B[k * p + j];
+                        }
+                    }
+                }
+            }
+        }
+    }
 }
 
-int main(int argc, char *argv[]) {
-    if (argc != 2) {
+void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) // Computes C = A * B with the outer (row) loop divided among OpenMP threads.
+{ // Same i-j-k loop nest and row-major indexing as the naive version; only the outer loop is parallelised.
+    std::fill(C, C + m * p, 0.0f); // Zeroing runs before the parallel loop starts.
+
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < static_cast<int>(m); i++) // Signed index with m cast to int - presumably for OpenMP implementations that require a signed loop variable; TODO confirm.
+    {
+        for (uint32_t j = 0; j < p; j++) // Iteration i writes only row i of C, so different iterations never write the same element.
+        {
+            for (uint32_t k = 0; k < n; k++)
+            {
+                C[i * p + j] += A[i * n + k] * B[k * p + j]; // A and B are only read inside the loop.
+            }
+        }
+    }
+}
+
+bool validate_result(const std::string &result_file, const std::string &reference_file) // Loads both files with read_matrix and checks they match in shape and, element by element, within a tolerance.
+{ // Returns true only if both files load, the dimensions agree and no element differs by more than epsilon.
+    float *result = nullptr;
+    float *reference = nullptr;
+    uint32_t result_rows = 0;
+    uint32_t result_cols = 0;
+    uint32_t reference_rows = 0;
+    uint32_t reference_cols = 0;
+
+    if (!read_matrix(result_file, result, result_rows, result_cols) || // Short-circuit: the reference file is only read if the result file loaded.
+        !read_matrix(reference_file, reference, reference_rows, reference_cols))
+    {
+        delete[] result; // Either pointer may still be nullptr here; delete[] on nullptr is a no-op.
+        delete[] reference;
+        return false;
+    }
+
+    if (result_rows != reference_rows || result_cols != reference_cols) // Shapes must match exactly before any values are compared.
+    {
+        std::cerr << "Dimension mismatch: result is " << result_rows << "x" << result_cols
+                  << ", reference is " << reference_rows << "x" << reference_cols << std::endl;
+        delete[] result;
+        delete[] reference;
+        return false;
+    }
+
+    const float epsilon = 5e-2f; // Absolute per-element tolerance - presumably chosen to absorb the two-decimal rounding of the text files; TODO confirm.
+    const uint32_t count = result_rows * result_cols;
+    for (uint32_t i = 0; i < count; i++)
+    {
+        const float diff = std::fabs(result[i] - reference[i]);
+        if (diff > epsilon) // NOTE(review): a NaN diff compares false here, so NaN values would not be reported as mismatches.
+        {
+            std::cerr << "Value mismatch at index " << i << ": result=" << result[i]
+                      << ", reference=" << reference[i] << ", diff=" << diff << std::endl;
+            delete[] result; // Stop at the first mismatch; both buffers are released before returning.
+            delete[] reference;
+            return false;
+        }
+    }
+
+    delete[] result;
+    delete[] reference;
+    return true;
+}
+
+int main(int argc, char *argv[])
+{
+    if (argc != 2)
+    {
         std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
         return 1;
     }
 
     int case_number = std::atoi(argv[1]);
-    if (case_number < 0 || case_number > 9) {
+    if (case_number < 0 || case_number > 9)
+    {
         std::cerr << "Case number must be between 0 and 9" << std::endl;
         return 1;
     }
@@ -42,60 +198,124 @@ int main(int argc, char *argv[]) {
     std::string result_file = folder + "result.raw";
     std::string reference_file = folder + "output.raw";
 
-    // TODO Read input0.raw (matrix A)
+    float *A = nullptr;
+    float *B = nullptr;
+    uint32_t m = 0;
+    uint32_t n = 0;
+    uint32_t b_rows = 0;
+    uint32_t p = 0;
 
+    if (!read_matrix(input0_file, A, m, n))
+    {
+        return 1;
+    }
 
-    // TODO Read input1.raw (matrix B)
+    if (!read_matrix(input1_file, B, b_rows, p))
+    {
+        delete[] A;
+        return 1;
+    }
 
+    if (n != b_rows)
+    {
+        std::cerr << "Incompatible matrix dimensions: A is " << m << "x" << n
+                  << ", B is " << b_rows << "x" << p << std::endl;
+        delete[] A;
+        delete[] B;
+        return 1;
+    }
 
     // Allocate memory for result matrices
     float *C_naive = new float[m * p];
     float *C_blocked = new float[m * p];
     float *C_parallel = new float[m * p];
 
-    // Measure performance of naive_matmul
     double start_time = omp_get_wtime();
-    naive_matmul(C_naive, A, B, m, n, p);
-    double naive_time = omp_get_wtime() - start_time;
-
-    // TODO Write naive result to file
+    double elapsed_time = 0.0;
+    int timing_repetitions = 0;
+    do
+    {
+        naive_matmul(C_naive, A, B, m, n, p);
+        timing_repetitions++;
+        elapsed_time = omp_get_wtime() - start_time;
+    } while (elapsed_time < MIN_TIMING_SECONDS);
+    double naive_time = elapsed_time / timing_repetitions;
 
+    if (!write_matrix(result_file, C_naive, m, p))
+    {
+        delete[] A;
+        delete[] B;
+        delete[] C_naive;
+        delete[] C_blocked;
+        delete[] C_parallel;
+        return 1;
+    }
 
     // Validate naive result
     bool naive_correct = validate_result(result_file, reference_file);
-    if (!naive_correct) {
+    if (!naive_correct)
+    {
         std::cerr << "Naive result validation failed for case " << case_number << std::endl;
     }
 
-    // Measure performance of blocked_matmul (use block_size = 32 as default)
     start_time = omp_get_wtime();
-    blocked_matmul(C_blocked, A, B, m, n, p, 32);
-    double blocked_time = omp_get_wtime() - start_time;
-
-    // TODO Write blocked result to file
+    elapsed_time = 0.0;
+    timing_repetitions = 0;
+    do
+    {
+        blocked_matmul(C_blocked, A, B, m, n, p, 32);
+        timing_repetitions++;
+        elapsed_time = omp_get_wtime() - start_time;
+    } while (elapsed_time < MIN_TIMING_SECONDS);
+    double blocked_time = elapsed_time / timing_repetitions;
 
+    if (!write_matrix(result_file, C_blocked, m, p))
+    {
+        delete[] A;
+        delete[] B;
+        delete[] C_naive;
+        delete[] C_blocked;
+        delete[] C_parallel;
+        return 1;
+    }
 
     // Validate blocked result
     bool blocked_correct = validate_result(result_file, reference_file);
-    if (!blocked_correct) {
+    if (!blocked_correct)
+    {
         std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
     }
 
-    // Measure performance of parallel_matmul
     start_time = omp_get_wtime();
-    parallel_matmul(C_parallel, A, B, m, n, p);
-    double parallel_time = omp_get_wtime() - start_time;
-
-    // TODO Write parallel result to file
+    elapsed_time = 0.0;
+    timing_repetitions = 0;
+    do
+    {
+        parallel_matmul(C_parallel, A, B, m, n, p);
+        timing_repetitions++;
+        elapsed_time = omp_get_wtime() - start_time;
+    } while (elapsed_time < MIN_TIMING_SECONDS);
+    double parallel_time = elapsed_time / timing_repetitions;
 
+    if (!write_matrix(result_file, C_parallel, m, p))
+    {
+        delete[] A;
+        delete[] B;
+        delete[] C_naive;
+        delete[] C_blocked;
+        delete[] C_parallel;
+        return 1;
+    }
 
     // Validate parallel result
     bool parallel_correct = validate_result(result_file, reference_file);
-    if (!parallel_correct) {
+    if (!parallel_correct)
+    {
         std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
     }
 
     // Print performance results
+    std::cout << std::fixed << std::setprecision(9);
     std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
     std::cout << "Naive time: " << naive_time << " seconds\n";
     std::cout << "Blocked time: " << blocked_time << " seconds\n";
@@ -111,4 +331,4 @@ int main(int argc, char *argv[]) {
     delete[] C_parallel;
 
     return 0;
-}
\ No newline at end of file
+}