diff --git a/README.md b/README.md
index 51c7f2a..d2387a9 100644
--- a/README.md
+++ b/README.md
@@ -106,132 +106,93 @@ threads (e.g., 2, 4, 8) by setting the environment variable `OMP_NUM_THREADS`.
 
 #### 3. Performance Measurement
 
-For each test case (0 through 9 in the `data` folder):
+## Performance Results
 
-- Measure the **wall clock time** for:
-    - Naive matrix multiplication (`naive_matmul`).
-    - Cache-optimized matrix multiplication (`blocked_matmul`).
-    - Parallel matrix multiplication (`parallel_matmul`).
-- Use `omp_get_wtime()` for timing, as it provides high-resolution wall clock time.
-- Report the times in a table in your submission README.md, including:
-    - Test case number.
-    - Matrix dimensions (m × n × p).
-    - Wall clock time for each implementation (in seconds).
-    - Speedup of blocked and parallel implementations over the naive implementation.
+### Environment
+- **Platform**: GitHub Codespaces (Linux x86_64, 2 physical CPU cores)
+- **Compiler**: g++ with `-O3 -fopenmp`
+- **Methodology**: Each timing is the arithmetic mean of **5 independent runs**
+- **Default block size**: 64 (a conventional starting point; one 64 × 64 tile of doubles occupies 32 KiB)
+- **Default thread count**: 4
 
-Example table format:
+### Main Results Table (Averaged over 5 runs)
 
-| Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
-|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
-| 0         | 512 × 512 × 512        | 2.345          | 0.987            | 0.543             | 2.38×           | 4.32×            |
+| Case | Dimensions (m × n × p) | Naive (s) | Blocked (s) | Parallel (s) | Blocked Speedup | Parallel Speedup |
+|------|------------------------|-----------|-------------|--------------|-----------------|------------------|
+| 0    | 64 × 64 × 64           | 0.000209  | 0.000202    | 0.000227     | 1.04×           | 0.92×            |
+| 1    | 128 × 64 × 128         | 0.001096  | 0.000871    | 0.000740     | 1.26×           | 1.48×            |
+| 2    | 100 × 128 × 56         | 0.000691  | 0.000638    | 0.000922     | 1.08×           | 0.75×            |
+| 3    | 128 × 64 × 128         | 0.001541  | 0.001245    | 0.001014     | 1.24×           | 1.52×            |
+| 4    | 32 × 128 × 32          | 0.000160  | 0.000143    | 0.000309     | 1.12×           | 0.52×            |
+| 5    | 200 × 100 × 256        | 0.007707  | 0.007681    | 0.007275     | 1.00×           | 1.06×            |
+| 6    | 256 × 256 × 256        | 0.026578  | 0.021396    | 0.022247     | 1.24×           | 1.19×            |
+| 7    | 256 × 300 × 256        | 0.033655  | 0.026134    | 0.030615     | 1.29×           | 1.10×            |
+| 8    | 64 × 128 × 64          | 0.000499  | 0.000385    | 0.000419     | 1.30×           | 1.19×            |
+| 9    | 256 × 256 × 257        | 0.018924  | 0.013386    | 0.011839     | 1.41×           | 1.60×            |
 
----
+All implementations validated against `output.raw` with tolerance `1e-2`. All 10 cases pass for all three implementations.
 
-#### Matrix Storage and Memory Management
+### Block Size Experiment (Case 7: 256 × 300 × 256, the largest test case)
 
-- Row-major order for all matrices
-- Use C-style arrays with manual memory management (`malloc` or `new`, `free` or `delete`).
-- Do not use smart pointers.
+To find the optimal block size, `blocked_matmul` was tested with four block sizes against the naive baseline. Each timing is averaged over 5 runs.
 
----
+| Block Size | Time (s) | Speedup |
+|------------|----------|---------|
+| **16**     | **0.02312** | **2.33×** |
+| 32         | 0.02349  | 2.29×   |
+| 64         | 0.03020  | 1.78×   |
+| 128        | 0.02783  | 1.94×   |
 
-#### Input/Output and Validation
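+**Finding**: Block size **16** gives the best performance for these matrix dimensions, with block size 32 a close second. The commonly used default of 64 was *not* optimal here. Note that 64 is a tile edge measured in elements, not a cache line: a 64-byte cache line holds only 8 doubles, whereas one 64 × 64 tile of doubles occupies 32 KiB. Smaller blocks keep the working set comfortably inside L1 cache: the three tiles in flight (from A, B, and C) total about 6 KiB at block size 16 and 24 KiB at block size 32, while at block size 64 they total 96 KiB and spill out of a typical 32 KiB L1 data cache.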
 
-- Use the same input/output format as Assignment 1:
-    - Input files: `data/<case>/input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)).
-    - Output file: `data/<case>/result.raw` (matrix \( C \)).
-    - Reference file: `data/<case>/output.raw` for validation.
-- The executable accepts a case number (0–9) as a command-line argument.
-- Validate correctness by comparing `result.raw` with `output.raw` for each implementation.
+For the main results, block size 64 was kept as the default because it is a conventional starting point, but block size 16 or 32 would give meaningfully better speedups on this hardware.
 
----
+### Thread Count Experiment (Case 7)
 
-### Build Instructions
-
-- Use the provided `CMakeLists.txt` to build the project.
-- **Additional Requirements**:
-    - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC).
-    - The provided CMake file includes OpenMP support.
-- **Windows Users**:
-    - Use CLion or Visual Studio with CMake.
-    - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`.
-- **Linux/Mac Users**:
-    - Make sure the GCC compiler is installed (`brew install gcc` on Mac).
-    - Configure CMake to use the correct compiler:
-      ```bash
-      cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .
-      ```
-    - Run `cmake .` to generate a Makefile, then `make`.
-- **Testing OpenMP**:
-    - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on
-      Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows).
-    - Test with different thread counts to find the best performance.
+To find the optimal thread count, `parallel_matmul` was tested with 1, 2, 4, and 8 threads. Each timing is averaged over 5 runs.
 
----
+| Threads | Time (s) | Speedup |
+|---------|----------|---------|
+| 1       | 0.03594  | 1.08×   |
+| **2**   | **0.02555** | **1.52×** |
+| 4       | 0.02713  | 1.43×   |
+| 8       | 0.03206  | 1.21×   |
 
-### Submission Requirements
+**Finding**: **2 threads is optimal** on this hardware. The GitHub Codespaces free tier provides 2 physical CPU cores; once the thread count exceeds the number of cores, oversubscription (several threads time-sharing one core) and OpenMP scheduling overhead outweigh the parallelism benefit. 8 threads is slower than both 2 and 4 threads, although in this measurement it was still slightly faster than 1 thread (0.03206 s vs. 0.03594 s).
 
-#### Fork and Clone the Repository
+On a machine with 4 or more physical cores, the optimal thread count would shift accordingly.
 
-- Fork the Assignment 4 repository (provided separately).
-- Clone your fork:
-  ```bash
-  git clone https://github.com/AA-parallel-computing/Assignment-4-Optional.git
-  cd Assignment-4-Optional
-  ```
+### Analysis
 
-#### Create a New Branch
+**Correctness**: Every implementation matches the reference output within the `1e-2` tolerance for all 10 test cases.
 
-```bash
-git checkout -b student-name
-```
+**Cache Optimization (Blocked)**:
+- Blocking gives consistent **modest speedup (1.0× to 1.41×)** across cases with the default block size of 64.
+- The block size sweep showed up to **2.33×** speedup at block size 16, demonstrating the importance of tuning the block size to the specific cache hierarchy and problem dimensions.
 
-#### Implement Your Solution
+**Parallel (OpenMP)**:
+- Parallelization helps **when the matrix is large enough** to amortize OpenMP thread setup overhead.
+- For tiny matrices (cases 0, 2, 4), parallel is *slower* than naive (0.52× to 0.92×) because thread creation cost exceeds the actual compute work.
+- For mid-sized matrices (cases 1, 3, 6, 8, 9), parallel gives 1.19× – 1.60× speedup.
+- The thread sweep revealed that the Codespaces 2-core environment caps the achievable parallel speedup at ~1.5× regardless of how many threads we request. On hardware with more cores, larger speedups would be visible.
 
-- Modify the provided `main.cpp` to implement `blocked_matmul` and `parallel_matmul`.
-- Update `README.md` with your performance results table.
+**Block Size Choice**: For these specific matrix sizes (up to approx. 256 × 300), L1 cache pressure dominates and smaller blocks (16, 32) work best. The default block size of 64 is suboptimal here but would likely be better on much larger problems where the trade-off shifts toward reducing loop overhead.
 
-#### Commit and Push
+**Optimal Configuration on Codespaces (2-core)**:
+- Block size: **16 or 32**
+- Thread count: **2**
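+- Expected combined speedup over naive: not measured, because the code has no implementation that combines blocking with parallelization. Multiplying the best individual speedups (2.33× × 1.52× ≈ 3.5×) gives a rough estimate of approximately 3× to 4×.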
 
-```bash
-git add .
-git commit -m "student-name: Implemented optimized matrix multiplication"
-git push origin student-name
-```
-
-#### Submit a Pull Request (PR)
+### Challenges
 
-- Create a pull request from your branch to the base repository’s `main` branch.
-- Include a description of your optimizations and any challenges faced.
+1. **Small Test Cases**: The provided test cases are too small to fully showcase OpenMP parallelism. The largest case (256 × 300 × 256) executes in ~33 ms, where OpenMP setup costs are significant relative to compute. Matrices of 1024 × 1024 or larger would yield speedups closer to the theoretical limits of the hardware.
 
----
+2. **Codespaces Environment**: The 2-core CPU limit in GitHub Codespaces caps achievable parallel speedup. On a typical 8-core workstation, parallel speedups of 4× - 6× would be expected for the larger test cases.
 
-### Grading (100 Points Total)
+3. **Measurement Stability**: Single-run timings showed significant variance (some "speedups" appeared to be slowdowns simply due to noise). Switching to 5-run averaging stabilized the results and made the patterns clear. This is itself a useful methodological finding.
 
-| Subtask                                     | Points |
-|---------------------------------------------|--------|
-| Correct implementation of `blocked_matmul`  | 30     |
-| Correct implementation of `parallel_matmul` | 30     |
-| Accurate performance measurements           | 20     |
-| Performance results table in README.md      | 10     |
-| Code clarity, commenting, and organization  | 10     |
-| **Total**                                   | 100    |
+4. **Default Block Size Was Suboptimal**: The conventional block size of 64 was not the best for these test cases; block size 16 was about 30% faster (0.02312 s vs. 0.03020 s). This reinforces that a conventional default is a starting heuristic, not a final answer; empirical tuning matters.
 
----
+5. **Text-format I/O**: The `.raw` files are space-separated text, not binary doubles. Reading is done using `ifstream >> double` with the first two integers as `rows cols` dimensions.
 
-### Tips for Success
-
-- **Cache Optimization**:
-    - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64).
-    - Use a block size that balances cache usage without excessive overhead.
-- **OpenMP**:
-    - Test with different thread counts to find the optimal number for your system.
-    - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues).
-- **Performance Measurement**:
-    - Run multiple iterations for each test case and report the average time to reduce variability.
-    - Ensure no other heavy processes are running during measurements.
-- **Debugging**:
-    - Validate each implementation against `output.raw` to ensure correctness before optimizing.
-    - Use small test cases to debug your blocked and parallel implementations.
-
-Good luck, and enjoy optimizing your matrix multiplication!
+6. **Local Toolchain**: Could not install g++ locally on Windows in time; switched to GitHub Codespaces, which provided a complete Linux dev environment with all required tooling.
diff --git a/main.cpp b/main.cpp
index 65bf108..bf4293d 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,114 +1,208 @@
 #include <iostream>
 #include <fstream>
+#include <cstdlib>
+#include <cmath>
 #include <string>
+#include <algorithm>
+#include <vector>
 #include <omp.h>
-#include <cmath>
-
-void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
-}
 
-void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
-    // A is m x n, B is n x p, C is m x p
-    // Use block_size to divide matrices into submatrices
-}
+using namespace std;
 
-void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
-}
+const int NUM_RUNS = 5;  // Average over this many runs
 
-bool validate_result(const std::string &result_file, const std::string &reference_file) {
-   //TODO : Implement result validation
+// Reference (unoptimized) product C = A * B on row-major matrices.
+//   A: m x n, B: n x p, C: m x p (every entry is overwritten).
+// The running sum is kept directly in the C entry, so this stays the plain
+// triple-loop baseline that the other variants are timed against.
+void naive_matmul(double *A, double *B, double *C, int m, int n, int p) {
+    for (int row = 0; row < m; ++row) {
+        const double *a_row = A + row * n;
+        double *c_row = C + row * p;
+        for (int col = 0; col < p; ++col) {
+            double &cell = c_row[col];
+            cell = 0.0;
+            for (int k = 0; k < n; ++k)
+                cell += a_row[k] * B[k * p + col];
+        }
+    }
 }
 
-int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
-        return 1;
+// Cache-blocked (tiled) product C = A * B on row-major matrices.
+//   A: m x n, B: n x p, C: m x p (every entry is overwritten).
+//   bs: tile edge length in elements. A non-positive value would never
+//       advance the tile loops, so it is replaced by a single tile that
+//       spans the whole problem.
+// Inside a tile the loops run i-k-j: B and C are then walked along rows
+// (unit stride) and A[i][k] is loaded once per tile row. Each C[i][j] still
+// accumulates its products in ascending k.
+void blocked_matmul(double *A, double *B, double *C, int m, int n, int p, int bs) {
+    fill(C, C + m*p, 0.0);
+    if (bs <= 0) bs = max(1, max(m, max(n, p)));
+    for (int ii = 0; ii < m; ii += bs) {
+        const int ie = min(ii + bs, m);
+        for (int jj = 0; jj < p; jj += bs) {
+            const int je = min(jj + bs, p);
+            for (int kk = 0; kk < n; kk += bs) {
+                const int ke = min(kk + bs, n);
+                for (int i = ii; i < ie; i++) {
+                    for (int k = kk; k < ke; k++) {
+                        const double a = A[i*n + k];
+                        for (int j = jj; j < je; j++) {
+                            C[i*p + j] += a * B[k*p + j];
+                        }
+                    }
+                }
+            }
+        }
     }
+}
 
-    int case_number = std::atoi(argv[1]);
-    if (case_number < 0 || case_number > 9) {
-        std::cerr << "Case number must be between 0 and 9" << std::endl;
-        return 1;
+// OpenMP product C = A * B on row-major matrices.
+//   A: m x n, B: n x p, C: m x p (every entry is overwritten).
+// The two outer loops are collapsed into one iteration space that OpenMP
+// divides among the threads. Each iteration writes only its own C entry, so
+// no synchronization is required. The loops must stay perfectly nested for
+// collapse(2), which is why the per-row pointers are set up inside the
+// inner loop body.
+void parallel_matmul(double *A, double *B, double *C, int m, int n, int p) {
+    #pragma omp parallel for collapse(2)
+    for (int row = 0; row < m; ++row) {
+        for (int col = 0; col < p; ++col) {
+            const double *a_row = A + row * n;
+            double *cell = C + row * p + col;
+            *cell = 0.0;
+            for (int k = 0; k < n; ++k)
+                *cell += a_row[k] * B[k * p + col];
+        }
     }
+}
 
-    // Construct file paths
-    std::string folder = "data/" + std::to_string(case_number) + "/";
-    std::string input0_file = folder + "input0.raw";
-    std::string input1_file = folder + "input1.raw";
-    std::string result_file = folder + "result.raw";
-    std::string reference_file = folder + "output.raw";
-
-    // TODO Read input0.raw (matrix A)
-
-
-    // TODO Read input1.raw (matrix B)
-
-
-    // Allocate memory for result matrices
-    float *C_naive = new float[m * p];
-    float *C_blocked = new float[m * p];
-    float *C_parallel = new float[m * p];
-
-    // Measure performance of naive_matmul
-    double start_time = omp_get_wtime();
-    naive_matmul(C_naive, A, B, m, n, p);
-    double naive_time = omp_get_wtime() - start_time;
+// Loads a dense matrix from a whitespace-separated text file.
+// Layout: two integers "rows cols", followed by rows * cols values in
+// row-major order.
+//   filename: path of the file to read.
+//   rows, cols: receive the dimensions parsed from the header.
+// Returns a malloc'd buffer of rows * cols doubles; the caller releases it
+// with free(). Any failure (file cannot be opened, missing or non-positive
+// dimensions, allocation failure, too few values) prints a message to stderr
+// and ends the process with exit(1), so callers never receive a partially
+// filled buffer.
+double* read_matrix(const string& filename, int& rows, int& cols) {
+    ifstream file(filename);
+    if (!file) { cerr << "Cannot open " << filename << endl; exit(1); }
+    if (!(file >> rows >> cols) || rows <= 0 || cols <= 0) {
+        cerr << "Invalid dimensions in " << filename << endl;
+        exit(1);
+    }
+    // Widen before multiplying so the element count is not computed in int.
+    const size_t count = static_cast<size_t>(rows) * static_cast<size_t>(cols);
+    double* M = static_cast<double*>(malloc(count * sizeof(double)));
+    if (!M) { cerr << "Out of memory reading " << filename << endl; exit(1); }
+    for (size_t i = 0; i < count; i++) {
+        if (!(file >> M[i])) {
+            cerr << "Unexpected end of data in " << filename << endl;
+            free(M);
+            exit(1);
+        }
+    }
+    return M;
+}
 
-    // TODO Write naive result to file
+// Element-wise comparison of a computed m x p matrix against a reference.
+//   C, Cref: buffers from which m * p doubles each are read.
+//   eps: largest absolute difference still accepted (default 1e-2).
+// Returns true only when every |C[i] - Cref[i]| <= eps. The test is written
+// in that positive form because any comparison involving NaN is false: a NaN
+// difference therefore counts as a mismatch instead of slipping through.
+bool validate(double *C, double *Cref, int m, int p, double eps = 1e-2) {
+    for (int i = 0; i < m * p; i++)
+        if (!(fabs(C[i] - Cref[i]) <= eps)) return false;
+    return true;
+}
 
+// Arithmetic mean of the samples in v.
+// An empty vector yields 0.0 rather than the NaN that 0.0 / 0 would produce.
+double avg(const vector<double>& v) {
+    if (v.empty()) return 0.0;
+    double total = 0.0;
+    for (double sample : v) total += sample;
+    return total / static_cast<double>(v.size());
+}
 
-    // Validate naive result
-    bool naive_correct = validate_result(result_file, reference_file);
-    if (!naive_correct) {
-        std::cerr << "Naive result validation failed for case " << case_number << std::endl;
+// Benchmark driver.
+//   argv[1]: test case number 0-9; inputs are read from data/<case>/.
+//   argv[2]: optional mode.
+//     "blocks"  - times blocked_matmul for block sizes 16, 32, 64, 128.
+//     "threads" - times parallel_matmul with 1, 2, 4, 8 threads.
+//     anything else (or omitted) - times all three implementations and
+//     writes the parallel result to data/<case>/result.raw.
+// Every timing is the mean of NUM_RUNS runs; the first run of each
+// configuration is checked against output.raw.
+// Returns 0 on success and 1 on a usage, dimension, or allocation error.
+int main(int argc, char *argv[]) {
+    if (argc < 2) {
+        cerr << "Usage: " << argv[0] << " <case 0-9> [mode]" << endl;
+        cerr << "  mode: default | blocks | threads" << endl;
+        return 1;
     }
 
-    // Measure performance of blocked_matmul (use block_size = 32 as default)
-    start_time = omp_get_wtime();
-    blocked_matmul(C_blocked, A, B, m, n, p, 32);
-    double blocked_time = omp_get_wtime() - start_time;
-
-    // TODO Write blocked result to file
-
-
-    // Validate blocked result
-    bool blocked_correct = validate_result(result_file, reference_file);
-    if (!blocked_correct) {
-        std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
+    int cn = atoi(argv[1]);
+    if (cn < 0 || cn > 9) {
+        cerr << "Case number must be between 0 and 9" << endl;
+        return 1;
+    }
+    string mode = (argc > 2) ? argv[2] : "default";
+
+    string pA = "data/" + to_string(cn) + "/input0.raw";
+    string pB = "data/" + to_string(cn) + "/input1.raw";
+    string pC = "data/" + to_string(cn) + "/output.raw";
+
+    int m, nA, nB, p, mo, po;
+    double *A = read_matrix(pA, m, nA);
+    double *B = read_matrix(pB, nB, p);
+    double *Cref = read_matrix(pC, mo, po);
+    int n = nA;
+
+    if (nA != nB) { cerr << "Dimension mismatch" << endl; return 1; }
+    // validate() reads m * p entries of Cref, so the reference must have that shape.
+    if (mo != m || po != p) { cerr << "Reference dimension mismatch" << endl; return 1; }
+
+    // Widen before multiplying so the byte count is not computed in int.
+    double *Cn = (double*)malloc((size_t)m * p * sizeof(double));
+    double *Cb = (double*)malloc((size_t)m * p * sizeof(double));
+    double *Cp = (double*)malloc((size_t)m * p * sizeof(double));
+    if (!Cn || !Cb || !Cp) { cerr << "Out of memory" << endl; return 1; }
+
+    if (mode == "blocks") {
+        cout << "=== Block Size Sweep, Case " << cn
+             << ": A(" << m << "x" << n << ") * B(" << n << "x" << p
+             << "), avg of " << NUM_RUNS << " runs ===" << endl;
+        // Naive baseline that the per-block-size speedups are measured against.
+        vector<double> tn;
+        for (int r = 0; r < NUM_RUNS; r++) {
+            double t0 = omp_get_wtime();
+            naive_matmul(A, B, Cn, m, n, p);
+            tn.push_back(omp_get_wtime() - t0);
+        }
+        double na = avg(tn);
+        cout << "Naive baseline: " << na << " s" << endl;
+        cout << "Block Size | Time (s) | Speedup | Valid" << endl;
+        int sizes[] = {16, 32, 64, 128};
+        for (int bs : sizes) {
+            vector<double> tb;
+            bool ok = true;
+            for (int r = 0; r < NUM_RUNS; r++) {
+                double t0 = omp_get_wtime();
+                blocked_matmul(A, B, Cb, m, n, p, bs);
+                tb.push_back(omp_get_wtime() - t0);
+                if (r == 0) ok = validate(Cb, Cref, m, p);
+            }
+            double a = avg(tb);
+            cout << "    " << bs << "     | " << a << " | "
+                 << na/a << "x | " << (ok ? "OK" : "FAIL") << endl;
+        }
     }
-
-    // Measure performance of parallel_matmul
-    start_time = omp_get_wtime();
-    parallel_matmul(C_parallel, A, B, m, n, p);
-    double parallel_time = omp_get_wtime() - start_time;
-
-    // TODO Write parallel result to file
-
-
-    // Validate parallel result
-    bool parallel_correct = validate_result(result_file, reference_file);
-    if (!parallel_correct) {
-        std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
+    else if (mode == "threads") {
+        cout << "=== Thread Count Sweep, Case " << cn
+             << ": A(" << m << "x" << n << ") * B(" << n << "x" << p
+             << "), avg of " << NUM_RUNS << " runs ===" << endl;
+        // Naive baseline that the per-thread-count speedups are measured against.
+        vector<double> tn;
+        for (int r = 0; r < NUM_RUNS; r++) {
+            double t0 = omp_get_wtime();
+            naive_matmul(A, B, Cn, m, n, p);
+            tn.push_back(omp_get_wtime() - t0);
+        }
+        double na = avg(tn);
+        cout << "Naive baseline: " << na << " s" << endl;
+        cout << "Threads | Time (s) | Speedup | Valid" << endl;
+        int threads[] = {1, 2, 4, 8};
+        for (int tc : threads) {
+            omp_set_num_threads(tc);
+            vector<double> tp;
+            bool ok = true;
+            for (int r = 0; r < NUM_RUNS; r++) {
+                double t0 = omp_get_wtime();
+                parallel_matmul(A, B, Cp, m, n, p);
+                tp.push_back(omp_get_wtime() - t0);
+                if (r == 0) ok = validate(Cp, Cref, m, p);
+            }
+            double a = avg(tp);
+            cout << "   " << tc << "    | " << a << " | "
+                 << na/a << "x | " << (ok ? "OK" : "FAIL") << endl;
+        }
+    }
+    else {
+        // Default mode: all 3 implementations with averaging
+        omp_set_num_threads(4);
+        vector<double> tn, tb, tp;
+        bool ok_n = true, ok_b = true, ok_p = true;
+
+        for (int r = 0; r < NUM_RUNS; r++) {
+            double t0;
+            t0 = omp_get_wtime();
+            naive_matmul(A, B, Cn, m, n, p);
+            tn.push_back(omp_get_wtime() - t0);
+            if (r == 0) ok_n = validate(Cn, Cref, m, p);
+
+            t0 = omp_get_wtime();
+            blocked_matmul(A, B, Cb, m, n, p, 64);
+            tb.push_back(omp_get_wtime() - t0);
+            if (r == 0) ok_b = validate(Cb, Cref, m, p);
+
+            t0 = omp_get_wtime();
+            parallel_matmul(A, B, Cp, m, n, p);
+            tp.push_back(omp_get_wtime() - t0);
+            if (r == 0) ok_p = validate(Cp, Cref, m, p);
+        }
+
+        double an = avg(tn), ab = avg(tb), ap = avg(tp);
+
+        cout << "Case " << cn << ": A(" << m << "x" << n << ") * B("
+             << n << "x" << p << "), avg of " << NUM_RUNS << " runs" << endl;
+        cout << "  Naive:    " << an << " s (" << (ok_n ? "OK" : "FAIL") << ")" << endl;
+        cout << "  Blocked:  " << ab << " s (" << (ok_b ? "OK" : "FAIL")
+             << ") speedup: " << an/ab << "x" << endl;
+        cout << "  Parallel: " << ap << " s (" << (ok_p ? "OK" : "FAIL")
+             << ") speedup: " << an/ap << "x" << endl;
+
+        // Only the parallel result is persisted, in the same text layout as the inputs.
+        string rp = "data/" + to_string(cn) + "/result.raw";
+        ofstream out(rp);
+        if (!out) cerr << "Cannot write " << rp << endl;
+        out << m << " " << p << endl;
+        for (int i = 0; i < m * p; i++) {
+            out << Cp[i];
+            if ((i + 1) % p == 0) out << endl;
+            else out << " ";
+        }
     }
 
-    // Print performance results
-    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
-    std::cout << "Naive time: " << naive_time << " seconds\n";
-    std::cout << "Blocked time: " << blocked_time << " seconds\n";
-    std::cout << "Parallel time: " << parallel_time << " seconds\n";
-    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
-    std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
-
-    // Clean up
-    delete[] A;
-    delete[] B;
-    delete[] C_naive;
-    delete[] C_blocked;
-    delete[] C_parallel;
-
+    free(A); free(B); free(Cref);
+    free(Cn); free(Cb); free(Cp);
     return 0;
 }
\ No newline at end of file