From 454f20d6fc99b76a2ec5ea9bc37b809c64253010 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Thu, 22 Jan 2026 16:39:30 -0800
Subject: [PATCH 01/15] first draft, need to refine

---
 docs/docs.json                 |  3 +-
 docs/support-for-jit/index.mdx | 85 ++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 docs/support-for-jit/index.mdx

diff --git a/docs/docs.json b/docs/docs.json
index a36fc82dc..43949abb2 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -66,7 +66,8 @@
             "group": "🧠 Core Concepts",
             "pages": [
               "codeflash-concepts/how-codeflash-works",
-              "codeflash-concepts/benchmarking"
+              "codeflash-concepts/benchmarking",
+              "support-for-jit/index"
             ]
           },
           {
diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
new file mode 100644
index 000000000..3dfc68917
--- /dev/null
+++ b/docs/support-for-jit/index.mdx
@@ -0,0 +1,85 @@
+---
+title: "Support for Just-in-Time Compilation"
+description: "Learn how Codeflash optimizes code using JIT compilation with Numba, PyTorch, TensorFlow, and JAX"
+icon: "bolt"
+sidebarTitle: "JIT Compilation"
+keywords: ["JIT", "just-in-time", "numba", "pytorch", "tensorflow", "jax", "GPU", "CUDA", "compilation", "performance"]
+---
+
+# Support for Just-in-Time Compilation
+
+Codeflash supports optimizing code using Just-in-Time (JIT) compilation. This allows Codeflash to suggest optimizations that leverage JIT compilers from popular frameworks including **Numba**, **PyTorch**, **TensorFlow**, and **JAX**.
+
+## Supported JIT Frameworks
+
+Each framework uses different compilation strategies to accelerate Python code:
+
+### Numba
+
+Numba compiles Python functions to optimized machine code using the LLVM compiler infrastructure. Codeflash can suggest Numba optimizations that use:
+
+- **`@jit` / `@njit`** - General-purpose JIT compilation with `nopython` mode for removing Python interpreter overhead
+- **`parallel=True`** - Enables automatic SIMD parallelization
+- **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
+- **`@vectorize` / `@guvectorize`** - Creates NumPy universal functions (ufuncs)
+- **`@cuda.jit`** - Compiles functions to run on NVIDIA GPUs
+
+### PyTorch
+
+PyTorch provides multiple compilation approaches:
+
+- **`torch.compile()`** - The recommended compilation API that uses TorchDynamo to trace operations and create optimized CUDA graphs
+- **`torch.jit.script`** - Compiles functions using TorchScript
+- **`torch.jit.trace`** - Traces tensor operations to create optimized execution graphs
+
+### TensorFlow
+
+TensorFlow uses the XLA (Accelerated Linear Algebra) backend for JIT compilation:
+
+- **`@tf.function`** - Compiles Python functions into optimized TensorFlow graphs using XLA
+
+### JAX
+
+JAX captures side-effect-free operations and optimizes them:
+
+- **`@jax.jit`** - JIT compiles functions using XLA, with automatic operation fusion for improved performance
+
+## How Codeflash Optimizes with JIT
+
+When Codeflash identifies a function that could benefit from JIT compilation, it:
+
+1. **Rewrites the code** in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components
+2. **Generates appropriate tests** that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter type requirements
+3. **Adds GPU synchronization calls** for accurate profiling when code runs on GPU, since GPU operations are inherently non-blocking
+
+## Accurate Benchmarking with GPU Code
+
+Since GPU operations execute asynchronously, Codeflash automatically inserts synchronization barriers before measuring performance. This ensures timing measurements reflect actual computation time rather than just the time to queue operations:
+
+- **PyTorch**: Uses `torch.cuda.synchronize()` or `torch.mps.synchronize()` depending on the device
+- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete
+- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization
+
+## When JIT Compilation Helps
+
+JIT compilation is most effective for:
+
+- Numerical computations with loops that can't be easily vectorized
+- Custom algorithms not covered by existing optimized libraries
+- Functions that are called repeatedly with consistent input types
+- Code that benefits from hardware-specific optimizations (SIMD, GPU acceleration)
+
+## When JIT Compilation May Not Help
+
+JIT compilation may not provide speedups when:
+
+- The code already uses highly optimized libraries (e.g., NumPy with MKL, cuBLAS, cuDNN)
+- Functions have variable input types or shapes that prevent effective compilation
+- The compilation overhead exceeds the runtime savings for short-running functions
+- The code relies heavily on Python objects or dynamic features that JIT compilers can't optimize
+
+## Configuration
+
+JIT compilation support is **enabled automatically** in Codeflash. You don't need to modify any configuration to enable JIT-based optimizations. Codeflash will automatically detect when JIT compilation could improve performance and suggest appropriate optimizations.
+
+When running tests with coverage measurement, Codeflash temporarily disables JIT compilation to ensure accurate coverage data, then re-enables it for performance benchmarking.

From 9fe6ef797aa8fd8d33a72530027542eda8fff35c Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Thu, 22 Jan 2026 17:20:52 -0800
Subject: [PATCH 02/15] todo write more about the flags in torch/tensorflow and
 jax

---
 docs/support-for-jit/index.mdx | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index 3dfc68917..9aa91a6a4 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -8,49 +8,58 @@ keywords: ["JIT", "just-in-time", "numba", "pytorch", "tensorflow", "jax", "GPU"
 
 # Support for Just-in-Time Compilation
 
-Codeflash supports optimizing code using Just-in-Time (JIT) compilation. This allows Codeflash to suggest optimizations that leverage JIT compilers from popular frameworks including **Numba**, **PyTorch**, **TensorFlow**, and **JAX**.
+Codeflash supports optimizing numerical code with Just-in-Time (JIT) compilation by leveraging the JIT compilers of popular frameworks, including **Numba**, **PyTorch**, **TensorFlow**, and **JAX**.
 
 ## Supported JIT Frameworks
 
 Each framework uses different compilation strategies to accelerate Python code:
 
-### Numba
+### Numba (CPU Code)
 
 Numba compiles Python functions to optimized machine code using the LLVM compiler infrastructure. Codeflash can suggest Numba optimizations that use:
 
-- **`@jit` / `@njit`** - General-purpose JIT compilation with `nopython` mode for removing Python interpreter overhead
-- **`parallel=True`** - Enables automatic SIMD parallelization
-- **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
-- **`@vectorize` / `@guvectorize`** - Creates NumPy universal functions (ufuncs)
-- **`@cuda.jit`** - Compiles functions to run on NVIDIA GPUs
+- **`@jit`** - General-purpose JIT compilation with optional flags.
+  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
+  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
+  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
+  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
 
 ### PyTorch
 
 PyTorch provides multiple compilation approaches:
 
 - **`torch.compile()`** - The recommended compilation API that uses TorchDynamo to trace operations and create optimized CUDA graphs
-- **`torch.jit.script`** - Compiles functions using TorchScript
-- **`torch.jit.trace`** - Traces tensor operations to create optimized execution graphs
+  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
+  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
+  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
+  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
 
 ### TensorFlow
 
 TensorFlow uses the XLA (Accelerated Linear Algebra) backend for JIT compilation:
 
 - **`@tf.function`** - Compiles Python functions into optimized TensorFlow graphs using XLA
+  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
+  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
+  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
+  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
 
 ### JAX
 
 JAX captures side-effect-free operations and optimizes them:
 
 - **`@jax.jit`** - JIT compiles functions using XLA, with automatic operation fusion for improved performance
+  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
+  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
+  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
+  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
 
 ## How Codeflash Optimizes with JIT
 
 When Codeflash identifies a function that could benefit from JIT compilation, it:
 
-1. **Rewrites the code** in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components
-2. **Generates appropriate tests** that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter type requirements
-3. **Adds GPU synchronization calls** for accurate profiling when code runs on GPU, since GPU operations are inherently non-blocking
+1. **Rewrites the code** in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
+2. **Generates appropriate tests** that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
 
 ## Accurate Benchmarking with GPU Code
 

From 85344f5fd4ca8e5e6bef80832f4c19eae560f705 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Thu, 22 Jan 2026 17:26:25 -0800
Subject: [PATCH 03/15] keep editing

---
 docs/support-for-jit/index.mdx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index 9aa91a6a4..ee03d5220 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -18,9 +18,8 @@ Each framework uses different compilation strategies to accelerate Python code:
 
 Numba compiles Python functions to optimized machine code using the LLVM compiler infrastructure. Codeflash can suggest Numba optimizations that use:
 
-- **`@jit`** - General-purpose JIT compilation with optional flags.
+- **`@jit`** - General-purpose JIT compilation with optional flags. Here is a non-exhaustive options which codeflash would apply on the function to optimize it via numba jit compilation.
   - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
-  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
   - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
   - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
 

From eb9b3dff1ad69cdb0b523bcce13a2a76799fe8ce Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Fri, 23 Jan 2026 18:32:05 -0800
Subject: [PATCH 04/15] add examples

---
 docs/support-for-jit/index.mdx | 243 ++++++++++++++++++++++++++++-----
 1 file changed, 207 insertions(+), 36 deletions(-)

diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index ee03d5220..2c7f0b1aa 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -18,40 +18,36 @@ Each framework uses different compilation strategies to accelerate Python code:
 
 Numba compiles Python functions to optimized machine code using the LLVM compiler infrastructure. Codeflash can suggest Numba optimizations that use:
 
-- **`@jit`** - General-purpose JIT compilation with optional flags. Here is a non-exhaustive options which codeflash would apply on the function to optimize it via numba jit compilation.
-  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
-  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
-  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
+- **`@jit`** - General-purpose JIT compilation with optional flags.
+  - **`nopython=True`** - Compiles to machine code without falling back to the Python interpreter.
+  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag.
+  - **`cache=True`** - cache compiled function to disk which reduces future runtimes.
+  - **`parallel=True`** - Parallalizes code inside loops.
 
 ### PyTorch
 
-PyTorch provides multiple compilation approaches:
+PyTorch provides JIT compilation through `torch.compile()`, the recommended compilation API introduced in PyTorch 2.0. It uses TorchDynamo to capture Python bytecode and TorchInductor to generate optimized kernels.
 
-- **`torch.compile()`** - The recommended compilation API that uses TorchDynamo to trace operations and create optimized CUDA graphs
-  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
-  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
-  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
-  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
+- **`torch.compile()`** - Compiles a function or module for optimized execution.
+  - **`mode`** - Controls the compilation strategy:
+    - `"default"` - Balanced compilation with moderate optimization.
+    - `"reduce-overhead"` - Minimizes Python overhead using CUDA graphs, ideal for small batches.
+    - `"max-autotune"` - Spends more time autotuning to find the fastest kernels.
+  - **`fullgraph=True`** - Requires the entire function to be captured as a single graph. Raises an error if graph breaks occur, useful for ensuring complete optimization.
+  - **`dynamic=True`** - Enables dynamic shape support, allowing the compiled function to handle varying input sizes without recompilation.
 
 ### TensorFlow
 
-TensorFlow uses the XLA (Accelerated Linear Algebra) backend for JIT compilation:
+TensorFlow uses `@tf.function` to compile Python functions into optimized TensorFlow graphs. When combined with XLA (Accelerated Linear Algebra), it can generate highly optimized machine code for both CPU and GPU.
 
-- **`@tf.function`** - Compiles Python functions into optimized TensorFlow graphs using XLA
-  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
-  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
-  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
-  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
+- **`@tf.function`** - Converts Python functions into TensorFlow graphs for optimized execution.
+  - **`jit_compile=True`** - Enables XLA compilation, which performs whole-function optimization including operation fusion, memory layout optimization, and target-specific code generation.
 
 ### JAX
 
-JAX captures side-effect-free operations and optimizes them:
+JAX uses XLA to JIT compile pure functions into optimized machine code. It emphasizes functional programming patterns and captures side-effect-free operations for optimization.
 
-- **`@jax.jit`** - JIT compiles functions using XLA, with automatic operation fusion for improved performance
-  - **`noython=True`** - Compiles to machine code without falling back to the python interpreter.
-  - **`parallel=True`** - Enables automatic thread-level parallelization of the function across multiple CPU cores (no GIL!).
-  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag
-  - **`cache=True`** - Numba writes the result of function compilation to disk which significantly reduces future compilation times.
+- **`@jax.jit`** - JIT compiles functions using XLA with automatic operation fusion.
 
 ## How Codeflash Optimizes with JIT
 
@@ -60,34 +56,209 @@ When Codeflash identifies a function that could benefit from JIT compilation, it
 1. **Rewrites the code** in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
 2. **Generates appropriate tests** that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
 
-## Accurate Benchmarking with GPU Code
+## Accurate Benchmarking on Non-CPU devices
 
 Since GPU operations execute asynchronously, Codeflash automatically inserts synchronization barriers before measuring performance. This ensures timing measurements reflect actual computation time rather than just the time to queue operations:
 
-- **PyTorch**: Uses `torch.cuda.synchronize()` or `torch.mps.synchronize()` depending on the device
-- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete
-- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization
+- **PyTorch**: Uses `torch.cuda.synchronize()` (NVIDIA GPUs) or `torch.mps.synchronize()` (MacOS Metal Performance Shaders) depending on the device.
+- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete.
+- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization.
 
 ## When JIT Compilation Helps
 
 JIT compilation is most effective for:
 
-- Numerical computations with loops that can't be easily vectorized
-- Custom algorithms not covered by existing optimized libraries
-- Functions that are called repeatedly with consistent input types
+- Numerical computations with loops that can't be easily vectorized.
+- Custom algorithms not covered by existing optimized libraries.
+- Functions that are called repeatedly with consistent input types.
 - Code that benefits from hardware-specific optimizations (SIMD, GPU acceleration)
 
+### Example
+
+#### Function Definition
+
+```python
+import torch
+def complex_activation(x):
+    """A custom activation with many small operations - compile makes a huge difference"""
+    # Many sequential element-wise ops create kernel launch overhead
+    x = torch.sin(x)
+    x = x * torch.cos(x)
+    x = x + torch.exp(-x.abs())
+    x = x / (1 + x.pow(2))
+    x = torch.tanh(x) * torch.sigmoid(x)
+    x = x - 0.5 * x.pow(3)
+    return x
+```
+
+#### Benchmarking Snippet (replace `cuda` with `mps` to run on your Mac)
+
+```python
+import time
+# Create compiled version
+complex_activation_compiled = torch.compile(complex_activation)
+
+# Benchmark
+x = torch.randn(1000, 1000, device='cuda')
+
+# Warmup
+for _ in range(10):
+    _ = complex_activation(x)
+    _ = complex_activation_compiled(x)
+
+# Time uncompiled
+torch.cuda.synchronize()
+start = time.time()
+for _ in range(100):
+    y = complex_activation(x)
+torch.cuda.synchronize()
+uncompiled_time = time.time() - start
+
+# Time compiled
+torch.cuda.synchronize()
+start = time.time()
+for _ in range(100):
+    y = complex_activation_compiled(x)
+torch.cuda.synchronize()
+compiled_time = time.time() - start
+
+print(f"Uncompiled: {uncompiled_time:.4f}s")
+print(f"Compiled: {compiled_time:.4f}s")
+print(f"Speedup: {uncompiled_time/compiled_time:.2f}x")
+```
+
+Expected Output on CUDA
+
+```
+Uncompiled: 0.0176s
+Compiled: 0.0063s
+Speedup: 2.80x
+```
+
+Here, JIT compilation via `torch.compile` is the only viable option because:
+1. Already vectorized - All operations are already PyTorch tensor ops.
+2. Multiple kernel launches - Uncompiled code launches ~10 separate kernels. `torch.compile` fuses them into 1-2 kernels, eliminating kernel launch overhead.
+3. No algorithmic improvement - The computation itself is already optimal.
+4. Python overhead elimination - Removes Python interpreter overhead between operations.
+
+
 ## When JIT Compilation May Not Help
 
 JIT compilation may not provide speedups when:
 
-- The code already uses highly optimized libraries (e.g., NumPy with MKL, cuBLAS, cuDNN)
-- Functions have variable input types or shapes that prevent effective compilation
-- The compilation overhead exceeds the runtime savings for short-running functions
-- The code relies heavily on Python objects or dynamic features that JIT compilers can't optimize
+- The code already uses highly optimized libraries (e.g., NumPy with MKL, cuBLAS, cuDNN).
+- Functions have variable input types or shapes that prevent effective compilation.
+- The compilation overhead exceeds the runtime savings for short-running functions.
+- The code relies heavily on Python objects or dynamic features that JIT compilers can't optimize.
 
-## Configuration
+### Example
+
+#### Function Definition
+
+```python
+def adaptive_processing(x, threshold=0.5):
+    """Function with data-dependent control flow - compile struggles here"""
+    # Check how many values exceed threshold (data-dependent!)
+    mask = x > threshold
+    num_large = mask.sum().item()  # .item() causes graph break
+
+    if num_large > x.numel() * 0.3:
+        # Path 1: Many large values - use expensive operation
+        result = torch.matmul(x, x.T)  # Already optimized by cuBLAS
+        result = result.mean(dim=0)
+    else:
+        # Path 2: Few large values - use cheap operation
+        result = x.mean(dim=1)
+
+    return result
+```
+
+#### Benchmarking Snippet (replace `cuda` with `mps` to run on your Mac)
+
+```python
+# Create compiled version (assumes `import time` and `import torch` from the earlier snippet)
+adaptive_processing_compiled = torch.compile(adaptive_processing)
 
-JIT compilation support is **enabled automatically** in Codeflash. You don't need to modify any configuration to enable JIT-based optimizations. Codeflash will automatically detect when JIT compilation could improve performance and suggest appropriate optimizations.
+# Test with data that causes branch variation
+x = torch.randn(500, 500, device='cuda')
+
+# Warmup
+for _ in range(10):
+    _ = adaptive_processing(x)
+    _ = adaptive_processing_compiled(x)
+
+# Benchmark with varying data (causes recompilation)
+torch.cuda.synchronize()
+start = time.time()
+for i in range(100):
+    # Vary the data to trigger different branches
+    x_test = torch.randn(500, 500, device='cuda') + (i % 2)
+    y = adaptive_processing(x_test)
+torch.cuda.synchronize()
+uncompiled_time = time.time() - start
+
+torch.cuda.synchronize()
+start = time.time()
+for i in range(100):
+    x_test = torch.randn(500, 500, device='cuda') + (i % 2)
+    y = adaptive_processing_compiled(x_test)  # Recompiles frequently!
+torch.cuda.synchronize()
+compiled_time = time.time() - start
+
+print(f"Uncompiled: {uncompiled_time:.4f}s")
+print(f"Compiled: {compiled_time:.4f}s")
+print(f"Slowdown: {compiled_time/uncompiled_time:.2f}x")
+```
+
+Expected Output on CUDA
+
+```
+Uncompiled: 0.0296s
+Compiled: 0.2847s
+Slowdown: 9.63x
+```
+
+Why `torch.compile` is detrimental here:
+
+1. Graph breaks - `.item()` forces a graph break, negating compile benefits.
+2. Recompilation overhead - Different branches cause expensive recompilation each time.
+3. Dynamic control flow - Data-dependent conditionals can't be optimized away.
+4. Already optimized ops - `matmul` already uses `cuBLAS`; compile adds overhead without benefit.
+
+#### Better Optimization Strategy
+
+```python
+def optimized_version(x, threshold=0.5):
+    """Remove data-dependent control flow - vectorize instead"""
+    mask = (x > threshold).float()
+    weight = (mask.mean() > 0.3).float()  # Keep on GPU
+
+    # Compute both paths, blend based on weight (branchless)
+    expensive = torch.matmul(x, x.T).mean(dim=0)
+    cheap = x.mean(dim=1).squeeze()
+
+    # Pad cheap result to match expensive dimensions
+    cheap_padded = cheap.expand(expensive.shape[0])
+
+    result = weight * expensive + (1 - weight) * cheap_padded
+    return result
+```
+
+Expected Output on CUDA
+
+```
+Optimized: 0.0277s
+Speedup compared to Uncompiled: 1.07x
+```
+
+
+Key improvements:
+
+1. Eliminate `.item()` - Keep computation on GPU.
+2. Branchless execution - Compute both paths, blend results.
+3. Vectorization - Replace conditionals with masked operations.
+4. Reduce Python overhead - Minimize host-device synchronization.
+
+## Configuration
 
-When running tests with coverage measurement, Codeflash temporarily disables JIT compilation to ensure accurate coverage data, then re-enables it for performance benchmarking.
+JIT compilation support is **enabled automatically** in Codeflash. You don't need to modify any configuration to enable JIT-based optimizations. Codeflash will automatically detect when JIT compilation could improve performance and suggest appropriate optimizations.
\ No newline at end of file

From 15f4b6dd0e02373959da51a3e6ecfc94069dc67a Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Fri, 23 Jan 2026 18:47:08 -0800
Subject: [PATCH 05/15] typos

---
 docs/support-for-jit/index.mdx | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index 2c7f0b1aa..d84a75912 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -22,7 +22,7 @@ Numba compiles Python functions to optimized machine code using the LLVM compile
   - **`nopython=True`** - Compiles to machine code without falling back to the Python interpreter.
   - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag.
   - **`cache=True`** - cache compiled function to disk which reduces future runtimes.
-  - **`parallel=True`** - Parallalizes code inside loops.
+  - **`parallel=True`** - Parallelizes code inside loops.
 
 ### PyTorch
 
@@ -53,12 +53,14 @@ JAX uses XLA to JIT compile pure functions into optimized machine code. It empha
 
 When Codeflash identifies a function that could benefit from JIT compilation, it:
 
-1. **Rewrites the code** in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
-2. **Generates appropriate tests** that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
+1. Rewrites the code in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
+2. Generates appropriate tests that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
+3. Disables JIT compilation while running coverage and the tracer so that coverage and trace information is accurate. Both tools rely on Python bytecode execution, but JIT-compiled code no longer runs as Python bytecode.
+4. Disables line-profiler data collection whenever it encounters JIT-compiled code. It would be possible to disable JIT compilation and run the line profiler, but that would produce inaccurate information, which could misguide the optimization process.
 
 ## Accurate Benchmarking on Non-CPU devices
 
-Since GPU operations execute asynchronously, Codeflash automatically inserts synchronization barriers before measuring performance. This ensures timing measurements reflect actual computation time rather than just the time to queue operations:
+Since operations on non-CPU devices (such as GPUs) execute asynchronously, Codeflash automatically inserts synchronization barriers before measuring performance. This ensures timing measurements reflect actual computation time rather than just the time to queue operations:
 
 - **PyTorch**: Uses `torch.cuda.synchronize()` (NVIDIA GPUs) or `torch.mps.synchronize()` (MacOS Metal Performance Shaders) depending on the device.
 - **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete.
@@ -71,7 +73,7 @@ JIT compilation is most effective for:
 - Numerical computations with loops that can't be easily vectorized.
 - Custom algorithms not covered by existing optimized libraries.
 - Functions that are called repeatedly with consistent input types.
-- Code that benefits from hardware-specific optimizations (SIMD, GPU acceleration)
+- Code that benefits from hardware-specific optimizations (SIMD, GPU acceleration).
 
 ### Example
 
@@ -146,7 +148,7 @@ Here, JIT compilation via `torch.compile` is the only viable option because
 
 JIT compilation may not provide speedups when:
 
-- The code already uses highly optimized libraries (e.g., NumPy with MKL, cuBLAS, cuDNN).
+- The code already uses highly optimized libraries (e.g., `NumPy` with `MKL`, `cuBLAS`, `cuDNN`).
 - Functions have variable input types or shapes that prevent effective compilation.
 - The compilation overhead exceeds the runtime savings for short-running functions.
 - The code relies heavily on Python objects or dynamic features that JIT compilers can't optimize.

From 06f5460803a56a2c4956ebab7458760e1dedf8c1 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 10:01:07 -0800
Subject: [PATCH 06/15] start testing

---
 code_to_optimize/complex_activation.py        | 11 +++
 .../tests/pytest/test_complex_activation.py   | 88 +++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 code_to_optimize/complex_activation.py
 create mode 100644 code_to_optimize/tests/pytest/test_complex_activation.py

diff --git a/code_to_optimize/complex_activation.py b/code_to_optimize/complex_activation.py
new file mode 100644
index 000000000..d9ed216d3
--- /dev/null
+++ b/code_to_optimize/complex_activation.py
@@ -0,0 +1,11 @@
+import torch
+def complex_activation(x):
+    """A custom activation with many small operations - compile makes a huge difference"""
+    # Many sequential element-wise ops create kernel launch overhead
+    x = torch.sin(x)
+    x = x * torch.cos(x)
+    x = x + torch.exp(-x.abs())
+    x = x / (1 + x.pow(2))
+    x = torch.tanh(x) * torch.sigmoid(x)
+    x = x - 0.5 * x.pow(3)
+    return x
\ No newline at end of file
diff --git a/code_to_optimize/tests/pytest/test_complex_activation.py b/code_to_optimize/tests/pytest/test_complex_activation.py
new file mode 100644
index 000000000..3fa8f0d12
--- /dev/null
+++ b/code_to_optimize/tests/pytest/test_complex_activation.py
@@ -0,0 +1,88 @@
+"""
+Unit tests for complex_activation function.
+
+Tests run on CUDA device with a single tensor shape.
+"""
+
+import pytest
+import torch
+
+from code_to_optimize.complex_activation import complex_activation
+
+
+@pytest.fixture
+def cuda_device():
+    """Return CUDA device, skip if not available."""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+    return torch.device("cuda")
+
+
+@pytest.fixture
+def input_tensor(cuda_device):
+    """Create a fixed-shape input tensor on CUDA."""
+    torch.manual_seed(42)
+    return torch.randn(32, 64, device=cuda_device, dtype=torch.float32)
+
+
+class TestComplexActivation:
+    """Tests for the complex_activation function."""
+
+    def test_output_shape(self, input_tensor):
+        """Test that output shape matches input shape."""
+        result = complex_activation(input_tensor)
+        assert result.shape == input_tensor.shape
+
+    def test_output_dtype(self, input_tensor):
+        """Test that output dtype matches input dtype."""
+        result = complex_activation(input_tensor)
+        assert result.dtype == input_tensor.dtype
+
+    def test_output_device(self, input_tensor, cuda_device):
+        """Test that output is on the same device as input."""
+        result = complex_activation(input_tensor)
+        assert result.device.type == cuda_device.type
+
+    def test_deterministic(self, input_tensor):
+        """Test that the function produces deterministic results."""
+        result1 = complex_activation(input_tensor.clone())
+        result2 = complex_activation(input_tensor.clone())
+        torch.testing.assert_close(result1, result2)
+
+    def test_output_is_finite(self, input_tensor):
+        """Test that output contains no NaN or Inf values."""
+        result = complex_activation(input_tensor)
+        assert torch.isfinite(result).all()
+
+    def test_output_bounded(self, input_tensor):
+        """Test that output values are bounded (activation should not explode)."""
+        result = complex_activation(input_tensor)
+        assert result.abs().max() < 10.0
+
+    def test_zero_input(self, cuda_device):
+        """Test behavior with zero input."""
+        x = torch.zeros(32, 64, device=cuda_device, dtype=torch.float32)
+        result = complex_activation(x)
+        assert torch.isfinite(result).all()
+        assert result.shape == x.shape
+
+    def test_positive_input(self, cuda_device):
+        """Test behavior with all positive inputs."""
+        x = torch.abs(torch.randn(32, 64, device=cuda_device, dtype=torch.float32)) + 0.1
+        result = complex_activation(x)
+        assert torch.isfinite(result).all()
+
+    def test_negative_input(self, cuda_device):
+        """Test behavior with all negative inputs."""
+        x = -torch.abs(torch.randn(32, 64, device=cuda_device, dtype=torch.float32)) - 0.1
+        result = complex_activation(x)
+        assert torch.isfinite(result).all()
+
+    def test_gradient_flow(self, cuda_device):
+        """Test that gradients can flow through the activation."""
+        x = torch.randn(32, 64, device=cuda_device, dtype=torch.float32, requires_grad=True)
+        result = complex_activation(x)
+        loss = result.sum()
+        loss.backward()
+        assert x.grad is not None
+        assert torch.isfinite(x.grad).all()
\ No newline at end of file

From 2a739558617bab48ea746ce627f1aa774bfb7c77 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 15:52:15 -0800
Subject: [PATCH 07/15] start cleaning up

---
 .../benchmarking-gpu-code.mdx                 | 118 ++++++++++++++++++
 docs/docs.json                                |   3 +-
 docs/support-for-jit/index.mdx                | 117 ++++++++---------
 3 files changed, 174 insertions(+), 64 deletions(-)
 create mode 100644 docs/codeflash-concepts/benchmarking-gpu-code.mdx

diff --git a/docs/codeflash-concepts/benchmarking-gpu-code.mdx b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
new file mode 100644
index 000000000..9a54e700c
--- /dev/null
+++ b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
@@ -0,0 +1,118 @@
+---
+title: "How Codeflash Measures Code Runtime on GPUs"
+description: "Learn how Codeflash accurately measures code performance on GPUs"
+icon: "stopwatch"
+sidebarTitle: "GPU Runtime Measurement"
+keywords: ["benchmarking", "performance", "timing", "measurement", "runtime", "noise reduction", "GPU", "MPS"]
+---
+
+## Accurate Benchmarking on GPU devices (NVIDIA GPUs and Mac Metal Performance Shaders)
+
+When a GPU operation is executed, it executes **asynchronously**. This means the CPU queues up work for the GPU and immediately continues to the next line of code - it doesn't wait for the GPU to finish. Accurate measurement of code execution on GPUs involves the insertion of synchronization barriers to ensure no pending GPU tasks are executing before and after the timing measurements are made.
+
+## Illustration
+
+### Without Synchronization
+
+```mermaid actions={false}
+%%{init: {'gantt': {'useWidth': 1200}}}%%
+gantt
+    title CPU vs CUDA Stream Timeline (Without Synchronization)
+    dateFormat X
+    axisFormat %s
+
+    section CPU
+    Timer Start           :milestone, m1, 0, 0
+    Launch Kernel 1       :active, cpu0, 0, 4
+    Launch Kernel 2       :active, cpu1, 4, 8
+    Launch Kernel 3       :active, cpu2, 8, 12
+    Timer End             :milestone, m2, 12, 12
+
+    section CUDA Stream
+    Waiting               :done, wait, 0, 4
+    Kernel 1              :active, k1, 4, 11
+    Kernel 2              :active, k2, 11, 18
+    Kernel 3              :active, k3, 18, 25
+
+    section Problem
+    Timer ends too early  :done, p1, after m2, 25
+```
+
+Here you can see that the timer measures only the time up to the end of the final kernel launch. The GPU computation hasn't completed yet, so the measurement is inaccurate and would mislead any conclusion drawn from it.
+
+### With Synchronization
+
+```mermaid actions={false}
+%%{init: {'gantt': {'useWidth': 1200}}}%%
+gantt
+    title CPU vs CUDA Stream Timeline (With Synchronization)
+    dateFormat X
+    axisFormat %s
+
+    section CPU
+    Device Synchronization  :done, wait, 0, 4
+    Timer Start             :milestone, m1, 4, 4
+    Launch Kernel 1         :active, cpu0, 4, 8
+    Launch Kernel 2         :active, cpu1, 8, 12
+    Launch Kernel 3         :active, cpu2, 12, 16
+    Device Synchronization  :done, wait, 16, 29
+    Timer End               :milestone, m2, 29, 29
+
+    section CUDA Stream
+    Previous Work         :done, wait, 0, 4
+    Kernel 1              :active, k1, 4, 11
+    Kernel 2              :active, k2, 11, 18
+    Kernel 3              :active, k3, 18, 29
+```
+
+Here you can see that a device synchronization call is made before executing the code. This ensures that the CPU waits for any pending GPU tasks to finish before starting the timer. After the launch of the final kernel, another device synchronization call is made, which ensures all pending GPU tasks are finished before measuring the runtime.
+
+
+
+## PyTorch Example
+
+Execute the following code in your Python Interpreter to get the kernel launch time (Replace `cuda` with `mps` everywhere to run on your Mac).
+```python
+import torch
+import time
+device = "cuda"
+x = torch.randn(8192, 8192, device=device)
+y = torch.randn(8192, 8192, device=device)
+t0 = time.perf_counter_ns()
+z = torch.matmul(x, y)
+t1 = time.perf_counter_ns()
+print(f"Without synchronize: {(t1 - t0) / 1e6:.3f} ms")
+```
+
+Now, **Restart** your interpreter and execute the following code to get the kernel execution time (Replace `cuda` with `mps` everywhere to run on your Mac).
+```python
+import torch
+import time
+device = "cuda"
+x = torch.randn(8192, 8192, device=device)
+y = torch.randn(8192, 8192, device=device)
+torch.cuda.synchronize()  # clear any pending work
+t0 = time.perf_counter_ns()
+z = torch.matmul(x, y)
+torch.cuda.synchronize()  # wait for GPU to finish
+t1 = time.perf_counter_ns()
+print(f"With synchronize: {(t1 - t0) / 1e6:.3f} ms")
+```
+
+
+Output on NVIDIA GPU
+
+```
+Without synchronize: 69.157 ms
+With synchronize: 152.277 ms
+```
+
+# How codeflash measures execution time involving GPUs
+
+Codeflash automatically inserts synchronization barriers before measuring performance. It currently supports GPU code written in `Pytorch`, `Tensorflow` and `JAX`.
+
+- **PyTorch**: Uses `torch.cuda.synchronize()` (NVIDIA GPUs) or `torch.mps.synchronize()` (MacOS Metal Performance Shaders) depending on the device.
+- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete. It works for both CUDA and MPS devices.
+- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization. It works for both CUDA and MPS devices.
+
+Codeflash would support ROCm and TPU devices in the near future.
\ No newline at end of file
diff --git a/docs/docs.json b/docs/docs.json
index 43949abb2..e81e21cd4 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -67,7 +67,8 @@
             "pages": [
               "codeflash-concepts/how-codeflash-works",
               "codeflash-concepts/benchmarking",
-              "support-for-jit/index"
+              "support-for-jit/index",
+              "codeflash-concepts/benchmarking-gpu-code"
             ]
           },
           {
diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index d84a75912..2321609d8 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -1,70 +1,14 @@
 ---
-title: "Support for Just-in-Time Compilation"
+title: "Just-in-Time Compilation"
 description: "Learn how Codeflash optimizes code using JIT compilation with Numba, PyTorch, TensorFlow, and JAX"
 icon: "bolt"
 sidebarTitle: "JIT Compilation"
-keywords: ["JIT", "just-in-time", "numba", "pytorch", "tensorflow", "jax", "GPU", "CUDA", "compilation", "performance"]
+keywords: ["JIT", "just-in-time", "numba", "pytorch", "tensorflow", "jax", "GPU", "CUDA", "MPS", "compilation", "performance"]
 ---
 
-# Support for Just-in-Time Compilation
+# Just-in-Time Compilation
 
-Codeflash supports optimizing numerical code using Just-in-Time (JIT) compilation via leveraging JIT compilers from popular frameworks including **Numba**, **PyTorch**, **TensorFlow**, and **JAX**.
-
-## Supported JIT Frameworks
-
-Each framework uses different compilation strategies to accelerate Python code:
-
-### Numba (CPU Code)
-
-Numba compiles Python functions to optimized machine code using the LLVM compiler infrastructure. Codeflash can suggest Numba optimizations that use:
-
-- **`@jit`** - General-purpose JIT compilation with optional flags.
-  - **`nopython=True`** - Compiles to machine code without falling back to the Python interpreter.
-  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag.
-  - **`cache=True`** - cache compiled function to disk which reduces future runtimes.
-  - **`parallel=True`** - Parallelizes code inside loops.
-
-### PyTorch
-
-PyTorch provides JIT compilation through `torch.compile()`, the recommended compilation API introduced in PyTorch 2.0. It uses TorchDynamo to capture Python bytecode and TorchInductor to generate optimized kernels.
-
-- **`torch.compile()`** - Compiles a function or module for optimized execution.
-  - **`mode`** - Controls the compilation strategy:
-    - `"default"` - Balanced compilation with moderate optimization.
-    - `"reduce-overhead"` - Minimizes Python overhead using CUDA graphs, ideal for small batches.
-    - `"max-autotune"` - Spends more time autotuning to find the fastest kernels.
-  - **`fullgraph=True`** - Requires the entire function to be captured as a single graph. Raises an error if graph breaks occur, useful for ensuring complete optimization.
-  - **`dynamic=True`** - Enables dynamic shape support, allowing the compiled function to handle varying input sizes without recompilation.
-
-### TensorFlow
-
-TensorFlow uses `@tf.function` to compile Python functions into optimized TensorFlow graphs. When combined with XLA (Accelerated Linear Algebra), it can generate highly optimized machine code for both CPU and GPU.
-
-- **`@tf.function`** - Converts Python functions into TensorFlow graphs for optimized execution.
-  - **`jit_compile=True`** - Enables XLA compilation, which performs whole-function optimization including operation fusion, memory layout optimization, and target-specific code generation.
-
-### JAX
-
-JAX uses XLA to JIT compile pure functions into optimized machine code. It emphasizes functional programming patterns and captures side-effect-free operations for optimization.
-
-- **`@jax.jit`** - JIT compiles functions using XLA with automatic operation fusion.
-
-## How Codeflash Optimizes with JIT
-
-When Codeflash identifies a function that could benefit from JIT compilation, it:
-
-1. Rewrites the code in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
-2. Generates appropriate tests that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
-3. Disables JIT compilation while running coverage and tracer to get accurate coverage and trace information. Both of them rely on Python bytecode execution but JIT compiled code stops running as Python bytecode.
-4. Disables Line Profiler information collection whenever presented with JIT compiled code. It could be possible to disable JIT compilation and run the line profiler, but that would lead to inaccurate information which could misguide the optimization process.
-
-## Accurate Benchmarking on Non-CPU devices
-
-Since Non-CPU operations execute asynchronously, Codeflash automatically inserts synchronization barriers before measuring performance. This ensures timing measurements reflect actual computation time rather than just the time to queue operations:
-
-- **PyTorch**: Uses `torch.cuda.synchronize()` (NVIDIA GPUs) or `torch.mps.synchronize()` (MacOS Metal Performance Shaders) depending on the device.
-- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete.
-- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization.
+Just-in-time (JIT) compilation is a runtime technique where code is compiled into machine code on the fly, right before it is executed, to improve performance.. Codeflash supports optimizing numerical code using Just-in-Time (JIT) compilation via leveraging JIT compilers from popular frameworks including **Numba**, **PyTorch**, **TensorFlow**, and **JAX**.
 
 ## When JIT Compilation Helps
 
@@ -157,7 +101,7 @@ JIT compilation may not provide speedups when:
 
 #### Function Definition
 
-```
+```python
 def adaptive_processing(x, threshold=0.5):
     """Function with data-dependent control flow - compile struggles here"""
     # Check how many values exceed threshold (data-dependent!)
@@ -177,7 +121,7 @@ def adaptive_processing(x, threshold=0.5):
 
 #### Benchmarking Snippet (replace `cuda` with `mps` to run on your Mac)
 
-```
+```python
 # Create compiled version
 adaptive_processing_compiled = torch.compile(adaptive_processing)
 
@@ -253,7 +197,6 @@ Optimized: 0.0277s
 Speedup compared to Uncompiled: 1.57x
 ```
 
-
 Key improvements:
 
 1. Eliminate `.item()` - Keep computation on GPU.
@@ -261,6 +204,54 @@ Key improvements:
 3. Vectorization - Replace conditionals with masked operations.
 4. Reduce Python overhead - Minimize host-device synchronization.
 
+## Supported JIT Frameworks
+
+Each framework uses different compilation strategies to accelerate Python code:
+
+### Numba (CPU Code)
+
+Numba compiles Python functions to optimized machine code using the LLVM compiler infrastructure. Codeflash can suggest Numba optimizations that use:
+
+- **`@jit`** - General-purpose JIT compilation with optional flags.
+  - **`nopython=True`** - Compiles to machine code without falling back to the Python interpreter.
+  - **`fastmath=True`** - Uses aggressive floating-point optimizations via LLVM's fastmath flag.
+  - **`cache=True`** - Caches the compiled function to disk, which avoids recompilation on future runs.
+  - **`parallel=True`** - Parallelizes code inside loops.
+
+### PyTorch
+
+PyTorch provides JIT compilation through `torch.compile()`, the recommended compilation API introduced in PyTorch 2.0. It uses TorchDynamo to capture Python bytecode and TorchInductor to generate optimized kernels.
+
+- **`torch.compile()`** - Compiles a function or module for optimized execution.
+  - **`mode`** - Controls the compilation strategy:
+    - `"default"` - Balanced compilation with moderate optimization.
+    - `"reduce-overhead"` - Minimizes Python overhead using CUDA graphs, ideal for small batches.
+    - `"max-autotune"` - Spends more time auto-tuning to find the fastest kernels.
+  - **`fullgraph=True`** - Requires the entire function to be captured as a single graph. Raises an error if graph breaks occur, useful for ensuring complete optimization.
+  - **`dynamic=True`** - Enables dynamic shape support, allowing the compiled function to handle varying input sizes without recompilation.
+
+### TensorFlow
+
+TensorFlow uses `@tf.function` to compile Python functions into optimized TensorFlow graphs. When combined with XLA (Accelerated Linear Algebra), it can generate highly optimized machine code for both CPU and GPU.
+
+- **`@tf.function`** - Converts Python functions into TensorFlow graphs for optimized execution.
+  - **`jit_compile=True`** - Enables XLA compilation, which performs whole-function optimization including operation fusion, memory layout optimization, and target-specific code generation.
+
+### JAX
+
+JAX uses XLA to JIT compile pure functions into optimized machine code. It emphasizes functional programming patterns and captures side-effect-free operations for optimization.
+
+- **`@jax.jit`** - JIT compiles functions using XLA with automatic operation fusion.
+
+## How Codeflash Optimizes with JIT
+
+When Codeflash identifies a function that could benefit from JIT compilation, it:
+
+1. Rewrites the code in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
+2. Generates appropriate tests that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
+3. Disables JIT compilation while running coverage and tracer to get accurate coverage and trace information. Both of them rely on Python bytecode execution but JIT compiled code stops running as Python bytecode.
+4. Disables Line Profiler information collection whenever presented with JIT compiled code. It could be possible to disable JIT compilation and run the line profiler, but that would lead to inaccurate information which could misguide the optimization process.
+
 ## Configuration
 
 JIT compilation support is **enabled automatically** in Codeflash. You don't need to modify any configuration to enable JIT-based optimizations. Codeflash will automatically detect when JIT compilation could improve performance and suggest appropriate optimizations.
\ No newline at end of file

From 446fdf9f1d97feeb4e5f7a46dd9aafbace96f3cb Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 15:58:53 -0800
Subject: [PATCH 08/15] sample code not needed right now

---
 code_to_optimize/complex_activation.py        | 11 ---
 .../tests/pytest/test_complex_activation.py   | 88 -------------------
 2 files changed, 99 deletions(-)
 delete mode 100644 code_to_optimize/complex_activation.py
 delete mode 100644 code_to_optimize/tests/pytest/test_complex_activation.py

diff --git a/code_to_optimize/complex_activation.py b/code_to_optimize/complex_activation.py
deleted file mode 100644
index d9ed216d3..000000000
--- a/code_to_optimize/complex_activation.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import torch
-def complex_activation(x):
-    """A custom activation with many small operations - compile makes a huge difference"""
-    # Many sequential element-wise ops create kernel launch overhead
-    x = torch.sin(x)
-    x = x * torch.cos(x)
-    x = x + torch.exp(-x.abs())
-    x = x / (1 + x.pow(2))
-    x = torch.tanh(x) * torch.sigmoid(x)
-    x = x - 0.5 * x.pow(3)
-    return x
\ No newline at end of file
diff --git a/code_to_optimize/tests/pytest/test_complex_activation.py b/code_to_optimize/tests/pytest/test_complex_activation.py
deleted file mode 100644
index 3fa8f0d12..000000000
--- a/code_to_optimize/tests/pytest/test_complex_activation.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""
-Unit tests for complex_activation function.
-
-Tests run on CUDA device with a single tensor shape.
-"""
-
-import pytest
-import torch
-
-from code_to_optimize.complex_activation import complex_activation
-
-
-@pytest.fixture
-def cuda_device():
-    """Return CUDA device, skip if not available."""
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA not available")
-    return torch.device("cuda")
-
-
-@pytest.fixture
-def input_tensor(cuda_device):
-    """Create a fixed-shape input tensor on CUDA."""
-    torch.manual_seed(42)
-    return torch.randn(32, 64, device=cuda_device, dtype=torch.float32)
-
-
-class TestComplexActivation:
-    """Tests for the complex_activation function."""
-
-    def test_output_shape(self, input_tensor):
-        """Test that output shape matches input shape."""
-        result = complex_activation(input_tensor)
-        assert result.shape == input_tensor.shape
-
-    def test_output_dtype(self, input_tensor):
-        """Test that output dtype matches input dtype."""
-        result = complex_activation(input_tensor)
-        assert result.dtype == input_tensor.dtype
-
-    def test_output_device(self, input_tensor, cuda_device):
-        """Test that output is on the same device as input."""
-        result = complex_activation(input_tensor)
-        assert result.device.type == cuda_device.type
-
-    def test_deterministic(self, input_tensor):
-        """Test that the function produces deterministic results."""
-        result1 = complex_activation(input_tensor.clone())
-        result2 = complex_activation(input_tensor.clone())
-        torch.testing.assert_close(result1, result2)
-
-    def test_output_is_finite(self, input_tensor):
-        """Test that output contains no NaN or Inf values."""
-        result = complex_activation(input_tensor)
-        assert torch.isfinite(result).all()
-
-    def test_output_bounded(self, input_tensor):
-        """Test that output values are bounded (activation should not explode)."""
-        result = complex_activation(input_tensor)
-        assert result.abs().max() < 10.0
-
-    def test_zero_input(self, cuda_device):
-        """Test behavior with zero input."""
-        x = torch.zeros(32, 64, device=cuda_device, dtype=torch.float32)
-        result = complex_activation(x)
-        assert torch.isfinite(result).all()
-        assert result.shape == x.shape
-
-    def test_positive_input(self, cuda_device):
-        """Test behavior with all positive inputs."""
-        x = torch.abs(torch.randn(32, 64, device=cuda_device, dtype=torch.float32)) + 0.1
-        result = complex_activation(x)
-        assert torch.isfinite(result).all()
-
-    def test_negative_input(self, cuda_device):
-        """Test behavior with all negative inputs."""
-        x = -torch.abs(torch.randn(32, 64, device=cuda_device, dtype=torch.float32)) - 0.1
-        result = complex_activation(x)
-        assert torch.isfinite(result).all()
-
-    def test_gradient_flow(self, cuda_device):
-        """Test that gradients can flow through the activation."""
-        x = torch.randn(32, 64, device=cuda_device, dtype=torch.float32, requires_grad=True)
-        result = complex_activation(x)
-        loss = result.sum()
-        loss.backward()
-        assert x.grad is not None
-        assert torch.isfinite(x.grad).all()
\ No newline at end of file

From 7b9d09af49ae13afb59953c41e765b34ea74c859 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 16:07:40 -0800
Subject: [PATCH 09/15] mintlify icon

---
 docs/codeflash-concepts/benchmarking-gpu-code.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/codeflash-concepts/benchmarking-gpu-code.mdx b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
index 9a54e700c..1bd24c268 100644
--- a/docs/codeflash-concepts/benchmarking-gpu-code.mdx
+++ b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
@@ -1,7 +1,7 @@
 ---
 title: "How Codeflash Measures Code Runtime on GPUs"
 description: "Learn how Codeflash accurately measures code performance on GPUs"
-icon: "stopwatch"
+icon: "microchip"
 sidebarTitle: "GPU Runtime Measurement"
 keywords: ["benchmarking", "performance", "timing", "measurement", "runtime", "noise reduction", "GPU", "MPS"]
 ---

From 3ab8fbbd812575ba2e9b56a4b9492ffd30aedff3 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 16:37:23 -0800
Subject: [PATCH 10/15] almost ready

---
 docs/codeflash-concepts/benchmarking-gpu-code.mdx | 14 ++++++--------
 docs/docs.json                                    |  4 ++--
 docs/support-for-jit/index.mdx                    | 14 +++++++-------
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/docs/codeflash-concepts/benchmarking-gpu-code.mdx b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
index 1bd24c268..62f6c9607 100644
--- a/docs/codeflash-concepts/benchmarking-gpu-code.mdx
+++ b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
@@ -6,9 +6,9 @@ sidebarTitle: "GPU Runtime Measurement"
 keywords: ["benchmarking", "performance", "timing", "measurement", "runtime", "noise reduction", "GPU", "MPS"]
 ---
 
-## Accurate Benchmarking on GPU devices (NVIDIA GPUs and Mac Metal Performance Shaders)
+## Accurate Benchmarking on GPU devices
 
-When a GPU operation is executed, it executes **asynchronously**. This means the CPU queues up work for the GPU and immediately continues to the next line of code - it doesn't wait for the GPU to finish. Accurate measurement of code execution on GPUs involves the insertion of synchronization barriers to ensure no pending GPU tasks are executing before and after the timing measurements are made.
+When a GPU (Graphics Processing Unit) operation is executed, it executes **asynchronously**. This means the CPU (Central Processing Unit) queues up work for the GPU and immediately continues to the next line of code - it doesn't wait for the GPU to finish. Accurate measurement of code execution on GPUs involves the insertion of synchronization barriers to ensure no pending GPU tasks are executing before and after the timing measurements are made.
 
 ## Illustration
 
@@ -100,19 +100,17 @@ print(f"With synchronize: {(t1 - t0) / 1e6:.3f} ms")
 ```
 
 
-Output on NVIDIA GPU
+Expected Output on CUDA
 
 ```
 Without synchronize: 69.157 ms
 With synchronize: 152.277 ms
 ```
 
-# How codeflash measures execution time involving GPUs
+# How Codeflash measures execution time involving GPUs
 
-Codeflash automatically inserts synchronization barriers before measuring performance. It currently supports GPU code written in `Pytorch`, `Tensorflow` and `JAX`.
+Codeflash automatically inserts synchronization barriers before measuring performance. It currently supports GPU code written in `PyTorch`, `TensorFlow` and `JAX` for NVIDIA GPUs (CUDA) and macOS Metal Performance Shaders (MPS).
 
-- **PyTorch**: Uses `torch.cuda.synchronize()` (NVIDIA GPUs) or `torch.mps.synchronize()` (MacOS Metal Performance Shaders) depending on the device.
+- **PyTorch**: Uses `torch.cuda.synchronize()` (CUDA) or `torch.mps.synchronize()` (MPS) depending on the device.
 - **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete. It works for both CUDA and MPS devices.
 - **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization. It works for both CUDA and MPS devices.
-
-Codeflash would support ROCm and TPU devices in the near future.
\ No newline at end of file
diff --git a/docs/docs.json b/docs/docs.json
index e81e21cd4..4baf42c7f 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -67,8 +67,8 @@
             "pages": [
               "codeflash-concepts/how-codeflash-works",
               "codeflash-concepts/benchmarking",
-              "support-for-jit/index",
-              "codeflash-concepts/benchmarking-gpu-code"
+              "codeflash-concepts/benchmarking-gpu-code",
+              "support-for-jit/index"
             ]
           },
           {
diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index 2321609d8..970ecf934 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -8,7 +8,7 @@ keywords: ["JIT", "just-in-time", "numba", "pytorch", "tensorflow", "jax", "GPU"
 
 # Just-in-Time Compilation
 
-Just-in-time (JIT) compilation is a runtime technique where code is compiled into machine code on the fly, right before it is executed, to improve performance.. Codeflash supports optimizing numerical code using Just-in-Time (JIT) compilation via leveraging JIT compilers from popular frameworks including **Numba**, **PyTorch**, **TensorFlow**, and **JAX**.
+Just-in-time (JIT) compilation is a runtime technique where code is compiled into machine code on the fly, right before it is executed, to improve performance. Codeflash supports optimizing numerical code with JIT compilation by leveraging the JIT compilers from the **Numba**, **PyTorch**, **TensorFlow**, and **JAX** frameworks.
 
 ## When JIT Compilation Helps
 
@@ -17,7 +17,7 @@ JIT compilation is most effective for:
 - Numerical computations with loops that can't be easily vectorized.
 - Custom algorithms not covered by existing optimized libraries.
 - Functions that are called repeatedly with consistent input types.
-- Code that benefits from hardware-specific optimizations (SIMD, GPU acceleration).
+- Code that benefits from hardware-specific optimizations (SIMD acceleration).
 
 ### Example
 
@@ -47,7 +47,7 @@ complex_activation_compiled = torch.compile(complex_activation)
 # Benchmark
 x = torch.randn(1000, 1000, device='cuda')
 
-# Warmup
+# Warmup: the first calls are slower because the JIT compiler traces the function and compiles it to machine code
 for _ in range(10):
     _ = complex_activation(x)
     _ = complex_activation_compiled(x)
@@ -83,7 +83,7 @@ Speedup: 2.80x
 
 Here, JIT compilation via `torch.compile` is the only viable option because
 1. Already vectorized - All operations are already PyTorch tensor ops.
-2. Multiple Kernel Launches - Uncompiled code launches ~10 separate kernels. torch.compile fuses them into 1-2 kernels, eliminating kernel launch overhead.
+2. Multiple Kernel Launches - Uncompiled code launches ~10 separate kernels. `torch.compile` fuses them into 1-2 kernels, eliminating kernel launch overhead.
 3. No algorithmic improvement - The computation itself is already optimal.
 4. Python overhead elimination - Removes Python interpreter overhead between operations.
 
@@ -128,7 +128,7 @@ adaptive_processing_compiled = torch.compile(adaptive_processing)
 # Test with data that causes branch variation
 x = torch.randn(500, 500, device='cuda')
 
-# Warmup
+# Warmup: the first calls are slower because the JIT compiler traces the function and compiles it to machine code
 for _ in range(10):
     _ = adaptive_processing(x)
     _ = adaptive_processing_compiled(x)
@@ -249,8 +249,8 @@ When Codeflash identifies a function that could benefit from JIT compilation, it
 
 1. Rewrites the code in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
 2. Generates appropriate tests that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
-3. Disables JIT compilation while running coverage and tracer to get accurate coverage and trace information. Both of them rely on Python bytecode execution but JIT compiled code stops running as Python bytecode.
-4. Disables Line Profiler information collection whenever presented with JIT compiled code. It could be possible to disable JIT compilation and run the line profiler, but that would lead to inaccurate information which could misguide the optimization process.
+3. Disables JIT compilation when running coverage and tracer. This ensures accurate coverage and trace data, since both rely on Python bytecode execution. JIT-compiled code bypasses Python bytecode, so it would prevent proper tracking.
+4. Disables the Line Profiler for JIT compiled code. It could be possible to disable JIT compilation and run the line profiler, but that would lead to inaccurate information which could misguide the optimization process.
 
 ## Configuration
 

From 754eb6cc5e9130947979a13934d774ec00478023 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 16:49:41 -0800
Subject: [PATCH 11/15] improve gantt chart

---
 docs/codeflash-concepts/benchmarking-gpu-code.mdx | 13 +++++++------
 docs/docs.json                                    |  2 +-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/docs/codeflash-concepts/benchmarking-gpu-code.mdx b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
index 62f6c9607..5353af745 100644
--- a/docs/codeflash-concepts/benchmarking-gpu-code.mdx
+++ b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
@@ -2,7 +2,7 @@
 title: "How Codeflash Measures Code Runtime on GPUs"
 description: "Learn how Codeflash accurately measures code performance on GPUs"
 icon: "microchip"
-sidebarTitle: "GPU Runtime Measurement"
+sidebarTitle: "GPU Benchmarking"
 keywords: ["benchmarking", "performance", "timing", "measurement", "runtime", "noise reduction", "GPU", "MPS"]
 ---
 
@@ -55,14 +55,15 @@ gantt
     Launch Kernel 1         :active, cpu0, 4, 8
     Launch Kernel 2         :active, cpu1, 8, 12
     Launch Kernel 3         :active, cpu2, 12, 16
-    Device Synchronization  :done, wait, 16, 29
-    Timer End               :milestone, m2, 29, 29
+    Device Synchronization  :done, wait, 16, 33
+    Timer End               :milestone, m2, 33, 33
 
     section CUDA Stream
     Previous Work         :done, wait, 0, 4
-    Kernel 1              :active, k1, 4, 11
-    Kernel 2              :active, k2, 11, 18
-    Kernel 3              :active, k3, 18, 29
+    Waiting               :done, wait, 4, 8
+    Kernel 1              :active, k1, 8, 15
+    Kernel 2              :active, k2, 15, 22
+    Kernel 3              :active, k3, 22, 33
 ```
 
 Here you can see that a device synchronization call is made before executing the code, this ensures that the CPU waits for any pending GPU tasks to finish before starting the timer. After the launch of the final kernel, another device synchronization call is made which ensures all pending GPU tasks are finished before measuring the runtime.
diff --git a/docs/docs.json b/docs/docs.json
index 4baf42c7f..87236e236 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -67,7 +67,7 @@
             "pages": [
               "codeflash-concepts/how-codeflash-works",
               "codeflash-concepts/benchmarking",
-              "codeflash-concepts/benchmarking-gpu-code",,
+              "codeflash-concepts/benchmarking-gpu-code",
               "support-for-jit/index"
             ]
           },

From ec3eed6b8ad4065b3f2eb8d83bb269611f420c73 Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 16:54:04 -0800
Subject: [PATCH 12/15] ready to review

---
 docs/codeflash-concepts/benchmarking-gpu-code.mdx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/codeflash-concepts/benchmarking-gpu-code.mdx b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
index 5353af745..41d4f1d89 100644
--- a/docs/codeflash-concepts/benchmarking-gpu-code.mdx
+++ b/docs/codeflash-concepts/benchmarking-gpu-code.mdx
@@ -110,8 +110,8 @@ With synchronize: 152.277 ms
 
 # How Codeflash measures execution time involving GPUs
 
-Codeflash automatically inserts synchronization barriers before measuring performance. It currently supports GPU code written in `Pytorch`, `Tensorflow` and `JAX` for NVIDIA GPUs (CUDA) and MacOS Metal Performance Shaders (MPS).
+Codeflash automatically inserts synchronization barriers before measuring performance. It currently supports GPU code written in `PyTorch`, `TensorFlow` and `JAX` for NVIDIA GPUs (`CUDA`) and macOS Metal Performance Shaders (`MPS`).
 
-- **PyTorch**: Uses `torch.cuda.synchronize()` (CUDA) or `torch.mps.synchronize()` (MPS) depending on the device.
-- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete. It works for both CUDA and MPS devices.
-- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization. It works for both CUDA and MPS devices.
+- **PyTorch**: Uses `torch.cuda.synchronize()` (`CUDA`) or `torch.mps.synchronize()` (`MPS`) depending on the device.
+- **JAX**: Uses `jax.block_until_ready()` to wait for computation to complete. It works for both `CUDA` and `MPS` devices.
+- **TensorFlow**: Uses `tf.test.experimental.sync_devices()` for device synchronization. It works for both `CUDA` and `MPS` devices.

From 8c66acb450e36b6a6f980b4a54eaaaa6f20753ba Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Mon, 26 Jan 2026 17:33:58 -0800
Subject: [PATCH 13/15] reordering sections

---
 docs/support-for-jit/index.mdx | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/docs/support-for-jit/index.mdx b/docs/support-for-jit/index.mdx
index 970ecf934..0da08dacc 100644
--- a/docs/support-for-jit/index.mdx
+++ b/docs/support-for-jit/index.mdx
@@ -10,6 +10,19 @@ keywords: ["JIT", "just-in-time", "numba", "pytorch", "tensorflow", "jax", "GPU"
 
 Just-in-time (JIT) compilation is a runtime technique where code is compiled into machine code on the fly, right before it is executed, to improve performance. Codeflash supports optimizing numerical code using Just-in-Time (JIT) compilation via leveraging JIT compilers from the **Numba**, **PyTorch**, **TensorFlow**, and **JAX** frameworks.
 
+## How Codeflash Optimizes with JIT
+
+When Codeflash identifies a function that could benefit from JIT compilation, it:
+
+1. Rewrites the code in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
+2. Generates appropriate tests that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
+3. Disables JIT compilation when running the coverage and tracer tools. This ensures accurate coverage and trace data, since both rely on Python bytecode execution. JIT-compiled code bypasses Python bytecode, so leaving JIT compilation enabled would prevent proper tracking.
+4. Disables the Line Profiler for JIT-compiled code. It would be possible to disable JIT compilation and run the line profiler, but that would produce inaccurate information that could mislead the optimization process.
+
+## Configuration
+
+JIT compilation support is **enabled automatically** in Codeflash. You don't need to modify any configuration to enable JIT-based optimizations. Codeflash will automatically detect when JIT compilation could improve performance and suggest appropriate optimizations.
+
 ## When JIT Compilation Helps
 
 JIT compilation is most effective for:
@@ -241,17 +254,4 @@ TensorFlow uses `@tf.function` to compile Python functions into optimized Tensor
 
 JAX uses XLA to JIT compile pure functions into optimized machine code. It emphasizes functional programming patterns and captures side-effect-free operations for optimization.
 
-- **`@jax.jit`** - JIT compiles functions using XLA with automatic operation fusion.
-
-## How Codeflash Optimizes with JIT
-
-When Codeflash identifies a function that could benefit from JIT compilation, it:
-
-1. Rewrites the code in a JIT-compatible format, which may involve breaking down complex functions into separate JIT-compiled components.
-2. Generates appropriate tests that are compatible with JIT-compiled code, carefully handling data types since JIT compilers have stricter input type requirements.
-3. Disables JIT compilation when running coverage and tracer. This ensures accurate coverage and trace data, since both rely on Python bytecode execution. JIT-compiled code bypasses Python bytecode, so it would prevent proper tracking.
-4. Disables the Line Profiler for JIT compiled code. It could be possible to disable JIT compilation and run the line profiler, but that would lead to inaccurate information which could misguide the optimization process.
-
-## Configuration
-
-JIT compilation support is **enabled automatically** in Codeflash. You don't need to modify any configuration to enable JIT-based optimizations. Codeflash will automatically detect when JIT compilation could improve performance and suggest appropriate optimizations.
\ No newline at end of file
+- **`@jax.jit`** - JIT compiles functions using XLA with automatic operation fusion.
\ No newline at end of file

From b9cc789a2c011af294339e045adea595b57c70bc Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Thu, 29 Jan 2026 12:12:45 -0800
Subject: [PATCH 14/15] precommit fix

---
 codeflash/benchmarking/plugin/plugin.py           |  8 ++++----
 codeflash/cli_cmds/cmd_init.py                    | 12 ++----------
 codeflash/cli_cmds/init_javascript.py             |  2 +-
 codeflash/code_utils/code_utils.py                |  2 +-
 codeflash/code_utils/config_js.py                 |  4 ++--
 codeflash/code_utils/deduplicate_code.py          |  5 +----
 codeflash/code_utils/instrument_existing_tests.py |  4 ++--
 codeflash/context/code_context_extractor.py       |  4 ++--
 codeflash/languages/javascript/module_system.py   |  2 +-
 codeflash/languages/javascript/support.py         |  2 +-
 codeflash/lsp/features/perform_optimization.py    |  4 ++--
 codeflash/lsp/lsp_logger.py                       |  2 +-
 codeflash/telemetry/posthog_cf.py                 |  2 +-
 codeflash/verification/comparator.py              |  2 +-
 codeflash/verification/equivalence.py             |  4 +---
 codeflash/verification/pytest_plugin.py           |  2 +-
 codeflash/version.py                              |  2 +-
 17 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/codeflash/benchmarking/plugin/plugin.py b/codeflash/benchmarking/plugin/plugin.py
index 8c502d143..b5639ddf5 100644
--- a/codeflash/benchmarking/plugin/plugin.py
+++ b/codeflash/benchmarking/plugin/plugin.py
@@ -200,7 +200,7 @@ def get_benchmark_timings(trace_path: Path) -> dict[BenchmarkKey, int]:
 
     # Pytest hooks
     @pytest.hookimpl
-    def pytest_sessionfinish(self, session, exitstatus) -> None:  # noqa: ANN001
+    def pytest_sessionfinish(self, session, exitstatus) -> None:
         """Execute after whole test run is completed."""
         # Write any remaining benchmark timings to the database
         codeflash_trace.close()
@@ -236,20 +236,20 @@ class Benchmark:  # noqa: D106
         def __init__(self, request: pytest.FixtureRequest) -> None:
             self.request = request
 
-        def __call__(self, func, *args, **kwargs):  # noqa: ANN001, ANN002, ANN003, ANN204
+        def __call__(self, func, *args, **kwargs):  # noqa: ANN002, ANN003, ANN204
             """Handle both direct function calls and decorator usage."""
             if args or kwargs:
                 # Used as benchmark(func, *args, **kwargs)
                 return self._run_benchmark(func, *args, **kwargs)
 
             # Used as @benchmark decorator
-            def wrapped_func(*args, **kwargs):  # noqa: ANN002, ANN003, ANN202
+            def wrapped_func(*args, **kwargs):  # noqa: ANN002, ANN003
                 return func(*args, **kwargs)
 
             self._run_benchmark(func)
             return wrapped_func
 
-        def _run_benchmark(self, func, *args, **kwargs):  # noqa: ANN002, ANN003, ANN202
+        def _run_benchmark(self, func, *args, **kwargs):  # noqa: ANN002, ANN003
             """Actual benchmark implementation."""
             node_path = getattr(self.request.node, "path", None) or getattr(self.request.node, "fspath", None)
             if node_path is None:
diff --git a/codeflash/cli_cmds/cmd_init.py b/codeflash/cli_cmds/cmd_init.py
index 51ca1a4f2..7a83a9971 100644
--- a/codeflash/cli_cmds/cmd_init.py
+++ b/codeflash/cli_cmds/cmd_init.py
@@ -1474,11 +1474,7 @@ def customize_codeflash_yaml_content(
     return _customize_python_workflow_content(optimize_yml_content, git_root, benchmark_mode)
 
 
-def _customize_python_workflow_content(
-    optimize_yml_content: str,
-    git_root: Path,
-    benchmark_mode: bool = False,  # noqa: FBT001, FBT002
-) -> str:
+def _customize_python_workflow_content(optimize_yml_content: str, git_root: Path, benchmark_mode: bool = False) -> str:
     """Customize workflow content for Python projects."""
     # Get dependency installation commands
     toml_path = Path.cwd() / "pyproject.toml"
@@ -1513,11 +1509,7 @@ def _customize_python_workflow_content(
 
 
 # TODO:{claude} Refactor and move to support for language specific
-def _customize_js_workflow_content(
-    optimize_yml_content: str,
-    git_root: Path,
-    benchmark_mode: bool = False,  # noqa: FBT001, FBT002
-) -> str:
+def _customize_js_workflow_content(optimize_yml_content: str, git_root: Path, benchmark_mode: bool = False) -> str:
     """Customize workflow content for JavaScript/TypeScript projects."""
     from codeflash.cli_cmds.init_javascript import (
         get_js_codeflash_install_step,
diff --git a/codeflash/cli_cmds/init_javascript.py b/codeflash/cli_cmds/init_javascript.py
index 578b56ca5..22371982a 100644
--- a/codeflash/cli_cmds/init_javascript.py
+++ b/codeflash/cli_cmds/init_javascript.py
@@ -66,7 +66,7 @@ class JSSetupInfo:
 
 
 # Import theme from cmd_init to avoid duplication
-def _get_theme():  # noqa: ANN202
+def _get_theme():
     """Get the CodeflashTheme - imported lazily to avoid circular imports."""
     from codeflash.cli_cmds.cmd_init import CodeflashTheme
 
diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py
index bc23e844e..9244f6b11 100644
--- a/codeflash/code_utils/code_utils.py
+++ b/codeflash/code_utils/code_utils.py
@@ -436,7 +436,7 @@ def extract_unique_errors(pytest_output: str) -> set[str]:
     pattern = r"^E\s+(.*)$"
 
     for error_message in re.findall(pattern, pytest_output, re.MULTILINE):
-        error_message = error_message.strip()  # noqa: PLW2901
+        error_message = error_message.strip()
         if error_message:
             unique_errors.add(error_message)
 
diff --git a/codeflash/code_utils/config_js.py b/codeflash/code_utils/config_js.py
index 92f635c25..80cdbe216 100644
--- a/codeflash/code_utils/config_js.py
+++ b/codeflash/code_utils/config_js.py
@@ -105,7 +105,7 @@ def detect_module_root(project_root: Path, package_data: dict[str, Any]) -> str:
     return "."
 
 
-def detect_test_runner(project_root: Path, package_data: dict[str, Any]) -> str:  # noqa: ARG001
+def detect_test_runner(project_root: Path, package_data: dict[str, Any]) -> str:
     """Detect test runner from devDependencies or scripts.test.
 
     Detection order:
@@ -144,7 +144,7 @@ def detect_test_runner(project_root: Path, package_data: dict[str, Any]) -> str:
     return "jest"
 
 
-def detect_formatter(project_root: Path, package_data: dict[str, Any]) -> list[str] | None:  # noqa: ARG001
+def detect_formatter(project_root: Path, package_data: dict[str, Any]) -> list[str] | None:
     """Detect formatter from devDependencies.
 
     Detection order:
diff --git a/codeflash/code_utils/deduplicate_code.py b/codeflash/code_utils/deduplicate_code.py
index 097fbbb71..a69c52ef3 100644
--- a/codeflash/code_utils/deduplicate_code.py
+++ b/codeflash/code_utils/deduplicate_code.py
@@ -14,10 +14,7 @@
 
 
 def normalize_code(
-    code: str,
-    remove_docstrings: bool = True,
-    return_ast_dump: bool = False,
-    language: str | None = None,
+    code: str, remove_docstrings: bool = True, return_ast_dump: bool = False, language: str | None = None
 ) -> str:
     """Normalize code by parsing, cleaning, and normalizing variable names.
 
diff --git a/codeflash/code_utils/instrument_existing_tests.py b/codeflash/code_utils/instrument_existing_tests.py
index 6315830ce..4366468d0 100644
--- a/codeflash/code_utils/instrument_existing_tests.py
+++ b/codeflash/code_utils/instrument_existing_tests.py
@@ -89,7 +89,7 @@ def find_and_update_line_node(
         # it's much more efficient to visit nodes manually. We'll only descend into expressions/statements.
 
         # Helper for manual walk
-        def iter_ast_calls(node):  # noqa: ANN202
+        def iter_ast_calls(node):
             # Generator to yield each ast.Call in test_node, preserves node identity
             stack = [node]
             while stack:
@@ -102,7 +102,7 @@ def iter_ast_calls(node):  # noqa: ANN202
                     if isinstance(value, list):
                         for item in reversed(value):
                             if isinstance(item, ast.AST):
-                                stack.append(item)  # noqa: PERF401
+                                stack.append(item)
                     elif isinstance(value, ast.AST):
                         stack.append(value)
 
diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py
index 4bafc0aeb..28141dcb9 100644
--- a/codeflash/context/code_context_extractor.py
+++ b/codeflash/context/code_context_extractor.py
@@ -46,8 +46,8 @@ def build_testgen_context(
     helpers_of_fto_dict: dict[Path, set[FunctionSource]],
     helpers_of_helpers_dict: dict[Path, set[FunctionSource]],
     project_root_path: Path,
-    remove_docstrings: bool,  # noqa: FBT001
-    include_imported_classes: bool,  # noqa: FBT001
+    remove_docstrings: bool,
+    include_imported_classes: bool,
 ) -> CodeStringsMarkdown:
     """Build testgen context with optional imported class definitions and external base inits."""
     testgen_context = extract_code_markdown_context_from_files(
diff --git a/codeflash/languages/javascript/module_system.py b/codeflash/languages/javascript/module_system.py
index 6ed9d62f0..626cc2d32 100644
--- a/codeflash/languages/javascript/module_system.py
+++ b/codeflash/languages/javascript/module_system.py
@@ -185,7 +185,7 @@ def _get_relative_import_path(target_path: Path, source_path: Path) -> str:
 
 def add_js_extension(module_path: str) -> str:
     """Add .js extension to relative module paths for ESM compatibility."""
-    if module_path.startswith(("./", "../")):  # noqa: SIM102
+    if module_path.startswith(("./", "../")):
         if not module_path.endswith(".js") and not module_path.endswith(".mjs"):
             return module_path + ".js"
     return module_path
diff --git a/codeflash/languages/javascript/support.py b/codeflash/languages/javascript/support.py
index 3ca13c88e..86c258b52 100644
--- a/codeflash/languages/javascript/support.py
+++ b/codeflash/languages/javascript/support.py
@@ -1872,7 +1872,7 @@ def instrument_source_for_line_profiler(
             # Write instrumented code to source file
             source_file_path.write_text(instrumented_source, encoding="utf-8")
             logger.debug("Wrote instrumented source to %s", source_file_path)
-            return True  # noqa: TRY300
+            return True
         except Exception as e:
             logger.warning("Failed to instrument source for line profiling: %s", e)
             return False
diff --git a/codeflash/lsp/features/perform_optimization.py b/codeflash/lsp/features/perform_optimization.py
index 7f84b2e0e..d1f413a7a 100644
--- a/codeflash/lsp/features/perform_optimization.py
+++ b/codeflash/lsp/features/perform_optimization.py
@@ -51,10 +51,10 @@ def sync_perform_optimization(server: CodeflashLanguageServer, cancel_event: thr
     ctx_tests = contextvars.copy_context()
     ctx_opts = contextvars.copy_context()
 
-    def run_generate_tests():  # noqa: ANN202
+    def run_generate_tests():
         return function_optimizer.generate_and_instrument_tests(code_context)
 
-    def run_generate_optimizations():  # noqa: ANN202
+    def run_generate_optimizations():
         return function_optimizer.generate_optimizations(
             read_writable_code=code_context.read_writable_code,
             read_only_context_code=code_context.read_only_context_code,
diff --git a/codeflash/lsp/lsp_logger.py b/codeflash/lsp/lsp_logger.py
index 8f522ba39..eb4f2fe43 100644
--- a/codeflash/lsp/lsp_logger.py
+++ b/codeflash/lsp/lsp_logger.py
@@ -127,7 +127,7 @@ def enhanced_log(
 
 # Configure logging to stderr for VS Code output channel
 def setup_logging() -> logging.Logger:
-    global root_logger  # noqa: PLW0603
+    global root_logger
     if root_logger:
         return root_logger
     # Clear any existing handlers to prevent conflicts
diff --git a/codeflash/telemetry/posthog_cf.py b/codeflash/telemetry/posthog_cf.py
index 15df7d509..1638f1ffc 100644
--- a/codeflash/telemetry/posthog_cf.py
+++ b/codeflash/telemetry/posthog_cf.py
@@ -20,7 +20,7 @@ def initialize_posthog(*, enabled: bool = True) -> None:
     if not enabled:
         return
 
-    global _posthog  # noqa: PLW0603
+    global _posthog
     _posthog = Posthog(project_api_key="phc_aUO790jHd7z1SXwsYCz8dRApxueplZlZWeDSpKc5hol", host="https://us.posthog.com")
     _posthog.log.setLevel(logging.CRITICAL)  # Suppress PostHog logging
     ph("cli-telemetry-enabled")
diff --git a/codeflash/verification/comparator.py b/codeflash/verification/comparator.py
index f92b0d000..ad7c59ede 100644
--- a/codeflash/verification/comparator.py
+++ b/codeflash/verification/comparator.py
@@ -234,7 +234,7 @@ def comparator(orig: Any, new: Any, superset_obj=False) -> bool:
 
             try:
                 insp = sqlalchemy.inspection.inspect(orig)
-                insp = sqlalchemy.inspection.inspect(new)  # noqa: F841
+                insp = sqlalchemy.inspection.inspect(new)
                 orig_keys = orig.__dict__
                 new_keys = new.__dict__
                 for key in list(orig_keys.keys()):
diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index 0ebd48fea..f660e35ea 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -28,9 +28,7 @@ def safe_repr(obj: object) -> str:
 
 
 def compare_test_results(
-    original_results: TestResults,
-    candidate_results: TestResults,
-    pass_fail_only: bool = False,  # noqa: FBT001, FBT002
+    original_results: TestResults, candidate_results: TestResults, pass_fail_only: bool = False
 ) -> tuple[bool, list[TestDiff]]:
     # This is meant to be only called with test results for the first loop index
     if len(original_results) == 0 or len(candidate_results) == 0:
diff --git a/codeflash/verification/pytest_plugin.py b/codeflash/verification/pytest_plugin.py
index 40324dbcb..0b7144356 100644
--- a/codeflash/verification/pytest_plugin.py
+++ b/codeflash/verification/pytest_plugin.py
@@ -383,7 +383,7 @@ def pytest_runtestloop(self, session: Session) -> bool:
             count += 1
             loop_start = _ORIGINAL_PERF_COUNTER_NS()
             for index, item in enumerate(session.items):
-                item: pytest.Item = item  # noqa: PLW0127, PLW2901
+                item: pytest.Item = item  # noqa: PLW0127
                 item._report_sections.clear()  # clear reports for new test  # noqa: SLF001
 
                 if total_time > SHORTEST_AMOUNT_OF_TIME:
diff --git a/codeflash/version.py b/codeflash/version.py
index ec305ddad..9b187ab4b 100644
--- a/codeflash/version.py
+++ b/codeflash/version.py
@@ -1,2 +1,2 @@
 # These version placeholders will be replaced by uv-dynamic-versioning during build.
-__version__ = "0.20.0.post91.dev0+28f8eb18"
+__version__ = "0.20.0.post202.dev0+d020da82"

From 5f74b14e861d3cf77038b7a0adbe09bbbeee3b3e Mon Sep 17 00:00:00 2001
From: aseembits93 <aseem.bits@gmail.com>
Date: Thu, 29 Jan 2026 12:14:13 -0800
Subject: [PATCH 15/15] restore version

---
 codeflash/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codeflash/version.py b/codeflash/version.py
index 9b187ab4b..6225467e3 100644
--- a/codeflash/version.py
+++ b/codeflash/version.py
@@ -1,2 +1,2 @@
 # These version placeholders will be replaced by uv-dynamic-versioning during build.
-__version__ = "0.20.0.post202.dev0+d020da82"
+__version__ = "0.20.0"