stackav-oss
diff --git a/benchmarks/copy_blocks_benchmark.py b/benchmarks/copy_blocks_benchmark.py
Lines changed: 21 additions & 4 deletions
diff --git a/benchmarks/fused_add_rms_norm_benchmark.py b/benchmarks/fused_add_rms_norm_benchmark.py
Lines changed: 23 additions & 4 deletions
diff --git a/benchmarks/gelu_tanh_and_mul_benchmark.py b/benchmarks/gelu_tanh_and_mul_benchmark.py
Lines changed: 23 additions & 4 deletions
diff --git a/benchmarks/gemma_rms_norm_benchmark.py b/benchmarks/gemma_rms_norm_benchmark.py
Lines changed: 21 additions & 4 deletions
diff --git a/benchmarks/reshape_and_cache_benchmark.py b/benchmarks/reshape_and_cache_benchmark.py
Lines changed: 23 additions & 6 deletions
@@ -98,6 +98,16 @@
     is_flag=True,
     help="Flag for printing results in CSV format",
 )
+@click.option(
+    "--compile-ref",
+    is_flag=True,
+    help="Flag to torch.compile() the reference impl",
+)
+@click.option(
+    "--compile-conch",
+    is_flag=True,
+    help="Flag to torch.compile() the Conch impl",
+)
 def main(
     head_dim: int,
     num_layers: int,
@@ -111,6 +121,8 @@ def main(
     verbose: bool,
     gpu: str,
     csv: bool,
+    compile_ref: bool,
+    compile_conch: bool,
 ) -> None:
     """Benchmark Conch copy_blocks operation.
 
@@ -127,6 +139,8 @@ def main(
         verbose: Flag to indicate whether or not to print verbose output.
         gpu: Which gpu to run on.
         csv: Flag to indicate whether or not to print results in CSV format.
+        compile_ref: Flag to torch.compile() the reference implementation.
+        compile_conch: Flag to torch.compile() the Conch implementation.
     """
     seed: Final = 0
     seed_everything(seed)
@@ -179,10 +193,13 @@ def main(
     # Convert mapping list to tensor
     block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64, device=device).view(-1, 2)
 
+    copy_blocks_ref_fn = torch.compile(copy_blocks_reference) if compile_ref else copy_blocks_reference
+    copy_blocks_conch_fn = torch.compile(copy_blocks_conch) if compile_conch else copy_blocks_conch
+
     # Run the reference implementation.
-    copy_blocks_reference(cloned_key_caches, cloned_value_caches, block_mapping)
+    copy_blocks_ref_fn(cloned_key_caches, cloned_value_caches, block_mapping)
     # Call Conch kernel
-    copy_blocks_conch(key_caches, value_caches, block_mapping_tensor)
+    copy_blocks_conch_fn(key_caches, value_caches, block_mapping_tensor)
 
     # Compare the results.
     num_key_matched = 0
@@ -215,7 +232,7 @@ def main(
 
     # Benchmark Reference vs. Conch implementations
     baseline_result = benchmark_it(
-        lambda: copy_blocks_reference(
+        lambda: copy_blocks_ref_fn(
             cloned_key_caches,
             cloned_value_caches,
             block_mapping,
@@ -227,7 +244,7 @@ def main(
     )
 
     conch_result = benchmark_it(
-        lambda: copy_blocks_conch(
+        lambda: copy_blocks_conch_fn(
             key_caches,
             value_caches,
             block_mapping_tensor,
 
@@ -62,6 +62,16 @@
     is_flag=True,
     help="Flag for printing results in CSV format",
 )
+@click.option(
+    "--compile-ref",
+    is_flag=True,
+    help="Flag to torch.compile() the reference impl",
+)
+@click.option(
+    "--compile-conch",
+    is_flag=True,
+    help="Flag to torch.compile() the Conch impl",
+)
 def main(  # noqa: PLR0913
     hidden_size: int,
     num_tokens: int,
@@ -70,6 +80,8 @@ def main(  # noqa: PLR0913
     verbose: bool,
     gpu: str,
     csv: bool,
+    compile_ref: bool,
+    compile_conch: bool,
 ) -> None:
     """Benchmark Conch rms_norm op.
 
@@ -81,6 +93,8 @@ def main(  # noqa: PLR0913
         verbose: Flag to indicate whether or not to print verbose output.
         gpu: Which gpu to run on.
         csv: Flag to indicate whether or not to print results in CSV format.
+        compile_ref: Flag to torch.compile() the reference implementation.
+        compile_conch: Flag to torch.compile() the Conch implementation.
     """
     seed: Final = 0
     seed_everything(seed)
@@ -110,9 +124,14 @@ def main(  # noqa: PLR0913
     conch_residual = residual.clone()
     ref_residual = residual.clone()
 
-    conch_output, conch_residual = fused_add_rms_norm_conch(conch_x, conch_residual, weight, epsilon)
+    fused_add_rms_norm_conch_fn = torch.compile(fused_add_rms_norm_conch) if compile_conch else fused_add_rms_norm_conch
+    fused_add_rms_norm_ref_fn = (
+        torch.compile(fused_add_rms_norm_reference) if compile_ref else fused_add_rms_norm_reference
+    )
+
+    conch_output, conch_residual = fused_add_rms_norm_conch_fn(conch_x, conch_residual, weight, epsilon)
 
-    ref_output, ref_residual = fused_add_rms_norm_reference(ref_x, ref_residual, weight, epsilon)
+    ref_output, ref_residual = fused_add_rms_norm_ref_fn(ref_x, ref_residual, weight, epsilon)
 
     if not torch.allclose(ref_output, conch_output, atol=tolerance, rtol=tolerance):
         print(f"WARNING: Reference and Conch results differ! (atol={tolerance}, rtol={tolerance})", file=sys.stderr)
@@ -136,7 +155,7 @@ def main(  # noqa: PLR0913
 
     # Benchmark Reference vs. Conch implementations
     baseline_result = benchmark_it(
-        lambda: fused_add_rms_norm_reference(
+        lambda: fused_add_rms_norm_ref_fn(
             ref_x,
             ref_residual,
             weight,
@@ -149,7 +168,7 @@ def main(  # noqa: PLR0913
     )
 
     conch_result = benchmark_it(
-        lambda: fused_add_rms_norm_conch(
+        lambda: fused_add_rms_norm_conch_fn(
             conch_x,
             conch_residual,
             weight,
 
@@ -69,6 +69,16 @@
     is_flag=True,
     help="Flag for printing results in CSV format",
 )
+@click.option(
+    "--compile-ref",
+    is_flag=True,
+    help="Flag to torch.compile() the reference impl",
+)
+@click.option(
+    "--compile-conch",
+    is_flag=True,
+    help="Flag to torch.compile() the Conch impl",
+)
 def main(
     hidden_size: int,
     num_tokens: int,
@@ -78,6 +88,8 @@ def main(
     verbose: bool,
     gpu: str,
     csv: bool,
+    compile_ref: bool,
+    compile_conch: bool,
 ) -> None:
     """Benchmark Conch GeluTanhAndMul op.
 
@@ -90,6 +102,8 @@ def main(
         verbose: Flag to indicate whether or not to print verbose output.
         gpu: Which gpu to run on.
         csv: Flag for printing results in CSV format.
+        compile_ref: Flag to torch.compile() the reference implementation.
+        compile_conch: Flag to torch.compile() the Conch implementation.
     """
     seed: Final = 0
     seed_everything(seed)
@@ -107,8 +121,13 @@ def main(
 
     projections = torch.rand((num_tokens, hidden_size * 2), device=device)
 
-    ref_output = gelu_tanh_and_mul_reference(projections)
-    conch_output = gelu_tanh_and_mul_conch(projections)
+    gelu_tanh_and_mul_ref_fn = (
+        torch.compile(gelu_tanh_and_mul_reference) if compile_ref else gelu_tanh_and_mul_reference
+    )
+    gelu_tanh_and_mul_conch_fn = torch.compile(gelu_tanh_and_mul_conch) if compile_conch else gelu_tanh_and_mul_conch
+
+    ref_output = gelu_tanh_and_mul_ref_fn(projections)
+    conch_output = gelu_tanh_and_mul_conch_fn(projections)
 
     if not torch.allclose(ref_output, conch_output, atol=absolute_tolerance):
         print(f"WARNING: Reference and Conch results differ! (atol={absolute_tolerance})", file=sys.stderr)
@@ -121,15 +140,15 @@ def main(
         print(f"Results matched with atol={absolute_tolerance} :)", file=sys.stderr)
 
     baseline_result = benchmark_it(
-        lambda: gelu_tanh_and_mul_reference(projections),
+        lambda: gelu_tanh_and_mul_ref_fn(projections),
         tag="Baseline",
         metadata=metadata,
         iteration_time_ms=iteration_time_ms,
         warmup_time_ms=warmup_time_ms,
     )
 
     conch_result = benchmark_it(
-        lambda: gelu_tanh_and_mul_conch(projections),
+        lambda: gelu_tanh_and_mul_conch_fn(projections),
         tag="Conch",
         metadata=metadata,
         iteration_time_ms=iteration_time_ms,
 
@@ -69,6 +69,16 @@
     is_flag=True,
     help="Flag for printing results in CSV format",
 )
+@click.option(
+    "--compile-ref",
+    is_flag=True,
+    help="Flag to torch.compile() the reference impl",
+)
+@click.option(
+    "--compile-conch",
+    is_flag=True,
+    help="Flag to torch.compile() the Conch impl",
+)
 def main(
     embedding_size: int,
     num_tokens: int,
@@ -78,6 +88,8 @@ def main(
     verbose: bool,
     gpu: str,
     csv: bool,
+    compile_ref: bool,
+    compile_conch: bool,
 ) -> None:
     """Benchmark Conch GemmaRMSNorm op.
 
@@ -90,6 +102,8 @@ def main(
         verbose: Flag to indicate whether or not to print verbose output.
         gpu: Which gpu to run on.
         csv: Flag for printing results in CSV format.
+        compile_ref: Flag to torch.compile() the reference implementation.
+        compile_conch: Flag to torch.compile() the Conch implementation.
     """
     seed: Final = 0
     seed_everything(seed)
@@ -113,8 +127,11 @@ def main(
     x_ref = x.clone()
     x_conch = x.clone()
 
-    result_ref = gemma_rms_norm_reference(x_ref, weights, epsilon, residual=None)
-    result_conch = gemma_rms_norm_conch(x_conch, weights, epsilon, residual=None)
+    gemma_rms_norm_ref_fn = torch.compile(gemma_rms_norm_reference) if compile_ref else gemma_rms_norm_reference
+    gemma_rms_norm_conch_fn = torch.compile(gemma_rms_norm_conch) if compile_conch else gemma_rms_norm_conch
+
+    result_ref = gemma_rms_norm_ref_fn(x_ref, weights, epsilon, residual=None)
+    result_conch = gemma_rms_norm_conch_fn(x_conch, weights, epsilon, residual=None)
 
     # For mypy (if residual==None then result is single Tensor, not tuple[Tensor, Tensor])
     assert isinstance(result_ref, torch.Tensor)
@@ -131,15 +148,15 @@ def main(
         print(f"Results matched with atol={absolute_tolerance} :)", file=sys.stderr)
 
     baseline_result = benchmark_it(
-        lambda: gemma_rms_norm_reference(x_ref, weights, epsilon, residual=None),
+        lambda: gemma_rms_norm_ref_fn(x_ref, weights, epsilon, residual=None),
         tag="Baseline",
         metadata=metadata,
         iteration_time_ms=iteration_time_ms,
         warmup_time_ms=warmup_time_ms,
     )
 
     conch_result = benchmark_it(
-        lambda: gemma_rms_norm_conch(x_conch, weights, epsilon, residual=None),
+        lambda: gemma_rms_norm_conch_fn(x_conch, weights, epsilon, residual=None),
         tag="Conch",
         metadata=metadata,
         iteration_time_ms=iteration_time_ms,
 
@@ -98,6 +98,16 @@
     is_flag=True,
     help="Flag for printing results in CSV format",
 )
+@click.option(
+    "--compile-ref",
+    is_flag=True,
+    help="Flag to torch.compile() the reference impl",
+)
+@click.option(
+    "--compile-conch",
+    is_flag=True,
+    help="Flag to torch.compile() the Conch impl",
+)
 def main(
     head_dim: int,
     num_tokens: int,
@@ -111,6 +121,8 @@ def main(
     verbose: bool,
     gpu: str,
     csv: bool,
+    compile_ref: bool,
+    compile_conch: bool,
 ) -> None:
     """Benchmark Conch reshape_and_cache.
 
@@ -127,6 +139,8 @@ def main(
         verbose: Flag to indicate whether or not to print verbose output.
         gpu: Which gpu to run on.
         csv: Flag for printing results in CSV format.
+        compile_ref: Flag to torch.compile() the reference implementation.
+        compile_conch: Flag to torch.compile() the Conch implementation.
     """
     if kv_cache_dtype != "auto" and not current_platform.supports_fp8():
         error_msg = "Cannot use FP8 KV Cache because current platform does not support FP8"
@@ -184,13 +198,16 @@ def main(
     key_cache_conch = key_cache_ref.clone()
     value_cache_conch = value_cache_ref.clone()
 
-    # Run the reference implementation.
-    reshape_and_cache_reference(
-        key, value, key_cache_ref, value_cache_ref, slot_mapping, kv_cache_dtype, k_scale, v_scale
+    reshape_and_cache_ref_fn = (
+        torch.compile(reshape_and_cache_reference) if compile_ref else reshape_and_cache_reference
     )
+    reshape_and_cache_conch_fn = torch.compile(reshape_and_cache_conch) if compile_conch else reshape_and_cache_conch
+
+    # Run the reference implementation.
+    reshape_and_cache_ref_fn(key, value, key_cache_ref, value_cache_ref, slot_mapping, kv_cache_dtype, k_scale, v_scale)
 
     # Call Conch kernel
-    reshape_and_cache_conch(
+    reshape_and_cache_conch_fn(
         key, value, key_cache_conch, value_cache_conch, slot_mapping, kv_cache_dtype, k_scale, v_scale
     )
 
@@ -230,7 +247,7 @@ def main(
 
     # Benchmark Reference vs. Conch implementations
     baseline_result = benchmark_it(
-        lambda: reshape_and_cache_reference(
+        lambda: reshape_and_cache_ref_fn(
             key,
             value,
             key_cache_ref,
@@ -247,7 +264,7 @@ def main(
     )
 
     conch_result = benchmark_it(
-        lambda: reshape_and_cache_conch(
+        lambda: reshape_and_cache_conch_fn(
             key,
             value,
             key_cache_conch,