Commit bed5f68

Merge pull request #26 from stackav-oss/feature/jmanning/update-benchmarks

Speedup varlen (again)

2 parents: f16b84f + 33bf58a

5 files changed: 30 additions & 98 deletions

File tree:

benchmarks/paged_attention_benchmark.py
benchmarks/paged_attention_vs_flash_benchmark.py
benchmarks/varlen_attention_benchmark.py
conch/kernels/attention/varlen_attention.py
tools/create_benchmark_results_table.py

benchmarks/paged_attention_benchmark.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@
     "--head-dim",
     required=True,
     type=int,
-    default=256,
+    default=128,
     help="Head dimension",
 )
 @click.option(
@@ -47,14 +47,14 @@
     "--batch-size",
     required=False,
     type=int,
-    default=4,
+    default=128,
     help="Batch size",
 )
 @click.option(
     "--num-query-heads",
     required=False,
     type=int,
-    default=8,
+    default=32,
     help="Number of query heads",
 )
 @click.option(

benchmarks/paged_attention_vs_flash_benchmark.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@
     "--head-dim",
     required=True,
     type=int,
-    default=256,
+    default=128,
     help="Head dimension",
 )
 @click.option(
@@ -47,14 +47,14 @@
     "--batch-size",
     required=False,
     type=int,
-    default=4,
+    default=128,
     help="Batch size",
 )
 @click.option(
     "--num-query-heads",
     required=False,
     type=int,
-    default=8,
+    default=32,
     help="Number of query heads",
 )
 @click.option(
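
Both paged-attention benchmarks above move to the same defaults: head dim 128, batch size 128, 32 query heads. For orientation, a minimal sketch of the kind of paged KV-cache layout such benchmarks exercise — every shape besides those three defaults is hypothetical (num_blocks, block_size, num_kv_heads, and max_blocks_per_seq are illustrative only, and the benchmark's actual allocation may differ):

import torch

# New benchmark defaults from this commit
batch_size, num_query_heads, head_dim = 128, 32, 128

# Hypothetical paged-cache shapes, assuming a vLLM-style layout
num_blocks, block_size, num_kv_heads = 4096, 16, 8
key_cache = torch.empty(num_blocks, block_size, num_kv_heads, head_dim, dtype=torch.float16)
value_cache = torch.empty_like(key_cache)

# Each sequence maps its logical KV blocks to physical cache blocks via a block
# table; this is the structure the kernel diff further down walks when it loads
# physical_cache_block_number from the block_table row for the current batch.
max_blocks_per_seq = 32
block_table = torch.randint(0, num_blocks, (batch_size, max_blocks_per_seq), dtype=torch.int32)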

benchmarks/varlen_attention_benchmark.py

Lines changed: 5 additions & 5 deletions
@@ -27,14 +27,14 @@
     "--head-dim",
     required=True,
     type=int,
-    default=256,
+    default=128,
     help="Head dimension",
 )
 @click.option(
     "--seq-len",
     required=True,
     type=int,
-    default=1024,
+    default=512,
     help="Sequence length (for k/v)",
 )
 @click.option(
@@ -48,21 +48,21 @@
     "--batch-size",
     required=False,
     type=int,
-    default=10,
+    default=64,
     help="Batch size",
 )
 @click.option(
     "--num-query-heads",
     required=False,
     type=int,
-    default=8,
+    default=32,
     help="Number of query heads",
 )
 @click.option(
     "--num-kv-heads",
     required=False,
     type=int,
-    default=4,
+    default=8,
     help="Number of kv heads",
 )
 @click.option(
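
The varlen benchmark's new defaults describe a 4:1 grouped-query configuration (32 query heads sharing 8 KV heads). A sketch of the problem size they imply, assuming the packed variable-length layout (total_tokens, num_heads, head_dim) with a cu_seqlens offset vector that varlen attention kernels conventionally take; the benchmark's real tensor construction may differ:

import torch

# New defaults after this commit
batch_size, seq_len, head_dim = 64, 512, 128
num_query_heads, num_kv_heads = 32, 8

# Packed varlen layout: all sequences concatenated along the token dimension
total_tokens = batch_size * seq_len  # uniform lengths for simplicity
query = torch.randn(total_tokens, num_query_heads, head_dim, dtype=torch.float16)
key = torch.randn(total_tokens, num_kv_heads, head_dim, dtype=torch.float16)
value = torch.randn(total_tokens, num_kv_heads, head_dim, dtype=torch.float16)

# cu_seqlens[i] is where sequence i starts in the packed token dimension
cu_seqlens = torch.arange(0, (batch_size + 1) * seq_len, seq_len, dtype=torch.int32)
assert cu_seqlens[-1] == total_tokens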

conch/kernels/attention/varlen_attention.py

Lines changed: 9 additions & 86 deletions
@@ -13,70 +13,10 @@
 import triton.language as tl
 from triton.language.extra import libdevice  # type: ignore[attr-defined]
 
-# Note: adding `bool` or `str` type annotations to these load/store helper functions doesn't work
-
 # FP8 representation on CUDA and ROCm
 _FP8_DTYPES: Final = [torch.float8_e4m3fn, torch.float8_e4m3fnuz]
 
 
-@triton.jit  # type: ignore[misc]
-def _load_2d_block_ptr(  # type: ignore[no-untyped-def]
-    data_ptr: tl.tensor,
-    mask_first_dim,
-    mask_second_dim,
-    padding_option,
-) -> tl.tensor:
-    """Load a 2D tensor with custom strides and offsets."""
-    if mask_first_dim and mask_second_dim:
-        # Load with boundary check on both dimensions
-        data = tl.load(data_ptr, boundary_check=(0, 1), padding_option=padding_option)
-    elif mask_first_dim:
-        # Load with boundary check on first dimension only
-        data = tl.load(data_ptr, boundary_check=(0,), padding_option=padding_option)
-    elif mask_second_dim:
-        # Load with boundary check on second dimension only
-        data = tl.load(data_ptr, boundary_check=(1,), padding_option=padding_option)
-    else:
-        # Load without boundary check
-        data = tl.load(data_ptr)
-
-    return data
-
-
-@triton.jit  # type: ignore[misc]
-def _load(  # type: ignore[no-untyped-def]
-    data_ptr: tl.tensor,
-    use_mask,
-    mask: tl.tensor,
-    other,
-) -> tl.tensor:
-    """Load a 1D tensor with custom strides and offsets."""
-    if use_mask:
-        # Load with mask
-        data = tl.load(data_ptr, mask=mask, other=other)
-    else:
-        # Load without mask
-        data = tl.load(data_ptr)
-
-    return data
-
-
-@triton.jit  # type: ignore[misc]
-def _store(  # type: ignore[no-untyped-def]
-    data_ptr: tl.tensor,
-    value: tl.tensor,
-    use_mask,
-    mask: tl.tensor,
-) -> None:
-    """Store a 1D tensor with custom strides and offsets."""
-    if use_mask:
-        # Store with mask
-        tl.store(data_ptr, value, mask=mask)
-    else:
-        # Store without mask
-        tl.store(data_ptr, value)
-
-
 @triton.jit  # type: ignore[misc]
 def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
     # Pointers to tensors
@@ -261,15 +201,11 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
     # Mask out query elements that are just for padding
     query_mask = query_split_group_seq_mask[:, None] & query_split_group_head_mask[:, None] & head_mask[None, :]
 
-    # Determine whether or not we need masking for different dimensions
-    needs_query_split_mask = end_seqlen_q > this_query_length
-    needs_query_group_mask = query_group_size != cxpr_query_group_size_padded
-    needs_head_mask = head_size != cxpr_head_size_padded
-    needs_query_mask = (needs_query_split_mask or needs_query_group_mask) or needs_head_mask
+    # Only need causal masking if enabled and this program is not processing a decode
     needs_causal_mask = cxpr_is_causal and not is_pure_decode
 
     # Load queries
-    query = _load(query_ptr + query_offsets, use_mask=needs_query_mask, mask=query_mask, other=0.0)
+    query = tl.load(query_ptr + query_offsets, mask=query_mask, other=0.0)
 
     if cxpr_apply_fp8_scaling:
         q_scale = tl.load(q_scale_ptr)
@@ -301,9 +237,6 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
         cache_block_offsets = tl.arange(0, cxpr_cache_block_size)
         cache_block_mask = cache_block_offsets < num_entries_in_cache_block
 
-        needs_cache_block_mask = num_entries_in_cache_block != cxpr_cache_block_size
-        needs_qk_mask = (needs_query_split_mask or needs_query_group_mask) or needs_cache_block_mask
-
         # Offset from the block_table row for the current batch by the number of cache blocks
         current_cache_block_number_ptr = current_block_table_ptr + cache_block_index
         physical_cache_block_number = tl.load(current_cache_block_number_ptr)
@@ -319,9 +252,8 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
 
         key_block_mask = head_mask[:, None] & cache_block_mask[None, :]
 
-        key_block = _load(
+        key_block = tl.load(
             key_cache_ptr + kv_cache_block_index_offset + key_block_offsets,
-            use_mask=(needs_cache_block_mask or needs_head_mask),
             mask=key_block_mask,
             other=0.0,
         )
@@ -345,9 +277,7 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
             causal_mask = query_split_group_seq_offsets[:, None] >= effective_seqlen_k_offsets[None, :]
             qk_mask = qk_mask & causal_mask
 
-        if needs_qk_mask or needs_causal_mask:
-            # Set masked out elements to -inf
-            qk = tl.where(qk_mask, qk, -float("inf")).to(dtype)
+        qk = tl.where(qk_mask, qk, -float("inf")).to(dtype)
 
         # Handle softcapping
         if cxpr_is_softcap:
@@ -378,9 +308,8 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
 
         value_block_mask = cache_block_mask[:, None] & head_mask[None, :]
 
-        value_block = _load(
+        value_block = tl.load(
             value_cache_ptr + kv_cache_block_index_offset + value_block_offsets,
-            use_mask=(needs_cache_block_mask or needs_head_mask),
             mask=value_block_mask,
             other=0.0,
         )
@@ -415,10 +344,9 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
     )
 
     # Store output scratchpad results
-    _store(
+    tl.store(
         output_scratchpad_ptr + output_scratch_offsets,
         output,
-        use_mask=needs_query_mask,
         mask=query_mask,
     )
 
@@ -439,10 +367,9 @@ def _varlen_attention_compute_splits_kernel(  # noqa: PLR0913, PLR0915
     lse_mask = query_split_group_seq_mask & query_split_group_head_mask
 
     # Store lse scratchpad results
-    _store(
+    tl.store(
         lse_scratchpad_ptr + lse_scratch_offsets,
         lse,
-        use_mask=(needs_query_split_mask or needs_query_group_mask),
         mask=lse_mask,
     )
 
@@ -522,8 +449,6 @@ def _varlen_attention_reduce_splits_kernel(  # noqa: PLR0913
     head_offsets = tl.arange(0, cxpr_head_size_padded)
     # Mask to only read valid indices of the actual head size
    head_mask = head_offsets < head_size
-    # Only enable masking if its necessary
-    needs_head_mask = head_size != cxpr_head_size_padded
 
     # Iterate through every cache block for the current sequence
     for kv_split_index in range(num_kv_splits_this_seq):
@@ -537,9 +462,8 @@ def _varlen_attention_reduce_splits_kernel(  # noqa: PLR0913
         )
 
         # Load output for this cache block, shape -> (cxpr_head_size_padded,)
-        block_output = _load(
+        block_output = tl.load(
            output_scratchpad_ptr + output_scratchpad_offsets,
-            use_mask=needs_head_mask,
            mask=head_mask,
            other=0.0,
        )
@@ -583,10 +507,9 @@ def _varlen_attention_reduce_splits_kernel(  # noqa: PLR0913
     output_offsets = batch_index * output_batch_stride + query_head_index * output_head_stride + head_offsets
 
     # Store final result
-    _store(
+    tl.store(
         output_ptr + output_offsets,
         output,
-        use_mask=needs_head_mask,
         mask=head_mask,
     )
 
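
The kernel change is uniform: the branchy _load/_store helpers, and the needs_* flags that fed them, are gone, and every access is an unconditionally masked tl.load/tl.store. A minimal standalone sketch of that style (not code from this repo; running it requires a Triton-capable GPU):

import torch
import triton
import triton.language as tl


@triton.jit
def _masked_copy_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # One program per block; the mask covers the ragged final block.
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    # Unconditionally masked load/store, as the varlen kernel now does everywhere:
    # out-of-bounds lanes read `other` on load and are skipped on store.
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    tl.store(out_ptr + offsets, x, mask=mask)


x = torch.randn(1000, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
_masked_copy_kernel[grid](x, out, x.numel(), BLOCK_SIZE=256)
assert torch.equal(x, out)

Written this way, correctness no longer depends on per-program bookkeeping about which blocks happen to be full.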

tools/create_benchmark_results_table.py

Lines changed: 10 additions & 1 deletion
@@ -24,6 +24,7 @@
     "GeLU, Tanh, and Mul": "gelu_tanh_and_mul_benchmark",
     "SiLU and Mul": "silu_and_mul_benchmark",
     "Paged Attention": "paged_attention_vs_flash_benchmark",
+    "Varlen Attention": "varlen_attention_benchmark",
     "Rotary Embedding": "rotary_embedding_benchmark",
     "RMS Norm (Gemma-style)": "gemma_rms_norm_benchmark",
     "RMS Norm (Llama-style)": "rms_norm_benchmark",
@@ -45,6 +46,11 @@
     "unknown": [],
 }
 
+# Add any extra flags for each benchmark here
+_EXTRA_BENCHMARK_FLAGS: Final = {
+    "varlen_attention_benchmark": ["--causal"],
+}
+
 
 @click.command()
 @click.option(
@@ -90,9 +96,12 @@ def main(results_directory: Path, use_cached_results: bool) -> None:
         # Run benchmark and redirect output
         print(f"Running benchmark for {op_name}...")
 
+        # Some benchmark args are flags to enable things that default false, so we add any per-benchmark here
+        extra_flags = _EXTRA_BENCHMARK_FLAGS[benchmark_name] if benchmark_name in _EXTRA_BENCHMARK_FLAGS else []
+
         with results_csv.open("w") as results_file:
             run(
-                ["python", f"benchmarks/{benchmark_name}.py", "--csv"],
+                ["python", f"benchmarks/{benchmark_name}.py", "--csv"] + extra_flags,
                 check=True,
                 stdout=results_file,
                 env=os.environ,
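
Given the lookup above, the command the table tool now issues for the varlen row is equivalent to the following (assuming `run` here is `subprocess.run`, which matches the check/stdout/env keywords in the diff):

from subprocess import run

# extra_flags == ["--causal"] for varlen_attention_benchmark; every other
# benchmark falls back to [] and keeps its previous command line
# (the stdout/env arguments are omitted in this sketch)
run(["python", "benchmarks/varlen_attention_benchmark.py", "--csv", "--causal"], check=True)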
