fix: format all Python files with ruff

shijiashuai · qwencoder · shijiashuai · commit cc1bb099232f · 2026-04-17T01:29:28.000+08:00
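Format the remaining files that CI checks:
- python/profiler.py
- tests/conftest.py
- tests/test_interface.py
- tests/test_profiler.py
- benchmarks/benchmark_attention.py
- benchmarks/benchmark_gemm.py

Also touches python/bindings.cpp (a C++ file): strips trailing whitespace and
adds an `is_causal` argument (default false) to naive_attention and
tiled_attention, forwarding it to the FP32/FP16 kernels.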

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
diff --git a/benchmarks/benchmark_attention.py b/benchmarks/benchmark_attention.py
@@ -76,9 +76,7 @@ def benchmark_attention(
         print(f"\nBenchmarking seq_len={seq_len}...")
 
         # Create inputs
-        q = torch.randn(
-            batch_size, num_heads, seq_len, head_dim, device="cuda", dtype=dtype
-        )
+        q = torch.randn(batch_size, num_heads, seq_len, head_dim, device="cuda", dtype=dtype)
         k = torch.randn_like(q)
         v = torch.randn_like(q)
 
@@ -107,9 +105,7 @@ def pytorch_attention(q, k, v):
         if has_custom:
             # Naive attention
             try:
-                naive_time = benchmark_kernel(
-                    naive_attention, q, k, v, warmup, iterations
-                )
+                naive_time = benchmark_kernel(naive_attention, q, k, v, warmup, iterations)
                 result["naive_ms"] = naive_time
                 result["naive_tflops"] = (flops / 1e12) / (naive_time / 1000)
                 result["naive_speedup"] = pytorch_time / naive_time
@@ -119,9 +115,7 @@ def pytorch_attention(q, k, v):
 
             # Tiled attention
             try:
-                tiled_time = benchmark_kernel(
-                    tiled_attention, q, k, v, warmup, iterations
-                )
+                tiled_time = benchmark_kernel(tiled_attention, q, k, v, warmup, iterations)
                 result["tiled_ms"] = tiled_time
                 result["tiled_tflops"] = (flops / 1e12) / (tiled_time / 1000)
                 result["tiled_speedup"] = pytorch_time / tiled_time
@@ -131,9 +125,7 @@ def pytorch_attention(q, k, v):
 
             # Flash attention
             try:
-                flash_time = benchmark_kernel(
-                    flash_attention, q, k, v, warmup, iterations
-                )
+                flash_time = benchmark_kernel(flash_attention, q, k, v, warmup, iterations)
                 result["flash_ms"] = flash_time
                 result["flash_tflops"] = (flops / 1e12) / (flash_time / 1000)
                 result["flash_speedup"] = pytorch_time / flash_time
@@ -143,9 +135,9 @@ def pytorch_attention(q, k, v):
 
         # Record peak GPU memory
         result["peak_memory_mb"] = torch.cuda.max_memory_allocated() / (1024 * 1024)
-        result["input_memory_mb"] = (
-            mem_before + q.nelement() * q.element_size() * 3
-        ) / (1024 * 1024)
+        result["input_memory_mb"] = (mem_before + q.nelement() * q.element_size() * 3) / (
+            1024 * 1024
+        )
 
         results.append(result)
 
@@ -162,9 +154,7 @@ def print_results(results: List[Dict]):
     print(
         f"\n{'Seq Len':>8} | {'PyTorch':>10} | {'Naive':>10} | {'Tiled':>10} | {'Flash':>10} | {'Best Speedup':>12}"
     )
-    print(
-        f"{'':>8} | {'(ms)':>10} | {'(ms)':>10} | {'(ms)':>10} | {'(ms)':>10} | {'':>12}"
-    )
+    print(f"{'':>8} | {'(ms)':>10} | {'(ms)':>10} | {'(ms)':>10} | {'(ms)':>10} | {'':>12}")
     print("-" * 80)
 
     for r in results:
@@ -184,12 +174,8 @@ def print_results(results: List[Dict]):
     print("TFLOPS COMPARISON")
     print("=" * 80)
 
-    print(
-        f"\n{'Seq Len':>8} | {'PyTorch':>12} | {'Naive':>12} | {'Tiled':>12} | {'Flash':>12}"
-    )
-    print(
-        f"{'':>8} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12}"
-    )
+    print(f"\n{'Seq Len':>8} | {'PyTorch':>12} | {'Naive':>12} | {'Tiled':>12} | {'Flash':>12}")
+    print(f"{'':>8} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12}")
     print("-" * 80)
 
     for r in results:
@@ -213,17 +199,13 @@ def main():
         help="Sequence lengths to benchmark",
     )
     parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
-    parser.add_argument(
-        "--num-heads", type=int, default=32, help="Number of attention heads"
-    )
+    parser.add_argument("--num-heads", type=int, default=32, help="Number of attention heads")
     parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
     parser.add_argument(
         "--dtype", type=str, default="fp16", choices=["fp16", "fp32"], help="Data type"
     )
     parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations")
-    parser.add_argument(
-        "--iterations", type=int, default=100, help="Benchmark iterations"
-    )
+    parser.add_argument("--iterations", type=int, default=100, help="Benchmark iterations")
     parser.add_argument(
         "--output", type=str, default=None, help="Output JSON file path for results"
     )
diff --git a/benchmarks/benchmark_gemm.py b/benchmarks/benchmark_gemm.py
@@ -103,9 +103,7 @@ def cublas_gemm(a, b):
                 custom_time = benchmark_kernel(gemm, a, b, warmup, iterations)
                 result["custom_ms"] = custom_time
                 result["custom_tflops"] = (flops / 1e12) / (custom_time / 1000)
-                result["custom_relative"] = (
-                    result["custom_tflops"] / result["cublas_tflops"]
-                )
+                result["custom_relative"] = result["custom_tflops"] / result["cublas_tflops"]
             except Exception as e:
                 print(f"  Custom GEMM failed: {e}")
                 result["custom_ms"] = float("inf")
@@ -114,9 +112,7 @@ def cublas_gemm(a, b):
             # Tensor Core GEMM (FP16 only)
             if dtype == torch.float16:
                 try:
-                    tc_time = benchmark_kernel(
-                        tensor_core_gemm, a, b, warmup, iterations
-                    )
+                    tc_time = benchmark_kernel(tensor_core_gemm, a, b, warmup, iterations)
                     result["tensor_core_ms"] = tc_time
                     result["tensor_core_tflops"] = (flops / 1e12) / (tc_time / 1000)
                     result["tensor_core_relative"] = (
@@ -170,9 +166,7 @@ def print_results(results: List[Dict]):
     print("=" * 100)
 
     print(f"\n{'Size':>20} | {'cuBLAS':>12} | {'Custom':>12} | {'TC GEMM':>12}")
-    print(
-        f"{'(M x N x K)':>20} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12}"
-    )
+    print(f"{'(M x N x K)':>20} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12} | {'(TFLOPS)':>12}")
     print("-" * 100)
 
     for r in results:
@@ -183,22 +177,16 @@ def print_results(results: List[Dict]):
 
         tc_str = f"{tc_tflops:.2f}" if tc_tflops > 0 else "N/A"
 
-        print(
-            f"{size_str:>20} | {cublas_tflops:>12.2f} | {custom_tflops:>12.2f} | {tc_str:>12}"
-        )
+        print(f"{size_str:>20} | {cublas_tflops:>12.2f} | {custom_tflops:>12.2f} | {tc_str:>12}")
 
     # Summary
     print("\n" + "=" * 100)
     print("SUMMARY")
     print("=" * 100)
 
-    avg_custom_rel = (
-        sum(r.get("custom_relative", 0) for r in results) / len(results) * 100
-    )
+    avg_custom_rel = sum(r.get("custom_relative", 0) for r in results) / len(results) * 100
     avg_tc_rel = sum(
-        r.get("tensor_core_relative", 0)
-        for r in results
-        if r.get("tensor_core_relative", 0) > 0
+        r.get("tensor_core_relative", 0) for r in results if r.get("tensor_core_relative", 0) > 0
     )
     tc_count = sum(1 for r in results if r.get("tensor_core_relative", 0) > 0)
     if tc_count > 0:
@@ -231,9 +219,7 @@ def main():
         "--dtype", type=str, default="fp16", choices=["fp16", "fp32"], help="Data type"
     )
     parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations")
-    parser.add_argument(
-        "--iterations", type=int, default=100, help="Benchmark iterations"
-    )
+    parser.add_argument("--iterations", type=int, default=100, help="Benchmark iterations")
     parser.add_argument(
         "--output", type=str, default=None, help="Output JSON file path for results"
     )
diff --git a/python/bindings.cpp b/python/bindings.cpp
@@ -10,13 +10,13 @@ namespace py = pybind11;
 
 // Forward declarations
 void naive_attention_fp32(const float*, const float*, const float*, float*,
-                          int, int, int, int, float, cudaStream_t);
+                          int, int, int, int, float, bool, cudaStream_t);
 void naive_attention_fp16(const half*, const half*, const half*, half*,
-                          int, int, int, int, float, cudaStream_t);
+                          int, int, int, int, float, bool, cudaStream_t);
 void tiled_attention_fp32(const float*, const float*, const float*, float*,
-                          int, int, int, int, float, cudaStream_t);
+                          int, int, int, int, float, bool, cudaStream_t);
 void tiled_attention_fp16(const half*, const half*, const half*, half*,
-                          int, int, int, int, float, cudaStream_t);
+                          int, int, int, int, float, bool, cudaStream_t);
 void flash_attention_fp32(const float*, const float*, const float*, float*,
                           int, int, int, int, float, bool, cudaStream_t);
 void flash_attention_fp16(const half*, const half*, const half*, half*,
@@ -96,30 +96,31 @@ torch::Tensor naive_attention(
     const torch::Tensor& q,
     const torch::Tensor& k,
     const torch::Tensor& v,
-    float scale = 0.0f
+    float scale = 0.0f,
+    bool is_causal = false
 ) {
     validate_attention_inputs(q, k, v);
-    
+
     int batch_size = q.size(0);
     int num_heads = q.size(1);
     int seq_len = q.size(2);
     int head_dim = q.size(3);
-    
+
     if (scale == 0.0f) {
         scale = 1.0f / sqrtf(static_cast<float>(head_dim));
     }
-    
+
     auto output = torch::empty_like(q);
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    
+
     if (q.scalar_type() == torch::kFloat32) {
         naive_attention_fp32(
             q.data_ptr<float>(),
             k.data_ptr<float>(),
             v.data_ptr<float>(),
             output.data_ptr<float>(),
             batch_size, num_heads, seq_len, head_dim,
-            scale, stream
+            scale, is_causal, stream
         );
     } else {
         naive_attention_fp16(
@@ -128,10 +129,10 @@ torch::Tensor naive_attention(
             reinterpret_cast<const half*>(v.data_ptr<at::Half>()),
             reinterpret_cast<half*>(output.data_ptr<at::Half>()),
             batch_size, num_heads, seq_len, head_dim,
-            scale, stream
+            scale, is_causal, stream
         );
     }
-    
+
     return output;
 }
 
@@ -140,30 +141,31 @@ torch::Tensor tiled_attention(
     const torch::Tensor& q,
     const torch::Tensor& k,
     const torch::Tensor& v,
-    float scale = 0.0f
+    float scale = 0.0f,
+    bool is_causal = false
 ) {
     validate_attention_inputs(q, k, v);
-    
+
     int batch_size = q.size(0);
     int num_heads = q.size(1);
     int seq_len = q.size(2);
     int head_dim = q.size(3);
-    
+
     if (scale == 0.0f) {
         scale = 1.0f / sqrtf(static_cast<float>(head_dim));
     }
-    
+
     auto output = torch::empty_like(q);
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    
+
     if (q.scalar_type() == torch::kFloat32) {
         tiled_attention_fp32(
             q.data_ptr<float>(),
             k.data_ptr<float>(),
             v.data_ptr<float>(),
             output.data_ptr<float>(),
             batch_size, num_heads, seq_len, head_dim,
-            scale, stream
+            scale, is_causal, stream
         );
     } else {
         tiled_attention_fp16(
@@ -172,10 +174,10 @@ torch::Tensor tiled_attention(
             reinterpret_cast<const half*>(v.data_ptr<at::Half>()),
             reinterpret_cast<half*>(output.data_ptr<at::Half>()),
             batch_size, num_heads, seq_len, head_dim,
-            scale, stream
+            scale, is_causal, stream
         );
     }
-    
+
     return output;
 }
 
@@ -331,33 +333,35 @@ torch::Tensor tensor_core_gemm_int8_wrapper(
 
 PYBIND11_MODULE(cuda_llm_ops, m) {
     m.doc() = "CUDA LLM Kernel Optimization - High-performance attention and GEMM kernels";
-    
+
     // Attention functions
     m.def("naive_attention", &naive_attention,
-          py::arg("q"), py::arg("k"), py::arg("v"), py::arg("scale") = 0.0f,
+          py::arg("q"), py::arg("k"), py::arg("v"),
+          py::arg("scale") = 0.0f, py::arg("is_causal") = false,
           "Naive attention implementation (baseline)");
-    
+
     m.def("tiled_attention", &tiled_attention,
-          py::arg("q"), py::arg("k"), py::arg("v"), py::arg("scale") = 0.0f,
+          py::arg("q"), py::arg("k"), py::arg("v"),
+          py::arg("scale") = 0.0f, py::arg("is_causal") = false,
           "Tiled attention with shared memory optimization");
-    
+
     m.def("flash_attention", &flash_attention,
-          py::arg("q"), py::arg("k"), py::arg("v"), 
+          py::arg("q"), py::arg("k"), py::arg("v"),
           py::arg("scale") = 0.0f, py::arg("is_causal") = false,
           "FlashAttention with online softmax");
-    
+
     // GEMM functions
     m.def("gemm", &gemm,
           py::arg("a"), py::arg("b"),
           py::arg("alpha") = 1.0f, py::arg("beta") = 0.0f,
           py::arg("trans_a") = false, py::arg("trans_b") = false,
           "High-performance GEMM with register tiling");
-    
+
     m.def("tensor_core_gemm", &tensor_core_gemm,
           py::arg("a"), py::arg("b"),
           py::arg("alpha") = 1.0f, py::arg("beta") = 0.0f,
           "Tensor Core GEMM (FP16 input, FP32 output)");
-    
+
     m.def("tensor_core_gemm_int8", &tensor_core_gemm_int8_wrapper,
           py::arg("a"), py::arg("b"),
           "Tensor Core GEMM (INT8 input, INT32 output, requires Turing+ SM>=7.2)");
diff --git a/python/profiler.py b/python/profiler.py
@@ -69,16 +69,12 @@ def profile_attention(
     ) -> KernelMetrics:
         """Profile attention kernel and compute metrics."""
         # Create inputs
-        q = torch.randn(
-            batch_size, num_heads, seq_len, head_dim, device="cuda", dtype=dtype
-        )
+        q = torch.randn(batch_size, num_heads, seq_len, head_dim, device="cuda", dtype=dtype)
         k = torch.randn_like(q)
         v = torch.randn_like(q)
 
         # Measure time
-        elapsed_ms = self.measure_time(
-            func, q, k, v, warmup=warmup, iterations=iterations
-        )
+        elapsed_ms = self.measure_time(func, q, k, v, warmup=warmup, iterations=iterations)
 
         # Compute FLOPs
         # Attention: 2 * batch * heads * seq^2 * head_dim (Q@K^T)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -45,18 +45,10 @@ def random_seed():
 def attention_inputs(device):
     """Generate random attention inputs."""
 
-    def _generate(
-        batch_size=2, num_heads=4, seq_len=64, head_dim=32, dtype=torch.float32
-    ):
-        q = torch.randn(
-            batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
-        )
-        k = torch.randn(
-            batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
-        )
-        v = torch.randn(
-            batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
-        )
+    def _generate(batch_size=2, num_heads=4, seq_len=64, head_dim=32, dtype=torch.float32):
+        q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype)
+        k = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype)
+        v = torch.randn(batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype)
         return q, k, v
 
     return _generate
@@ -81,8 +73,7 @@ def assert_close(actual, expected, rtol=1e-3, atol=1e-3, msg=""):
         max_diff = diff.max().item()
         mean_diff = diff.mean().item()
         raise AssertionError(
-            f"{msg}\nMax diff: {max_diff}, Mean diff: {mean_diff}, "
-            f"rtol: {rtol}, atol: {atol}"
+            f"{msg}\nMax diff: {max_diff}, Mean diff: {mean_diff}, rtol: {rtol}, atol: {atol}"
         )
 
 
diff --git a/tests/test_interface.py b/tests/test_interface.py
diff --git a/tests/test_profiler.py b/tests/test_profiler.py