Use time as the metric for measuring performance

voltjia · voltjia · commit 0dfaeb0258d6 · 2025-01-14T14:26:15.000+08:00
diff --git a/add.py b/add.py
@@ -87,15 +87,14 @@ def grid(meta):
             line_vals=["ninetoothed", "torch", "triton"],
             line_names=["NineToothed", "PyTorch", "Triton"],
             styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-            ylabel="GB/s",
+            ylabel="ms",
             plot_name="vector-addition-performance",
             args={},
         )
     )
     def benchmark(size, provider):
         lhs = torch.randn(size, device="cuda", dtype=torch.float16)
         rhs = torch.randn(size, device="cuda", dtype=torch.float16)
-        quantiles = [0.5, 0.2, 0.8]
 
         ninetoothed_output = add(lhs, rhs)
         torch_output = lhs + rhs
@@ -104,21 +103,12 @@ def benchmark(size, provider):
         assert torch.allclose(ninetoothed_output, triton_output, atol=0, rtol=0)
 
         if provider == "ninetoothed":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: add(lhs, rhs), quantiles=quantiles
-            )
+            ms = triton.testing.do_bench(lambda: add(lhs, rhs))
         elif provider == "torch":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: lhs + rhs, quantiles=quantiles
-            )
+            ms = triton.testing.do_bench(lambda: lhs + rhs)
         elif provider == "triton":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: triton_add(lhs, rhs), quantiles=quantiles
-            )
+            ms = triton.testing.do_bench(lambda: triton_add(lhs, rhs))
 
-        def gbps(ms):
-            return 3 * lhs.numel() * lhs.element_size() / ms * 1e-6
-
-        return gbps(ms), gbps(max_ms), gbps(min_ms)
+        return ms
 
     benchmark.run(print_data=True, show_plots=True, save_path=".")
diff --git a/attention.py b/attention.py
@@ -230,7 +230,7 @@ def grid(meta):
             line_vals=["ninetoothed", "torch", "triton"],
             line_names=["NineToothed", "PyTorch", "Triton"],
             styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-            ylabel="TFLOPS",
+            ylabel="ms",
             plot_name="attention-performance",
             args={},
         )
@@ -258,12 +258,6 @@ def benchmark(seq_len, provider):
         elif provider == "triton":
             ms = triton.testing.do_bench(lambda: triton_attention(q, k, v))
 
-        def perf(ms):
-            flops_per_matmul = 2 * batch_size * num_heads * seq_len * seq_len * emb_dim
-            total_flops = 2 * flops_per_matmul
-
-            return total_flops * 1e-12 / (ms * 1e-3)
-
-        return perf(ms)
+        return ms
 
     benchmark.run(show_plots=True, print_data=True, save_path=".")
diff --git a/conv2d.py b/conv2d.py
@@ -222,7 +222,7 @@ def grid(meta):
             line_vals=["ninetoothed", "torch", "triton"],
             line_names=["NineToothed", "PyTorch", "Triton"],
             styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-            ylabel="TFLOPS",
+            ylabel="ms",
             plot_name="2d-convolution-performance",
             args={},
         )
@@ -247,12 +247,6 @@ def benchmark(n, provider):
         elif provider == "triton":
             ms = triton.testing.do_bench(lambda: triton_conv2d(input, filter))
 
-        def perf(ms):
-            p = h - r + 1
-            q = w - s + 1
-
-            return 2 * n * k * p * q * c * r * s * 1e-12 / (ms * 1e-3)
-
-        return perf(ms)
+        return ms
 
     benchmark.run(show_plots=True, print_data=True, save_path=".")
diff --git a/matmul.py b/matmul.py
@@ -173,15 +173,14 @@ def grid(meta):
             line_vals=["ninetoothed", "torch", "triton"],
             line_names=["NineToothed", "PyTorch", "Triton"],
             styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-            ylabel="TFLOPS",
+            ylabel="ms",
             plot_name="matrix-multiplication-performance",
             args={},
         )
     )
     def benchmark(m, n, k, provider):
         lhs = torch.randn((m, k), device="cuda", dtype=torch.float16)
         rhs = torch.randn((k, n), device="cuda", dtype=torch.float16)
-        quantiles = [0.5, 0.2, 0.8]
 
         ninetoothed_output = matmul(lhs, rhs)
         torch_output = torch.matmul(lhs, rhs)
@@ -190,21 +189,12 @@ def benchmark(m, n, k, provider):
         assert torch.allclose(ninetoothed_output, triton_output, atol=0, rtol=0)
 
         if provider == "ninetoothed":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: matmul(lhs, rhs), quantiles=quantiles
-            )
+            ms = triton.testing.do_bench(lambda: matmul(lhs, rhs))
         elif provider == "torch":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: torch.matmul(lhs, rhs), quantiles=quantiles
-            )
+            ms = triton.testing.do_bench(lambda: torch.matmul(lhs, rhs))
         elif provider == "triton":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: triton_matmul(lhs, rhs), quantiles=quantiles
-            )
+            ms = triton.testing.do_bench(lambda: triton_matmul(lhs, rhs))
 
-        def perf(ms):
-            return 2 * m * n * k * 1e-12 / (ms * 1e-3)
-
-        return perf(ms), perf(max_ms), perf(min_ms)
+        return ms
 
     benchmark.run(show_plots=True, print_data=True, save_path=".")
diff --git a/performance_comparison.py b/performance_comparison.py
@@ -15,79 +15,50 @@
 @dataclass
 class KernelInformation:
     name: str
-    memory_bound: bool
-    compute_bound: bool
     perf_report_path: str
     independent_variable: str
 
 
-@dataclass
-class CategoryInformation:
-    kernels: tuple
-    y_label: str
-
-
 kernels = (
-    KernelInformation("add", True, False, "vector-addition-performance.csv", "Length"),
-    KernelInformation(
-        "softmax", True, False, "softmax-performance.csv", "Number of Columns"
-    ),
-    KernelInformation(
-        "rms_norm", True, False, "rms-norm-performance.csv", "Number of Columns"
-    ),
-    KernelInformation(
-        "matmul", False, True, "matrix-multiplication-performance.csv", "Sizes"
-    ),
-    KernelInformation(
-        "conv2d", False, True, "2d-convolution-performance.csv", "Batch Size"
-    ),
-    KernelInformation(
-        "attention", False, True, "attention-performance.csv", "Sequence Length"
-    ),
+    KernelInformation("add", "vector-addition-performance.csv", "Length"),
+    KernelInformation("softmax", "softmax-performance.csv", "Number of Columns"),
+    KernelInformation("rms_norm", "rms-norm-performance.csv", "Number of Columns"),
+    KernelInformation("matmul", "matrix-multiplication-performance.csv", "Sizes"),
+    KernelInformation("conv2d", "2d-convolution-performance.csv", "Batch Size"),
+    KernelInformation("attention", "attention-performance.csv", "Sequence Length"),
 )
 
 providers = ("Triton", "NineToothed")
 
-categories = (
-    CategoryInformation(
-        tuple(kernel for kernel in kernels if kernel.memory_bound), "GB/s"
-    ),
-    CategoryInformation(
-        tuple(kernel for kernel in kernels if kernel.compute_bound), "TFLOPS"
-    ),
-)
-
-num_rows = len(categories)
-num_cols = max(len(category.kernels) for category in categories)
+num_rows = 2
+num_cols = 3
 
 fig, axs = plt.subplots(num_rows, num_cols)
 
-performance_differences = []
-
-for row, category in enumerate(categories):
-    axs[row, 0].set_ylabel(category.y_label)
+performance_changes = []
 
-    for col, kernel in enumerate(category.kernels):
-        df = pd.read_csv(kernel.perf_report_path)
-        ax = axs[row, col]
+for i, kernel in enumerate(kernels):
+    df = pd.read_csv(kernel.perf_report_path)
+    ax = axs[i // num_cols, i % num_cols]
 
-        x = df.iloc[:, 0]
+    x = df.iloc[:, 0]
 
-        performance_differences.append((kernel, []))
+    performance_changes.append((kernel, []))
 
-        for provider in providers:
-            y = df[provider]
+    for provider in providers:
+        y = df[provider]
 
-            ax.plot(x, y, label=provider)
+        ax.plot(x, y, label=provider)
 
-            if provider == "NineToothed":
-                y_triton = df["Triton"]
-                diff = (y - y_triton) / y_triton * 100
-                performance_differences[-1][-1].append(diff)
+        if provider == "NineToothed":
+            y_triton = df["Triton"]
+            change = (y - y_triton) / y_triton * 100
+            performance_changes[-1][-1].append(change)
 
-            ax.set_title(kernel.name)
-            ax.set_xlabel(kernel.independent_variable)
-            ax.set_xscale("log", base=2)
+        ax.set_title(kernel.name)
+        ax.set_xlabel(kernel.independent_variable)
+        ax.set_ylabel("Execution Time (ms)")
+        ax.set_xscale("log", base=2)
 
 fig.legend(providers, loc="upper center", ncols=len(providers))
 fig.tight_layout()
@@ -96,24 +67,24 @@ class CategoryInformation:
 plt.show()
 plt.savefig("performance-comparison.png")
 
-all_differences = []
+all_changes = []
 stats_data = []
 
-for kernel, diffs in performance_differences:
-    all_differences.extend(diffs)
+for kernel, changes in performance_changes:
+    all_changes.extend(changes)
 
     kernel_stats = {
         "Kernel": kernel.name,
-        "Mean": np.mean(diffs),
-        "Median": np.median(diffs),
+        "Mean": np.mean(changes),
+        "Median": np.median(changes),
     }
 
     stats_data.append(kernel_stats)
 
 overall_stats = {
     "Kernel": "Overall",
-    "Mean": np.mean(all_differences),
-    "Median": np.median(all_differences),
+    "Mean": np.mean(all_changes),
+    "Median": np.median(all_changes),
 }
 
 stats_data.append(overall_stats)
diff --git a/rms_norm.py b/rms_norm.py
@@ -95,7 +95,7 @@ def triton_rms_norm(input, eps=1e-5):
             line_vals=["ninetoothed", "torch", "triton"],
             line_names=["NineToothed", "PyTorch", "Triton"],
             styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-            ylabel="GB/s",
+            ylabel="ms",
             plot_name="rms-norm-performance",
             args={"m": 4096},
         )
@@ -118,9 +118,6 @@ def benchmark(m, n, provider):
         elif provider == "triton":
             ms = triton.testing.do_bench(lambda: triton_rms_norm(input))
 
-        def gbps(ms):
-            return 2 * input.numel() * input.element_size() * 1e-6 / ms
-
-        return gbps(ms)
+        return ms
 
     benchmark.run(show_plots=True, print_data=True, save_path=".")
diff --git a/softmax.py b/softmax.py
@@ -96,7 +96,7 @@ def triton_softmax(input):
             line_vals=["ninetoothed", "torch", "triton"],
             line_names=["NineToothed", "PyTorch", "Triton"],
             styles=[("blue", "-"), ("green", "-"), ("orange", "-")],
-            ylabel="GB/s",
+            ylabel="ms",
             plot_name="softmax-performance",
             args={"m": 4096},
         )
@@ -117,9 +117,6 @@ def benchmark(m, n, provider):
         elif provider == "triton":
             ms = triton.testing.do_bench(lambda: triton_softmax(input))
 
-        def gbps(ms):
-            return 2 * input.numel() * input.element_size() * 1e-6 / ms
-
-        return gbps(ms)
+        return ms
 
     benchmark.run(show_plots=True, print_data=True, save_path=".")