Run tasks in run_experiments.py instead of compare_performance_metrics.py

voltjia · voltjia · commit 0f8477ce89d3 · 2025-05-21T18:43:39.000+08:00
diff --git a/compare_performance_metrics.py b/compare_performance_metrics.py
@@ -1,125 +1,19 @@
-import functools
-import random
-
 import matplotlib.pyplot as plt
 import pandas as pd
-import torch
-import torch.nn.functional
-import triton
 
-import ops.ninetoothed.torch
-import ops.triton.torch
-import rotary_position_embedding
 from compare_code_metrics import _BACKSLASH_CHAR
 
-
-def _run_task(op_name, dtype, device, *arg_shapes, **kwarg_shapes):
-    ninetoothed_op = getattr(ops.ninetoothed.torch, op_name)
-    triton_op = getattr(ops.triton.torch, op_name)
-
-    if op_name == "rotary_position_embedding":
-        torch_op = rotary_position_embedding.torch_rotary_position_embedding
-    else:
-        torch_op = (
-            getattr(torch, op_name)
-            if hasattr(torch, op_name)
-            else getattr(torch.nn.functional, op_name)
-        )
-
-    if op_name == "rms_norm":
-        torch_op = functools.partial(torch_op, normalized_shape=arg_shapes[0][-1:])
-    elif op_name == "softmax":
-        torch_op = functools.partial(torch_op, dim=-1)
-
-    args = tuple(
-        torch.randn(shape, dtype=dtype, device=device) if shape else random.gauss(0, 1)
-        for shape in arg_shapes
-    )
-    kwargs = {
-        key: torch.randn(shape, dtype=dtype, device=device)
-        if shape
-        else random.gauss(0, 1)
-        for key, shape in kwarg_shapes.items()
-    }
-
-    arg_shape_string = ", ".join(str(shape) for shape in arg_shapes)
-    kwarg_shape_string = ", ".join(
-        f"{key}={shape}" for key, shape in kwarg_shapes.items()
-    )
-    shape_string = (
-        f"{arg_shape_string}, {kwarg_shape_string}"
-        if kwarg_shape_string
-        else arg_shape_string
-    )
-
-    task_description = f"{op_name}({shape_string})"
-
-    return task_description, _benchmark_ops(
-        (ninetoothed_op, triton_op, torch_op), *args, **kwargs
-    )
-
-
-def _benchmark_ops(ops, *args, **kwargs):
-    assert all(
-        torch.allclose(
-            op(*args, **kwargs), ops[0](*args, **kwargs), rtol=0.01, atol=0.01
-        )
-        for op in ops[1:]
-    )
-
-    return tuple(triton.testing.do_bench(lambda: op(*args, **kwargs)) for op in ops)
-
-
 if __name__ == "__main__":
-    random.seed(0)
-    torch.manual_seed(0)
-
     plt.rcParams["figure.dpi"] = 600
     plt.rcParams["font.family"] = "Linux Biolinum"
 
-    dtype = torch.float16
-    device = "cuda"
-
-    tasks = (
-        ("add", ((4096 * 4096,), (4096 * 4096,)), {}),
-        (
-            "addmm",
-            ((4096, 4096), (4096, 4096), (4096, 4096)),
-            {"beta": (), "alpha": ()},
-        ),
-        ("bmm", ((4, 2048, 2048), (4, 2048, 2048)), {}),
-        ("conv2d", ((4, 512, 14, 14), (512, 512, 3, 3)), {}),
-        ("mm", ((4096, 4096), (4096, 4096)), {}),
-        ("rms_norm", ((4096, 4096),), {}),
-        ("rotary_position_embedding", ((4, 1024, 48, 64), (1024, 32), (1024, 32)), {}),
-        (
-            "scaled_dot_product_attention",
-            ((4, 48, 1024, 64), (4, 48, 1024, 64), (4, 48, 1024, 64)),
-            {},
-        ),
-        ("silu", ((4096 * 4096,),), {}),
-        ("softmax", ((4096, 4096),), {}),
-    )
-
-    data = {"Task": [], "NineToothed": [], "Triton": [], "PyTorch": []}
-
-    for name, args, kwargs in tasks:
-        description, results = _run_task(name, dtype, device, *args, **kwargs)
-
-        latex_item = f"\item {_BACKSLASH_CHAR}texttt{{{description.replace('scaled_dot_product_attention', 'sdpa').replace('rotary_position_embedding', 'rope').replace('_', f'{_BACKSLASH_CHAR}_')}}}"
+    df = pd.read_csv("performance-metrics.csv")
 
+    for task in df["Task"]:
+        latex_item = f"\item {_BACKSLASH_CHAR}texttt{{{task.replace('scaled_dot_product_attention', 'sdpa').replace('rotary_position_embedding', 'rope').replace('_', f'{_BACKSLASH_CHAR}_')}}}"
         print(latex_item)
 
-        data["Task"].append(description)
-
-        for i, provider in enumerate(("NineToothed", "Triton", "PyTorch")):
-            data[provider].append(results[i])
-
-    df = pd.DataFrame(data)
     df.index += 1
-
-    df.set_index("Task").to_csv("performance-metrics.csv")
-
     df.plot(kind="bar", rot=0)
     plt.ylabel("Execution Time (ms)")
     plt.xlabel("Task")
diff --git a/run_experiments.py b/run_experiments.py
@@ -1,6 +1,17 @@
 import argparse
+import functools
+import random
 import subprocess
 
+import pandas as pd
+import torch
+import torch.nn.functional
+import triton
+
+import ops.ninetoothed.torch
+import ops.triton.torch
+import rotary_position_embedding
+
 PROMPTS = (
     "The emergence of deep learning domain-specific languages (DSLs) has substantially reduced the obstacles in developing high-performance, cross-platform compute kernels, but current DSLs",
     "Driven by recent advancements in the AI industry, the AI accelerator sector has increasingly diversified, with vendors developing their own hardware architectures and programming models, such as NVIDIA",
@@ -15,6 +26,63 @@
 ALL_MAX_NEW_TOKENS = (128, 512, 2048)
 
 
+def _run_task(op_name, dtype, device, *arg_shapes, **kwarg_shapes):
+    ninetoothed_op = getattr(ops.ninetoothed.torch, op_name)
+    triton_op = getattr(ops.triton.torch, op_name)
+
+    if op_name == "rotary_position_embedding":
+        torch_op = rotary_position_embedding.torch_rotary_position_embedding
+    else:
+        torch_op = (
+            getattr(torch, op_name)
+            if hasattr(torch, op_name)
+            else getattr(torch.nn.functional, op_name)
+        )
+
+    if op_name == "rms_norm":
+        torch_op = functools.partial(torch_op, normalized_shape=arg_shapes[0][-1:])
+    elif op_name == "softmax":
+        torch_op = functools.partial(torch_op, dim=-1)
+
+    args = tuple(
+        torch.randn(shape, dtype=dtype, device=device) if shape else random.gauss(0, 1)
+        for shape in arg_shapes
+    )
+    kwargs = {
+        key: torch.randn(shape, dtype=dtype, device=device)
+        if shape
+        else random.gauss(0, 1)
+        for key, shape in kwarg_shapes.items()
+    }
+
+    arg_shape_string = ", ".join(str(shape) for shape in arg_shapes)
+    kwarg_shape_string = ", ".join(
+        f"{key}={shape}" for key, shape in kwarg_shapes.items()
+    )
+    shape_string = (
+        f"{arg_shape_string}, {kwarg_shape_string}"
+        if kwarg_shape_string
+        else arg_shape_string
+    )
+
+    task_description = f"{op_name}({shape_string})"
+
+    return task_description, _benchmark_ops(
+        (ninetoothed_op, triton_op, torch_op), *args, **kwargs
+    )
+
+
+def _benchmark_ops(ops, *args, **kwargs):
+    assert all(
+        torch.allclose(
+            op(*args, **kwargs), ops[0](*args, **kwargs), rtol=0.01, atol=0.01
+        )
+        for op in ops[1:]
+    )
+
+    return tuple(triton.testing.do_bench(lambda: op(*args, **kwargs)) for op in ops)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run experiments.")
 
@@ -29,6 +97,9 @@
 
     model_name_or_path = args.model
 
+    random.seed(0)
+    torch.manual_seed(0)
+
     radon_commands = (
         (
             "radon",
@@ -50,6 +121,44 @@
     with open("code_metrics.tex", "w") as f:
         subprocess.run(("python", "compare_code_metrics.py"), stdout=f, check=True)
 
+    dtype = torch.float16
+    device = "cuda"
+
+    tasks = (
+        ("add", ((4096 * 4096,), (4096 * 4096,)), {}),
+        (
+            "addmm",
+            ((4096, 4096), (4096, 4096), (4096, 4096)),
+            {"beta": (), "alpha": ()},
+        ),
+        ("bmm", ((4, 2048, 2048), (4, 2048, 2048)), {}),
+        ("conv2d", ((4, 512, 14, 14), (512, 512, 3, 3)), {}),
+        ("mm", ((4096, 4096), (4096, 4096)), {}),
+        ("rms_norm", ((4096, 4096),), {}),
+        ("rotary_position_embedding", ((4, 1024, 48, 64), (1024, 32), (1024, 32)), {}),
+        (
+            "scaled_dot_product_attention",
+            ((4, 48, 1024, 64), (4, 48, 1024, 64), (4, 48, 1024, 64)),
+            {},
+        ),
+        ("silu", ((4096 * 4096,),), {}),
+        ("softmax", ((4096, 4096),), {}),
+    )
+
+    data = {"Task": [], "NineToothed": [], "Triton": [], "PyTorch": []}
+
+    for name, args, kwargs in tasks:
+        description, results = _run_task(name, dtype, device, *args, **kwargs)
+
+        data["Task"].append(description)
+
+        for i, provider in enumerate(("NineToothed", "Triton", "PyTorch")):
+            data[provider].append(results[i])
+
+    df = pd.DataFrame(data)
+
+    df.set_index("Task").to_csv("performance-metrics.csv")
+
     for max_new_tokens in ALL_MAX_NEW_TOKENS:
         for backend in BACKENDS:
             with open(f"infer_{max_new_tokens}_{backend}.json", "w") as f: