Merge branch 'main' into cleanup-huggingface-hub-integration

Wauplin · web-flow · commit dc6ff64b512a · 2026-02-10T10:32:12.000+01:00
diff --git a/kernels/src/kernels/benchmark.py b/kernels/src/kernels/benchmark.py
@@ -7,6 +7,7 @@
 import subprocess
 import sys
 import time
+import warnings
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@@ -471,6 +472,7 @@ def run_benchmark_class(
     iterations: int,
     warmup: int,
     repo_id: str,
+    is_local: bool,
     revision: str,
 ) -> tuple[dict[str, TimingResults], str]:
     results = {}
@@ -486,9 +488,13 @@ def run_benchmark_class(
         raise RuntimeError(f"No benchmark_* methods found in {benchmark_cls.__name__}")
 
     # Load kernel once for all workloads
-    from kernels import get_kernel
+    from kernels import get_local_kernel, get_kernel
+
+    if is_local:
+        kernel = get_local_kernel(Path(repo_id), "activation")
+    else:
+        kernel = get_kernel(repo_id, revision=revision)
 
-    kernel = get_kernel(repo_id, revision=revision)
     kernel_sha = get_kernel_sha_from_build_name(kernel)
     backend_name = backend() if TORCH_AVAILABLE else "cpu"
     # Map backend names to torch device names
@@ -647,6 +653,7 @@ def run_benchmark_script(
     warmup: int,
     cwd: Path,
     repo_id: str,
+    is_local: bool,
     revision: str,
 ) -> tuple[dict[str, TimingResults], str]:
     print(f"Running {script_path.name}...", file=sys.stderr)
@@ -674,6 +681,7 @@ def run_benchmark_script(
             iterations=iterations,
             warmup=warmup,
             repo_id=repo_id,
+            is_local=is_local,
             revision=revision,
         )
         for name, timing in results.items():
@@ -717,6 +725,24 @@ def run_benchmark(
     # Suppress progress bars for cleaner output (files are often cached)
     disable_progress_bars()
 
+    repo_id_path = Path(repo_id)
+
+    if repo_id_path.is_absolute():
+        is_local = repo_id_path.exists()
+    else:
+        is_local = (Path.cwd() / repo_id_path).exists()
+        repo_id_path = Path.cwd() / repo_id_path
+
+    if is_local:
+        if repo_id.count("/") == 1 and not repo_id.startswith(("./", "../")):
+            warnings.warn(
+                f"'{repo_id}' exists locally but looks like a repo_id. "
+                f"Use './{repo_id}' to be explicit.",
+                stacklevel=2,
+            )
+        branch = "local"
+        version = None
+
     # Requires either branch or version or parses from repo_id
     if branch is None and version is None:
         if "@" not in repo_id:
@@ -739,7 +765,11 @@ def run_benchmark(
     assert revision is not None  # Guaranteed by parsing logic above
 
     print(f"Downloading {repo_id}@{revision}...", file=sys.stderr)
-    repo_path = Path(str(_get_hf_api().snapshot_download(repo_id=repo_id, revision=revision)))
+
+    if is_local:
+        repo_path = repo_id_path.resolve()
+    else:
+        repo_path = Path(str(_get_hf_api().snapshot_download(repo_id=repo_id, revision=revision)))
 
     scripts = discover_benchmark_scripts(repo_id, repo_path)
 
@@ -753,6 +783,7 @@ def run_benchmark(
                 warmup=warmup,
                 cwd=repo_path,
                 repo_id=repo_id,
+                is_local=is_local,
                 revision=revision,
             )
             timing_results.update(results)
diff --git a/kernels/src/kernels/benchmarks/activation.py b/kernels/src/kernels/benchmarks/activation.py
@@ -9,8 +9,8 @@ class SiluAndMulBenchmark(Benchmark):
 
     # Workload: small
     def setup_small(self):
-        self.x = torch.randn(1, 128, 512, device="cuda", dtype=torch.float16)
-        self.out = torch.empty(1, 128, 256, device="cuda", dtype=torch.float16)
+        self.x = torch.randn(8, 1024, 2048, device=self.device, dtype=torch.float16)
+        self.out = torch.empty(8, 1024, 1024, device=self.device, dtype=torch.float16)
 
     def benchmark_small(self):
         self.kernel.silu_and_mul(self.out, self.x)
@@ -21,8 +21,8 @@ def verify_small(self) -> torch.Tensor:
 
     # Workload: medium
     def setup_medium(self):
-        self.x = torch.randn(4, 512, 1024, device="cuda", dtype=torch.float16)
-        self.out = torch.empty(4, 512, 512, device="cuda", dtype=torch.float16)
+        self.x = torch.randn(8, 2048, 4096, device=self.device, dtype=torch.float16)
+        self.out = torch.empty(8, 2048, 2048, device=self.device, dtype=torch.float16)
 
     def benchmark_medium(self):
         self.kernel.silu_and_mul(self.out, self.x)
@@ -33,12 +33,53 @@ def verify_medium(self) -> torch.Tensor:
 
     # Workload: large
     def setup_large(self):
-        self.x = torch.randn(8, 1024, 2048, device="cuda", dtype=torch.float16)
-        self.out = torch.empty(8, 1024, 1024, device="cuda", dtype=torch.float16)
+        self.x = torch.randn(8, 4096, 8192, device=self.device, dtype=torch.float16)
+        self.out = torch.empty(8, 4096, 4096, device=self.device, dtype=torch.float16)
 
     def benchmark_large(self):
         self.kernel.silu_and_mul(self.out, self.x)
+        self.kernel.silu_and_mul(self.out, self.x)
 
     def verify_large(self) -> torch.Tensor:
         d = self.x.shape[-1] // 2
         return F.silu(self.x[..., :d]) * self.x[..., d:]
+
+
+class GeluAndMulBenchmark(Benchmark):
+    seed: int = 42
+
+    # Workload: small
+    def setup_small(self):
+        self.x = torch.randn(8, 1024, 2048, device=self.device, dtype=torch.float16)
+        self.out = torch.empty(8, 1024, 1024, device=self.device, dtype=torch.float16)
+
+    def benchmark_small(self):
+        self.kernel.gelu_and_mul(self.out, self.x)
+
+    def verify_small(self) -> torch.Tensor:
+        d = self.x.shape[-1] // 2
+        return F.gelu(self.x[..., :d]) * self.x[..., d:]
+
+    # Workload: medium
+    def setup_medium(self):
+        self.x = torch.randn(8, 2048, 4096, device=self.device, dtype=torch.float16)
+        self.out = torch.empty(8, 2048, 2048, device=self.device, dtype=torch.float16)
+
+    def benchmark_medium(self):
+        self.kernel.gelu_and_mul(self.out, self.x)
+
+    def verify_medium(self) -> torch.Tensor:
+        d = self.x.shape[-1] // 2
+        return F.gelu(self.x[..., :d]) * self.x[..., d:]
+
+    # Workload: large
+    def setup_large(self):
+        self.x = torch.randn(8, 4096, 8192, device=self.device, dtype=torch.float16)
+        self.out = torch.empty(8, 4096, 4096, device=self.device, dtype=torch.float16)
+
+    def benchmark_large(self):
+        self.kernel.gelu_and_mul(self.out, self.x)
+
+    def verify_large(self) -> torch.Tensor:
+        d = self.x.shape[-1] // 2
+        return F.gelu(self.x[..., :d]) * self.x[..., d:]