|
14 | 14 |
|
15 | 15 | from huggingface_hub.utils import build_hf_headers, disable_progress_bars, get_session, hf_raise_for_status |
16 | 16 |
|
| 17 | +from kernels.benchmark import Benchmark |
17 | 18 | from kernels.utils import _get_hf_api, backend |
18 | 19 |
|
19 | 20 | MISSING_DEPS: list[str] = [] |
@@ -63,43 +64,6 @@ def _calculate_iqr_and_outliers( |
63 | 64 | return q1, q3, iqr, outliers |
64 | 65 |
|
65 | 66 |
|
66 | | -class Benchmark: |
67 | | - """Base class for kernel benchmarks. |
68 | | -
|
69 | | - Subclass this to create a benchmark script with automatic timing, |
70 | | - verification, and reproducibility support. The kernel is loaded |
71 | | - automatically from the repo_id specified in the CLI command. |
72 | | -
|
73 | | - Example: |
74 | | - class MyBenchmark(Benchmark): |
75 | | - seed = 42 |
76 | | -
|
77 | | - def setup(self): |
78 | | - self.x = torch.randn(128, 1024, device=self.device, dtype=torch.float16) |
79 | | - self.out = torch.empty(128, 512, device=self.device, dtype=torch.float16) |
80 | | -
|
81 | | - def benchmark_silu(self): |
82 | | - self.kernel.silu_and_mul(self.out, self.x) |
83 | | -
|
84 | | - def verify_silu(self) -> torch.Tensor: |
85 | | - # Return reference tensor; runner compares with self.out |
86 | | - return torch.nn.functional.silu(self.x[..., :512]) * self.x[..., 512:] |
87 | | -
|
88 | | - Run with: kernels benchmark <repo_id> |
89 | | - """ |
90 | | - |
91 | | - seed: int | None = None # Optional: seed for reproducibility |
92 | | - device: str = "cpu" # Set automatically by runner |
93 | | - |
94 | | - def __init__(self) -> None: |
95 | | - self.kernel: Any = None |
96 | | - self.out: Any = None # Output tensor, set by setup methods |
97 | | - |
98 | | - def setup(self) -> None: |
99 | | - """Override to set up tensors as instance attributes.""" |
100 | | - pass |
101 | | - |
102 | | - |
103 | 67 | @dataclass |
104 | 68 | class TimingResults: |
105 | 69 | mean_ms: float |
@@ -319,9 +283,7 @@ def _get_macos_gpu() -> tuple[str | None, int | None]: |
319 | 283 | from ctypes import POINTER, byref, c_char_p, c_int, c_int64, c_uint32, c_void_p |
320 | 284 |
|
321 | 285 | iokit = ctypes.CDLL("/System/Library/Frameworks/IOKit.framework/IOKit") |
322 | | - cf = ctypes.CDLL( |
323 | | - "/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation" |
324 | | - ) |
| 286 | + cf = ctypes.CDLL("/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation") |
325 | 287 |
|
326 | 288 | iokit.IOServiceMatching.restype = c_void_p |
327 | 289 | iokit.IOServiceMatching.argtypes = [c_char_p] |
@@ -382,9 +344,7 @@ def _get_macos_gpu() -> tuple[str | None, int | None]: |
382 | 344 | cf.CFRelease(key) |
383 | 345 |
|
384 | 346 | # Get GPU core count |
385 | | - key = cf.CFStringCreateWithCString( |
386 | | - None, b"gpu-core-count", kCFStringEncodingUTF8 |
387 | | - ) |
| 347 | + key = cf.CFStringCreateWithCString(None, b"gpu-core-count", kCFStringEncodingUTF8) |
388 | 348 | if key: |
389 | 349 | prop = iokit.IORegistryEntryCreateCFProperty(service, key, None, 0) |
390 | 350 | if prop: |
@@ -425,9 +385,7 @@ def collect_machine_info() -> MachineInfo: |
425 | 385 | if hasattr(torch.version, "hip") and torch.version.hip: |
426 | 386 | backend_type = f"ROCm {torch.version.hip}" |
427 | 387 | else: |
428 | | - backend_type = ( |
429 | | - f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA" |
430 | | - ) |
| 388 | + backend_type = f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA" |
431 | 389 | elif backend_name == "xpu": |
432 | 390 | gpu = torch.xpu.get_device_name(0) |
433 | 391 | backend_type = "XPU" |
@@ -479,16 +437,14 @@ def run_benchmark_class( |
479 | 437 |
|
480 | 438 | # Find all benchmark_* methods |
481 | 439 | benchmark_methods = [ |
482 | | - name |
483 | | - for name in dir(benchmark_cls) |
484 | | - if name.startswith("benchmark_") and callable(getattr(benchmark_cls, name)) |
| 440 | + name for name in dir(benchmark_cls) if name.startswith("benchmark_") and callable(getattr(benchmark_cls, name)) |
485 | 441 | ] |
486 | 442 |
|
487 | 443 | if not benchmark_methods: |
488 | 444 | raise RuntimeError(f"No benchmark_* methods found in {benchmark_cls.__name__}") |
489 | 445 |
|
490 | 446 | # Load kernel once for all workloads |
491 | | - from kernels import get_local_kernel, get_kernel |
| 447 | + from kernels import get_kernel, get_local_kernel |
492 | 448 |
|
493 | 449 | if is_local: |
494 | 450 | kernel = get_local_kernel(Path(repo_id), "activation") |
@@ -663,9 +619,7 @@ def run_benchmark_script( |
663 | 619 | raise RuntimeError(f"No Benchmark subclasses found in {script_path}") |
664 | 620 |
|
665 | 621 | machine_info = collect_machine_info() |
666 | | - gpu_cores_str = ( |
667 | | - f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else "" |
668 | | - ) |
| 622 | + gpu_cores_str = f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else "" |
669 | 623 | print(file=sys.stderr) |
670 | 624 | print(f" GPU {machine_info.gpu}{gpu_cores_str}", file=sys.stderr) |
671 | 625 | print(f" CPU {machine_info.cpu}", file=sys.stderr) |
@@ -736,8 +690,7 @@ def run_benchmark( |
736 | 690 | if is_local: |
737 | 691 | if repo_id.count("/") == 1 and not repo_id.startswith(("./", "../")): |
738 | 692 | warnings.warn( |
739 | | - f"'{repo_id}' exists locally but looks like a repo_id. " |
740 | | - f"Use './{repo_id}' to be explicit.", |
| 693 | + f"'{repo_id}' exists locally but looks like a repo_id. Use './{repo_id}' to be explicit.", |
741 | 694 | stacklevel=2, |
742 | 695 | ) |
743 | 696 | branch = "local" |
@@ -765,7 +718,6 @@ def run_benchmark( |
765 | 718 | assert revision is not None # Guaranteed by parsing logic above |
766 | 719 |
|
767 | 720 | print(f"Downloading {repo_id}@{revision}...", file=sys.stderr) |
768 | | - |
769 | 721 | if is_local: |
770 | 722 | repo_path = repo_id_path.resolve() |
771 | 723 | else: |
|
0 commit comments