# Sweep configuration for the `single_device_bmm` microbenchmark.
#
# Each entry under `benchmarks` runs the same benchmark for one dtype and
# points trace_dir / csv_path / xlml_metrics_dir / xla_dump_dir at a
# dtype-specific directory so results from different dtypes do not collide.
#
# Every sweep uses b=1 with square shapes (m == k == n) and num_runs=100.
# Sizes are 128, 256, 512, 1024, 2048, 4096, 16384, 32768 -- note that 8192
# is not part of the sweep.
benchmarks:
# bfloat16
- benchmark_name: "single_device_bmm"
  trace_dir: "../microbenchmarks/single_device_bmm_bf16"
  csv_path: "../microbenchmarks/single_device_bmm_bf16"
  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_bf16"
  xla_dump_dir: "../microbenchmarks/single_device_bmm_bf16/hlo_graphs"
  benchmark_sweep_params:
  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'bfloat16'}
  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'bfloat16'}

# float32
- benchmark_name: "single_device_bmm"
  trace_dir: "../microbenchmarks/single_device_bmm_f32"
  csv_path: "../microbenchmarks/single_device_bmm_f32"
  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_f32"
  xla_dump_dir: "../microbenchmarks/single_device_bmm_f32/hlo_graphs"
  benchmark_sweep_params:
  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float32'}
  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float32'}

# float16
- benchmark_name: "single_device_bmm"
  trace_dir: "../microbenchmarks/single_device_bmm_fp16"
  csv_path: "../microbenchmarks/single_device_bmm_fp16"
  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp16"
  xla_dump_dir: "../microbenchmarks/single_device_bmm_fp16/hlo_graphs"
  benchmark_sweep_params:
  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float16'}
  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float16'}

# float8
# NOTE(review): 'float8' is not itself a concrete NumPy/JAX dtype name (the
# concrete ones carry a format suffix, e.g. float8_e4m3fn) -- confirm the
# benchmark's dtype parsing accepts this alias.
- benchmark_name: "single_device_bmm"
  trace_dir: "../microbenchmarks/single_device_bmm_fp8"
  csv_path: "../microbenchmarks/single_device_bmm_fp8"
  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp8"
  xla_dump_dir: "../microbenchmarks/single_device_bmm_fp8/hlo_graphs"
  benchmark_sweep_params:
  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float8'}
  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float8'}

# float4
# NOTE(review): same caveat as float8 -- 'float4' needs to be an alias the
# benchmark's dtype parsing understands (e.g. for float4_e2m1fn); confirm.
- benchmark_name: "single_device_bmm"
  trace_dir: "../microbenchmarks/single_device_bmm_fp4"
  csv_path: "../microbenchmarks/single_device_bmm_fp4"
  xlml_metrics_dir: "../microbenchmarks/single_device_bmm_fp4"
  xla_dump_dir: "../microbenchmarks/single_device_bmm_fp4/hlo_graphs"
  benchmark_sweep_params:
  - {b: 1, m: 128, k: 128, n: 128, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 256, k: 256, n: 256, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 512, k: 512, n: 512, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 1024, k: 1024, n: 1024, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 2048, k: 2048, n: 2048, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 4096, k: 4096, n: 4096, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 16384, k: 16384, n: 16384, num_runs: 100, dtype: 'float4'}
  - {b: 1, m: 32768, k: 32768, n: 32768, num_runs: 100, dtype: 'float4'}
0 commit comments