minor fixes

bashbaug · bashbaug · commit 2ae7f9b4fcce · 2026-06-01T22:47:57.000-07:00
diff --git a/include/bfloat16.hpp b/include/bfloat16.hpp
@@ -48,10 +48,10 @@ class bfloat16 {
   operator float() const { return to_float(value); }
 
   // Logical operators (!,||,&&) are covered if we can cast to bool
-  explicit operator bool() { return to_float(value) != 0.0f; }
+  explicit operator bool() const { return to_float(value) != 0.0f; }
 
   // Unary minus operator overloading
-  friend bfloat16 operator-(bfloat16 &lhs) {
+  friend bfloat16 operator-(const bfloat16 &lhs) {
     return -to_float(lhs.value);
   }
 
diff --git a/include/util.hpp b/include/util.hpp
@@ -8,6 +8,7 @@
 #include <CL/opencl.hpp>
 
 #include <fstream>
+#include <iterator>
 #include <string>
 
 static cl_version getDeviceOpenCLVersion(
@@ -79,11 +80,6 @@ static std::string readStringFromFile(
         return "";
     }
 
-    size_t filesize = 0;
-    is.seekg(0, std::ios::end);
-    filesize = (size_t)is.tellg();
-    is.seekg(0, std::ios::beg);
-
     std::string source{
         std::istreambuf_iterator<char>(is),
         std::istreambuf_iterator<char>() };
diff --git a/samples/20_matrixexperiments-bf16/README.md b/samples/20_matrixexperiments-bf16/README.md
@@ -2,7 +2,7 @@
 
 ## Sample Purpose
 
-This sample demonstrates various techniques to perform a large matrix multiplcation where the matrix elements contain 16-bit `bfloat16` data.
+This sample demonstrates various techniques to perform a large matrix multiplication where the matrix elements contain 16-bit `bfloat16` data.
 The sample includes many different implementations:
 
 1. The "naive" implementation is a very simple implementation.
@@ -46,7 +46,7 @@ This sample will optionally use the following OpenCL extensions:
 | `--iterations <int>` | 16 | Specify the number of iterations for performance testing.
 | `--validate` | n/a | Validate results for correctness.
 | `--zero` | n/a | Initialize all matrices to zero.
-| `--identity` | n/a | Initialize all matrices to to one.
+| `--identity` | n/a | Initialize all matrices to one.
 | `--fixed` | n/a | Initialize all matrices to values computed from the matrix row and column.
 | `--emulate` | n/a | Do not use specialized matrix multiplication extensions.
 | `--wallclock` | n/a | Measure performance using wallclock time instead of event profiling.
@@ -57,4 +57,4 @@ This sample will optionally use the following OpenCL extensions:
 
 By default, the source matrices are populated with random data.
 When validating results, it is recommended to use either "fixed" or "identity" data.
-For best performance, use "zero" data".
+For best performance, use "zero" data.
diff --git a/samples/20_matrixexperiments-bf16/main.cpp b/samples/20_matrixexperiments-bf16/main.cpp
@@ -758,6 +758,7 @@ int main(int argc, char** argv)
     if (deviceIndex >= devices.size()) {
         printf("Requested device index is %d, but only %zu devices were found.\n",
             deviceIndex, devices.size());
+        return -1;
     }
 
     cl::Device& device = devices[deviceIndex];
diff --git a/samples/20_matrixexperiments-bf16/matrix_helpers_bf16.cl b/samples/20_matrixexperiments-bf16/matrix_helpers_bf16.cl
@@ -43,6 +43,7 @@ float4 activation(float4 f)
     return res;
 }
 
+__attribute__((overloadable))
 float8 activation(float8 f)
 {
     float8 res;
diff --git a/samples/20_matrixexperiments-bf16/matrix_kernel_tiled_bf16.cl b/samples/20_matrixexperiments-bf16/matrix_kernel_tiled_bf16.cl
@@ -5,7 +5,7 @@
 */
 
 #if !defined(tK)
-#error "tK is undefined!  This should be defined as the K dimension of the matrix tiles, which is dependent on the elemement type, likely 16 or 32."
+#error "tK is undefined!  This should be defined as the K dimension of the matrix tiles, which is dependent on the element type, likely 16 or 32."
 #endif
 
 #if !defined(MM)
diff --git a/samples/20_matrixexperiments-i8/README.md b/samples/20_matrixexperiments-i8/README.md
@@ -2,7 +2,7 @@
 
 ## Sample Purpose
 
-This sample demonstrates various techniques to perform a large matrix multiplcation where the matrix elements contain 8-bit integer data.
+This sample demonstrates various techniques to perform a large matrix multiplication where the matrix elements contain 8-bit integer data.
 The sample includes many different implementations:
 
 1. The "naive" implementation is a very simple implementation.
@@ -40,13 +40,13 @@ This sample will optionally use the following OpenCL extensions:
 |:--|:-:|:--|
 | `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
 | `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute on the sample on.
-| `--file <string>` | `matrix_kernels_bf16.cl` | Specify the name of the file with the OpenCL kernel source.
+| `--file <string>` | `matrix_kernels_i8.cl` | Specify the name of the file with the OpenCL kernel source.
 | `--options <string>` | None | Specify optional program build options.
 | `--matrixsize <int>` | 512 | Specify the dimensions of the matrix.
 | `--iterations <int>` | 16 | Specify the number of iterations for performance testing.
 | `--validate` | n/a | Validate results for correctness.
 | `--zero` | n/a | Initialize all matrices to zero.
-| `--identity` | n/a | Initialize all matrices to to one.
+| `--identity` | n/a | Initialize all matrices to one.
 | `--fixed` | n/a | Initialize all matrices to values computed from the matrix row and column.
 | `--emulate` | n/a | Do not use specialized matrix multiplication extensions.
 | `--wallclock` | n/a | Measure performance using wallclock time instead of event profiling.
@@ -57,4 +57,4 @@ This sample will optionally use the following OpenCL extensions:
 
 By default, the source matrices are populated with random data.
 When validating results, it is recommended to use either "fixed" or "identity" data.
-For best performance, use "zero" data".
+For best performance, use "zero" data.
diff --git a/samples/20_matrixexperiments-i8/main.cpp b/samples/20_matrixexperiments-i8/main.cpp
@@ -519,6 +519,7 @@ int main(int argc, char** argv)
     if (deviceIndex >= devices.size()) {
         printf("Requested device index is %d, but only %zu devices were found.\n",
             deviceIndex, devices.size());
+        return -1;
     }
 
     cl::Device& device = devices[deviceIndex];
diff --git a/samples/20_matrixexperiments-i8/matrix_helpers_i8.cl b/samples/20_matrixexperiments-i8/matrix_helpers_i8.cl
@@ -34,6 +34,7 @@ int4 activation(int4 i)
     return res;
 }
 
+__attribute__((overloadable))
 int8 activation(int8 i)
 {
     int8 res;
@@ -153,7 +154,7 @@ int8 emu_sub_group_i8_i8_matrix_mad_k32(int8 a, int8 b, int8 acc)
 __attribute__((overloadable))
 int  emu_sub_group_i8_i8_matrix_mad_k32(short  a, int8 b, int  acc)
 {
-    float res = acc;
+    int res = acc;
 
     res = dp4(as_uint((short2)(sub_group_broadcast(a,  0), sub_group_broadcast(a,  1))), b.s0) + res;
     res = dp4(as_uint((short2)(sub_group_broadcast(a,  2), sub_group_broadcast(a,  3))), b.s1) + res;
diff --git a/samples/20_matrixexperiments-i8/matrix_kernels_i8.cl b/samples/20_matrixexperiments-i8/matrix_kernels_i8.cl
@@ -36,7 +36,7 @@ kernel void i8_naive(global int* C, global char* A, global char* B, int K)
 // For all i8 kernels tK == 32:
 #define tK 32
 
-#if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_char) && defined(cl_intel_required_subgroup_size)
+#if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short) && defined(cl_intel_subgroups_char) && defined(cl_intel_required_subgroup_size)
 
 #if HAS_SG8
 
@@ -582,6 +582,6 @@ kernel void i8_dpas_blockread_vnni_m8_n16(global int* C, global char* A, global
 
 #endif // cl_intel_subgroup_2d_block_io
 
-#endif // defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short) && defined(cl_intel_required_subgroup_size)
+#endif // defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short) && defined(cl_intel_subgroups_char) && defined(cl_intel_required_subgroup_size)
 
 #undef tK
diff --git a/samples/20_matrixexperiments-tf32/README.md b/samples/20_matrixexperiments-tf32/README.md
@@ -2,7 +2,7 @@
 
 ## Sample Purpose
 
-This sample demonstrates various techniques to perform a large matrix multiplcation where the matrix elements contain 32-bit `tf32` data.
+This sample demonstrates various techniques to perform a large matrix multiplication where the matrix elements contain 32-bit `tf32` data.
 The sample includes many different implementations:
 
 1. The "naive" implementation is a very simple implementation.
@@ -44,7 +44,7 @@ This sample will optionally use the following OpenCL extensions:
 | `--iterations <int>` | 16 | Specify the number of iterations for performance testing.
 | `--validate` | n/a | Validate results for correctness.
 | `--zero` | n/a | Initialize all matrices to zero.
-| `--identity` | n/a | Initialize all matrices to to one.
+| `--identity` | n/a | Initialize all matrices to one.
 | `--fixed` | n/a | Initialize all matrices to values computed from the matrix row and column.
 | `--emulate` | n/a | Do not use specialized matrix multiplication extensions.
 | `--wallclock` | n/a | Measure performance using wallclock time instead of event profiling.
@@ -55,4 +55,4 @@ This sample will optionally use the following OpenCL extensions:
 
 By default, the source matrices are populated with random data.
 When validating results, it is recommended to use either "fixed" or "identity" data.
-For best performance, use "zero" data".
+For best performance, use "zero" data.
diff --git a/samples/20_matrixexperiments-tf32/main.cpp b/samples/20_matrixexperiments-tf32/main.cpp
@@ -530,6 +530,7 @@ int main(int argc, char** argv)
     if (deviceIndex >= devices.size()) {
         printf("Requested device index is %d, but only %zu devices were found.\n",
             deviceIndex, devices.size());
+        return -1;
     }
 
     cl::Device& device = devices[deviceIndex];
diff --git a/samples/20_matrixexperiments-tf32/matrix_helpers_tf32.cl b/samples/20_matrixexperiments-tf32/matrix_helpers_tf32.cl
@@ -34,6 +34,7 @@ float4 activation(float4 f)
     return res;
 }
 
+__attribute__((overloadable))
 float8 activation(float8 f)
 {
     float8 res;
diff --git a/samples/20_matrixexperiments-tf32/matrix_kernel_tiled_tf32.cl b/samples/20_matrixexperiments-tf32/matrix_kernel_tiled_tf32.cl
@@ -5,7 +5,7 @@
 */
 
 #if !defined(tK)
-#error "tK is undefined!  This should be defined as the K dimension of the matrix tiles, which is dependent on the elemement type, likely 16 or 32."
+#error "tK is undefined!  This should be defined as the K dimension of the matrix tiles, which is dependent on the element type, likely 16 or 32."
 #endif
 
 #if !defined(MM)

Original file line number	Diff line number	Diff line change
`@@ -758,6 +758,7 @@ int main(int argc, char** argv)`
`758`	`758`	`if (deviceIndex >= devices.size()) {`
`759`	`759`	`printf("Requested device index is %d, but only %zu devices were found.\n",`
`760`	`760`	`deviceIndex, devices.size());`
	`761`	`+ return -1;`
`761`	`762`	`}`
`762`	`763`
`763`	`764`	`cl::Device& device = devices[deviceIndex];`
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ float4 activation(float4 f)`
`43`	`43`	`return res;`
`44`	`44`	`}`
`45`	`45`
	`46`	`+__attribute__((overloadable))`
`46`	`47`	`float8 activation(float8 f)`
`47`	`48`	`{`
`48`	`49`	`float8 res;`
Original file line number	Diff line number	Diff line change
`@@ -519,6 +519,7 @@ int main(int argc, char** argv)`
`519`	`519`	`if (deviceIndex >= devices.size()) {`
`520`	`520`	`printf("Requested device index is %d, but only %zu devices were found.\n",`
`521`	`521`	`deviceIndex, devices.size());`
	`522`	`+ return -1;`
`522`	`523`	`}`
`523`	`524`
`524`	`525`	`cl::Device& device = devices[deviceIndex];`
Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,7 @@ int4 activation(int4 i)`
`34`	`34`	`return res;`
`35`	`35`	`}`
`36`	`36`
	`37`	`+__attribute__((overloadable))`
`37`	`38`	`int8 activation(int8 i)`
`38`	`39`	`{`
`39`	`40`	`int8 res;`
`@@ -153,7 +154,7 @@ int8 emu_sub_group_i8_i8_matrix_mad_k32(int8 a, int8 b, int8 acc)`
`153`	`154`	`__attribute__((overloadable))`
`154`	`155`	`int emu_sub_group_i8_i8_matrix_mad_k32(short a, int8 b, int acc)`
`155`	`156`	`{`
`156`		`- float res = acc;`
	`157`	`+ int res = acc;`
`157`	`158`
`158`	`159`	`res = dp4(as_uint((short2)(sub_group_broadcast(a, 0), sub_group_broadcast(a, 1))), b.s0) + res;`
`159`	`160`	`res = dp4(as_uint((short2)(sub_group_broadcast(a, 2), sub_group_broadcast(a, 3))), b.s1) + res;`