Skip to content

Commit 2ae7f9b

Browse files
committed
minor fixes
1 parent 2d734c0 commit 2ae7f9b

14 files changed

Lines changed: 24 additions & 22 deletions

File tree

include/bfloat16.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@ class bfloat16 {
4848
operator float() const { return to_float(value); }
4949

5050
// Logical operators (!,||,&&) are covered if we can cast to bool
51-
explicit operator bool() { return to_float(value) != 0.0f; }
51+
explicit operator bool() const { return to_float(value) != 0.0f; }
5252

5353
// Unary minus operator overloading
54-
friend bfloat16 operator-(bfloat16 &lhs) {
54+
friend bfloat16 operator-(const bfloat16 &lhs) {
5555
return -to_float(lhs.value);
5656
}
5757

include/util.hpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <CL/opencl.hpp>
99

1010
#include <fstream>
11+
#include <iterator>
1112
#include <string>
1213

1314
static cl_version getDeviceOpenCLVersion(
@@ -79,11 +80,6 @@ static std::string readStringFromFile(
7980
return "";
8081
}
8182

82-
size_t filesize = 0;
83-
is.seekg(0, std::ios::end);
84-
filesize = (size_t)is.tellg();
85-
is.seekg(0, std::ios::beg);
86-
8783
std::string source{
8884
std::istreambuf_iterator<char>(is),
8985
std::istreambuf_iterator<char>() };

samples/20_matrixexperiments-bf16/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## Sample Purpose
44

5-
This sample demonstrates various techniques to perform a large matrix multiplcation where the matrix elements contain 16-bit `bfloat16` data.
5+
This sample demonstrates various techniques to perform a large matrix multiplication where the matrix elements contain 16-bit `bfloat16` data.
66
The sample includes many different implementations:
77

88
1. The "naive" implementation is a very simple implementation.
@@ -46,7 +46,7 @@ This sample will optionally use the following OpenCL extensions:
4646
| `--iterations <int>` | 16 | Specify the number of iterations for performance testing.
4747
| `--validate` | n/a | Validate results for correctness.
4848
| `--zero` | n/a | Initialize all matrices to zero.
49-
| `--identity` | n/a | Initialize all matrices to to one.
49+
| `--identity` | n/a | Initialize all matrices to one.
5050
| `--fixed` | n/a | Initialize all matrices to values computed from the matrix row and column.
5151
| `--emulate` | n/a | Do not use specialized matrix multiplication extensions.
5252
| `--wallclock` | n/a | Measure performance using wallclock time instead of event profiling.
@@ -57,4 +57,4 @@ This sample will optionally use the following OpenCL extensions:
5757

5858
By default, the source matrices are populated with random data.
5959
When validating results, it is recommended to use either "fixed" or "identity" data.
60-
For best performance, use "zero" data".
60+
For best performance, use "zero" data.

samples/20_matrixexperiments-bf16/main.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,7 @@ int main(int argc, char** argv)
758758
if (deviceIndex >= devices.size()) {
759759
printf("Requested device index is %d, but only %zu devices were found.\n",
760760
deviceIndex, devices.size());
761+
return -1;
761762
}
762763

763764
cl::Device& device = devices[deviceIndex];

samples/20_matrixexperiments-bf16/matrix_helpers_bf16.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ float4 activation(float4 f)
4343
return res;
4444
}
4545

46+
__attribute__((overloadable))
4647
float8 activation(float8 f)
4748
{
4849
float8 res;

samples/20_matrixexperiments-bf16/matrix_kernel_tiled_bf16.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
*/
66

77
#if !defined(tK)
8-
#error "tK is undefined! This should be defined as the K dimension of the matrix tiles, which is dependent on the elemement type, likely 16 or 32."
8+
#error "tK is undefined! This should be defined as the K dimension of the matrix tiles, which is dependent on the element type, likely 16 or 32."
99
#endif
1010

1111
#if !defined(MM)

samples/20_matrixexperiments-i8/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## Sample Purpose
44

5-
This sample demonstrates various techniques to perform a large matrix multiplcation where the matrix elements contain 8-bit integer data.
5+
This sample demonstrates various techniques to perform a large matrix multiplication where the matrix elements contain 8-bit integer data.
66
The sample includes many different implementations:
77

88
1. The "naive" implementation is a very simple implementation.
@@ -40,13 +40,13 @@ This sample will optionally use the following OpenCL extensions:
4040
|:--|:-:|:--|
4141
| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
4242
| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute the sample on.
43-
| `--file <string>` | `matrix_kernels_bf16.cl` | Specify the name of the file with the OpenCL kernel source.
43+
| `--file <string>` | `matrix_kernels_i8.cl` | Specify the name of the file with the OpenCL kernel source.
4444
| `--options <string>` | None | Specify optional program build options.
4545
| `--matrixsize <int>` | 512 | Specify the dimensions of the matrix.
4646
| `--iterations <int>` | 16 | Specify the number of iterations for performance testing.
4747
| `--validate` | n/a | Validate results for correctness.
4848
| `--zero` | n/a | Initialize all matrices to zero.
49-
| `--identity` | n/a | Initialize all matrices to to one.
49+
| `--identity` | n/a | Initialize all matrices to one.
5050
| `--fixed` | n/a | Initialize all matrices to values computed from the matrix row and column.
5151
| `--emulate` | n/a | Do not use specialized matrix multiplication extensions.
5252
| `--wallclock` | n/a | Measure performance using wallclock time instead of event profiling.
@@ -57,4 +57,4 @@ This sample will optionally use the following OpenCL extensions:
5757

5858
By default, the source matrices are populated with random data.
5959
When validating results, it is recommended to use either "fixed" or "identity" data.
60-
For best performance, use "zero" data".
60+
For best performance, use "zero" data.

samples/20_matrixexperiments-i8/main.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,7 @@ int main(int argc, char** argv)
519519
if (deviceIndex >= devices.size()) {
520520
printf("Requested device index is %d, but only %zu devices were found.\n",
521521
deviceIndex, devices.size());
522+
return -1;
522523
}
523524

524525
cl::Device& device = devices[deviceIndex];

samples/20_matrixexperiments-i8/matrix_helpers_i8.cl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ int4 activation(int4 i)
3434
return res;
3535
}
3636

37+
__attribute__((overloadable))
3738
int8 activation(int8 i)
3839
{
3940
int8 res;
@@ -153,7 +154,7 @@ int8 emu_sub_group_i8_i8_matrix_mad_k32(int8 a, int8 b, int8 acc)
153154
__attribute__((overloadable))
154155
int emu_sub_group_i8_i8_matrix_mad_k32(short a, int8 b, int acc)
155156
{
156-
float res = acc;
157+
int res = acc;
157158

158159
res = dp4(as_uint((short2)(sub_group_broadcast(a, 0), sub_group_broadcast(a, 1))), b.s0) + res;
159160
res = dp4(as_uint((short2)(sub_group_broadcast(a, 2), sub_group_broadcast(a, 3))), b.s1) + res;

samples/20_matrixexperiments-i8/matrix_kernels_i8.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ kernel void i8_naive(global int* C, global char* A, global char* B, int K)
3636
// For all i8 kernels tK == 32:
3737
#define tK 32
3838

39-
#if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_char) && defined(cl_intel_required_subgroup_size)
39+
#if defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short) && defined(cl_intel_subgroups_char) && defined(cl_intel_required_subgroup_size)
4040

4141
#if HAS_SG8
4242

@@ -582,6 +582,6 @@ kernel void i8_dpas_blockread_vnni_m8_n16(global int* C, global char* A, global
582582

583583
#endif // cl_intel_subgroup_2d_block_io
584584

585-
#endif // defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short) && defined(cl_intel_required_subgroup_size)
585+
#endif // defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short) && defined(cl_intel_subgroups_char) && defined(cl_intel_required_subgroup_size)
586586

587587
#undef tK

0 commit comments

Comments
 (0)