Skip to content

Commit 69a8f9c

Browse files
Merge pull request #577 from janhq/update-dev-from-master-2026-07-02-01-12
Sync master with upstream release b9859
2 parents 165bae6 + 4fc4ec5 commit 69a8f9c

46 files changed

Lines changed: 5821 additions & 3106 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

common/arg.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -496,13 +496,15 @@ void common_models_handler_apply(common_models_handler & handler, common_params
496496
}
497497

498498
// handle hf_plan tasks
499-
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
499+
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files,
500+
const hf_cache::hf_file & primary,
501+
common_params_model & model) {
500502
for (size_t i = 0; i < model_files.size(); ++i) {
501503
auto & model_file = model_files[i];
502-
bool is_first = (i == 0);
503-
tasks.emplace_back(model_file, opts, [&, is_first]() {
504-
if (is_first) {
505-
// only use first part as model path
504+
bool is_primary = (model_file.path == primary.path);
505+
tasks.emplace_back(model_file, opts, [&, is_primary]() {
506+
if (is_primary) {
507+
// the primary file is the first split (00001-of), use it as model path
506508
model.path = hf_cache::finalize_file(model_file);
507509
} else {
508510
hf_cache::finalize_file(model_file);
@@ -511,7 +513,7 @@ void common_models_handler_apply(common_models_handler & handler, common_params
511513
}
512514
};
513515
if (!plan.model_files.empty()) {
514-
add_tasks(plan.model_files, params.model);
516+
add_tasks(plan.model_files, plan.primary, params.model);
515517
}
516518
if (!plan.mmproj.local_path.empty()) {
517519
tasks.emplace_back(plan.mmproj, opts, [&]() {
@@ -539,12 +541,12 @@ void common_models_handler_apply(common_models_handler & handler, common_params
539541

540542
// handle plan_spec (e.g. --spec-draft-hf)
541543
if (!plan_spec.model_files.empty()) {
542-
add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
544+
add_tasks(plan_spec.model_files, plan_spec.primary, params.speculative.draft.mparams);
543545
}
544546

545547
// handle vocoder plan (e.g. --hf-repo-v)
546548
if (!plan_voc.model_files.empty()) {
547-
add_tasks(plan_voc.model_files, params.vocoder.model);
549+
add_tasks(plan_voc.model_files, plan_voc.primary, params.vocoder.model);
548550
}
549551

550552
// run all tasks in parallel

docs/backend/OPENCL.md

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,26 @@
11
# llama.cpp for OpenCL
22

3-
- [Background](#background)
4-
- [OS](#os)
5-
- [Hardware](#hardware)
6-
- [DataType Supports](#datatype-supports)
7-
- [Model Preparation](#model-preparation)
8-
- [CMake Options](#cmake-options)
9-
- [Android](#android)
10-
- [Windows 11 Arm64](#windows-11-arm64)
11-
- [Linux](#Linux)
12-
- [Known Issue](#known-issues)
13-
- [TODO](#todo)
3+
- [llama.cpp for OpenCL](#llamacpp-for-opencl)
4+
- [Background](#background)
5+
- [Llama.cpp + OpenCL](#llamacpp--opencl)
6+
- [OS](#os)
7+
- [Hardware](#hardware)
8+
- [Adreno GPU](#adreno-gpu)
9+
- [DataType Supports](#datatype-supports)
10+
- [Model Preparation](#model-preparation)
11+
- [Binary Kernel Library](#binary-kernel-library)
12+
- [CMake Options](#cmake-options)
13+
- [Android](#android)
14+
- [I. Setup Environment](#i-setup-environment)
15+
- [II. Build llama.cpp](#ii-build-llamacpp)
16+
- [Windows 11 Arm64](#windows-11-arm64)
17+
- [I. Setup Environment](#i-setup-environment-1)
18+
- [II. Build llama.cpp](#ii-build-llamacpp-1)
19+
- [Linux](#linux)
20+
- [I. Setup Environment](#i-setup-environment-2)
21+
- [II. Build llama.cpp](#ii-build-llamacpp-2)
22+
- [Known Issues](#known-issues)
23+
- [TODO](#todo)
1424

1525
## Background
1626

@@ -34,11 +44,13 @@ The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adren
3444

3545
**Verified devices**
3646

37-
| Adreno GPU | Status |
38-
|:------------------------------------:|:-------:|
39-
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
40-
| Adreno 830 (Snapdragon 8 Elite) | Support |
41-
| Adreno X85 (Snapdragon X Elite) | Support |
47+
| Adreno GPU | Status |
48+
|:-------------------------------------:|:-------:|
49+
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
50+
| Adreno 830 (Snapdragon 8 Elite) | Support |
51+
| Adreno 840 (Snapdragon 8 Elite Gen 5) | Support |
52+
| Adreno X1-85 (Snapdragon X Elite) | Support |
53+
| Adreno X2-90 (Snapdragon X2 Elite) | Support |
4254

4355
> A6x GPUs with a recent driver and compiler are supported; they are usually found in IoT platforms.
4456
However, A6x GPUs in phones are likely not supported due to the outdated driver and compiler.
@@ -47,42 +59,43 @@ However, A6x GPUs in phones are likely not supported due to the outdated driver
4759

4860
| DataType | Status |
4961
|:----------------------:|:--------------------------:|
62+
| Q1_0 | Support |
5063
| Q4_0 | Support |
51-
| Q6_K | Support, but not optimized |
64+
| Q4_1 | Support |
65+
| Q5_0 | Support |
66+
| Q5_1 | Support |
5267
| Q8_0 | Support |
68+
| Q4_K | Support |
69+
| Q5_K | Support |
70+
| Q6_K | Support |
5371
| MXFP4 | Support |
72+
| IQ4_NL | Support |
5473

5574
## Model Preparation
5675

57-
You can refer to the general [llama-quantize tool](/tools/quantize/README.md) for steps to convert a model in Hugging Face safetensor format to GGUF with quantization.
76+
Since common quantizations are supported now, it is recommanded to download GGUF models directly from Huggingface.
5877

59-
Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize` (i.e., make all weights in `Q4_0`). For example,
78+
## Binary Kernel Library
6079

61-
```sh
62-
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
63-
```
64-
65-
Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.
66-
67-
### `MXFP4` MoE Models
68-
69-
OpenAI gpt-oss models are MoE models in `MXFP4`. The quantized model will be in `MXFP4_MOE`, a mixture of `MXFP4` and `Q8_0`.
70-
For this quantization, there is no need to specify `--pure`.
71-
For gpt-oss-20b model, you can directly [download](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) the quantized GGUF file in `MXFP4_MOE` from Hugging Face.
80+
A prebuilt binary kernel library has been introduced for Adreno GPUs.
81+
It currently targets X2 GPUs (X2-90, X2-85 and X2-45) found in Snapdragon X2 SoC.
82+
The library currently contains kernels for MUL_MAT_ID with Q4_0, Q4_1, Q4_K, MXFP4.
83+
The library must be manually downloaded from https://softwarecenter.qualcomm.com/catalog/item/Adreno_Kernel_Library_GGML.
7284

73-
Although it is possible to quantize gpt-oss-20b model in pure `Q4_0` (all weights in `Q4_0`), it is not recommended since `MXFP4` has been optimized for MoE while `Q4_0` is not. In addition, accuracy should degrade with such pure `Q4_0` quantization.
74-
Hence, using the default `MXFP4_MOE` quantization (see the link above) is recommended for this model.
85+
To allow using the kernel library, add `-DGGML_OPENCL_USE_ADRENO_BIN_KERNELS=ON` when configuring with CMake.
86+
Then, extract `adreno-opencl-kernels.dll` from the zip file downloaded from the above URL and put it alongside the executables.
87+
If kernels compatible with the current GPU are found in the library, they will be loaded and used.
7588

76-
> Note that the `Q4_0` model found [here](https://huggingface.co/unsloth/gpt-oss-20b-GGUF/blob/main/gpt-oss-20b-Q4_0.gguf) is a mixture of `Q4_0`, `Q8_0` and `MXFP4` and gives better performance than `MXFP4_MOE` quantization.
7789

7890
## CMake Options
7991

8092
The OpenCL backend has the following CMake options that control the behavior of the backend.
8193

82-
| CMake options | Default value | Description |
83-
|:---------------------------------:|:--------------:|:------------------------------------------|
84-
| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embed OpenCL kernels into the executable. |
85-
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Use kernels optimized for Adreno. |
94+
| CMake options | Default value | Description |
95+
|:------------------------------------:|:--------------:|:------------------------------------------|
96+
| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embed OpenCL kernels into the executable. |
97+
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Use kernels optimized for Adreno. |
98+
| `GGML_OPENCL_USE_ADRENO_BIN_KERNELS` | `OFF` | Allow using binary kernel lib for Adreno. |
8699

87100
## Android
88101

@@ -277,6 +290,5 @@ ninja
277290

278291
## TODO
279292

280-
- Optimization for Q6_K
281-
- Support and optimization for Q4_K
282293
- Improve flash attention
294+
- Improve OpenCL C kernels performance

ggml/src/ggml-common.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1111,11 +1111,12 @@ GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
11111111
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
11121112
GGML_TABLE_END()
11131113

1114-
// e2m1 values (doubled)
1114+
// e2m1 values (doubled), shared by MXFP4 and NVFP4
11151115
// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
1116-
GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
1116+
GGML_TABLE_BEGIN(int8_t, kvalues_fp4, 16)
11171117
0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
11181118
GGML_TABLE_END()
1119+
#define kvalues_mxfp4 kvalues_fp4
11191120

11201121
#define NGRID_IQ1S 2048
11211122
#define IQ1S_DELTA 0.125f

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
8383
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
8484
// quants.c
85-
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
8685
// repack.cpp
8786
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
8887
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4

ggml/src/ggml-cpu/arch/x86/quants.c

Lines changed: 142 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
934934

935935
#if defined __AVX2__
936936

937-
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
937+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
938938
const __m128i m4b = _mm_set1_epi8(0x0f);
939939
const __m256i mone = _mm256_set1_epi16(1);
940940

@@ -963,7 +963,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
963963
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
964964

965965
#elif defined __AVX__
966-
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
966+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
967967
const __m128i m4b = _mm_set1_epi8(0x0f);
968968

969969
__m256 accum = _mm256_setzero_ps();
@@ -993,14 +993,152 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
993993
int sumi1 = 0;
994994
int sumi2 = 0;
995995
for (int j = 0; j < QK_MXFP4/2; ++j) {
996-
sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
997-
sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
996+
sumi1 += y[ib].qs[j + 0] * kvalues_fp4[x[ib].qs[j] & 0xf];
997+
sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_fp4[x[ib].qs[j] >> 4];
998998
}
999999
sumf += d * (sumi1 + sumi2);
10001000
}
10011001
*s = sumf;
10021002
}
10031003

1004+
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1005+
assert(nrc == 1);
1006+
UNUSED(nrc);
1007+
UNUSED(bx);
1008+
UNUSED(by);
1009+
UNUSED(bs);
1010+
assert(n % QK_NVFP4 == 0);
1011+
1012+
const block_nvfp4 * GGML_RESTRICT x = vx;
1013+
const block_q8_0 * GGML_RESTRICT y = vy;
1014+
1015+
const int nb = n / QK_NVFP4;
1016+
int ib = 0;
1017+
float sumf = 0;
1018+
1019+
#if defined(__AVX2__)
1020+
1021+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
1022+
const __m128i m4b = _mm_set1_epi8(0x0f);
1023+
const __m256i mone = _mm256_set1_epi16(1);
1024+
1025+
__m256 accum = _mm256_setzero_ps();
1026+
for(; ib < nb; ib++){
1027+
1028+
const __m128i q4bits_01 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 0));
1029+
const __m128i q4bits_23 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 16));
1030+
1031+
const __m256i q8_01 = _mm256_loadu_si256((const __m256i *)y[2*ib + 0].qs);
1032+
const __m256i q8_23 = _mm256_loadu_si256((const __m256i *)y[2*ib + 1].qs);
1033+
1034+
const __m128i q4_01_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_01, m4b));
1035+
const __m128i q4_01_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_01, 4), m4b));
1036+
const __m128i q4_23_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_23, m4b));
1037+
const __m128i q4_23_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_23, 4), m4b));
1038+
1039+
//reordering
1040+
const __m256i q4_01 = MM256_SET_M128I(_mm_unpackhi_epi64(q4_01_lo,q4_01_hi), _mm_unpacklo_epi64(q4_01_lo,q4_01_hi));
1041+
const __m256i q4_23 = MM256_SET_M128I(_mm_unpackhi_epi64(q4_23_lo,q4_23_hi),_mm_unpacklo_epi64(q4_23_lo,q4_23_hi));
1042+
1043+
const __m256i p01 = mul_add_epi8(q4_01,q8_01);
1044+
const __m256i p_1 = _mm256_madd_epi16(p01, mone);
1045+
1046+
const __m256i p23 = mul_add_epi8(q4_23,q8_23);
1047+
const __m256i p_2 = _mm256_madd_epi16(p23, mone);
1048+
1049+
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
1050+
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
1051+
1052+
const float s0 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[0]) * dy0;
1053+
const float s1 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[1]) * dy0;
1054+
const float s2 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[2]) * dy1;
1055+
const float s3 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[3]) * dy1;
1056+
1057+
const __m256 scales01 = _mm256_set_m128(_mm_set1_ps(s1), _mm_set1_ps(s0));
1058+
const __m256 scales23 = _mm256_set_m128(_mm_set1_ps(s3), _mm_set1_ps(s2));
1059+
1060+
accum = _mm256_fmadd_ps(scales01, _mm256_cvtepi32_ps(p_1), accum);
1061+
accum = _mm256_fmadd_ps(scales23, _mm256_cvtepi32_ps(p_2), accum);
1062+
}
1063+
sumf = hsum_float_8(accum);
1064+
1065+
#elif defined(__AVX__)
1066+
1067+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
1068+
const __m128i m4b = _mm_set1_epi8(0x0f);
1069+
1070+
__m256 accum = _mm256_setzero_ps();
1071+
for(; ib < nb; ib++){
1072+
1073+
const __m128i q4bits_01 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 0));
1074+
const __m128i q4bits_23 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 16));
1075+
1076+
const __m128i q8_0 = _mm_loadu_si128((const __m128i *)(y[2*ib + 0].qs + 0));
1077+
const __m128i q8_1 = _mm_loadu_si128((const __m128i *)(y[2*ib + 0].qs + 16));
1078+
const __m128i q8_2 = _mm_loadu_si128((const __m128i *)(y[2*ib + 1].qs + 0));
1079+
const __m128i q8_3 = _mm_loadu_si128((const __m128i *)(y[2*ib + 1].qs + 16));
1080+
1081+
const __m128i q4_01_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_01, m4b));
1082+
const __m128i q4_01_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_01, 4), m4b));
1083+
const __m128i q4_23_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_23, m4b));
1084+
const __m128i q4_23_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_23, 4), m4b));
1085+
1086+
const __m128i q4_0 = _mm_unpacklo_epi64(q4_01_lo, q4_01_hi);
1087+
const __m128i q4_1 = _mm_unpackhi_epi64(q4_01_lo, q4_01_hi);
1088+
const __m128i q4_2 = _mm_unpacklo_epi64(q4_23_lo, q4_23_hi);
1089+
const __m128i q4_3 = _mm_unpackhi_epi64(q4_23_lo, q4_23_hi);
1090+
1091+
const __m128i p0_i32 = mul_sum_i8_pairs(q4_0, q8_0);
1092+
const __m128i p1_i32 = mul_sum_i8_pairs(q4_1, q8_1);
1093+
const __m128i p2_i32 = mul_sum_i8_pairs(q4_2, q8_2);
1094+
const __m128i p3_i32 = mul_sum_i8_pairs(q4_3, q8_3);
1095+
1096+
const __m128 p0 = _mm_cvtepi32_ps(p0_i32);
1097+
const __m128 p1 = _mm_cvtepi32_ps(p1_i32);
1098+
const __m128 p2 = _mm_cvtepi32_ps(p2_i32);
1099+
const __m128 p3 = _mm_cvtepi32_ps(p3_i32);
1100+
1101+
const __m256 p01 = _mm256_set_m128(p1, p0);
1102+
const __m256 p23 = _mm256_set_m128(p3, p2);
1103+
1104+
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
1105+
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
1106+
1107+
const float s0 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[0]) * dy0;
1108+
const float s1 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[1]) * dy0;
1109+
const float s2 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[2]) * dy1;
1110+
const float s3 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[3]) * dy1;
1111+
1112+
const __m256 scales01 = _mm256_set_m128(_mm_set1_ps(s1), _mm_set1_ps(s0));
1113+
const __m256 scales23 = _mm256_set_m128(_mm_set1_ps(s3), _mm_set1_ps(s2));
1114+
1115+
accum = _mm256_add_ps(accum, _mm256_mul_ps(p01, scales01));
1116+
accum = _mm256_add_ps(accum, _mm256_mul_ps(p23, scales23));
1117+
}
1118+
sumf = hsum_float_8(accum);
1119+
1120+
#endif
1121+
1122+
for (;ib < nb; ++ib) {
1123+
for (int s_idx = 0; s_idx < 4; ++s_idx) {
1124+
const float d = GGML_CPU_UE4M3_TO_FP32(x[ib].d[s_idx]);
1125+
const int q8_block = s_idx / 2;
1126+
const int q8_off = (s_idx % 2) * QK_NVFP4_SUB;
1127+
const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8_block].d);
1128+
1129+
int sumi_lo = 0, sumi_hi = 0;
1130+
for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
1131+
const uint8_t qv = x[ib].qs[s_idx*(QK_NVFP4_SUB/2) + j];
1132+
sumi_lo += y[2*ib + q8_block].qs[q8_off + j + 0] * kvalues_fp4[qv & 0xf];
1133+
sumi_hi += y[2*ib + q8_block].qs[q8_off + j + QK_NVFP4_SUB/2] * kvalues_fp4[qv >> 4];
1134+
}
1135+
1136+
sumf += dy * d * (sumi_lo + sumi_hi);
1137+
}
1138+
}
1139+
*s = sumf;
1140+
}
1141+
10041142
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10051143
const int qk = QK8_0;
10061144
const int nb = n / qk;

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ float ggml_table_f32_f16[1 << 16];
8282
// precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h)
8383
float ggml_table_f32_e8m0_half[1 << 8];
8484

85+
// precomputed f32 table for ue4m3 (1 KB) (simd-mappings.h)
86+
float ggml_table_f32_ue4m3[1 << 8];
87+
8588
#if defined(__ARM_ARCH)
8689
struct ggml_arm_arch_features_type {
8790
int sve_cnt;
@@ -3798,6 +3801,11 @@ void ggml_cpu_init(void) {
37983801
ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i);
37993802
}
38003803

3804+
// initialize UE4M3 table (256 entries)
3805+
for (int i = 0; i < (1 << 8); ++i) {
3806+
ggml_table_f32_ue4m3[i] = ggml_ue4m3_to_fp32(i);
3807+
}
3808+
38013809
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
38023810

38033811
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);

0 commit comments

Comments
 (0)