Skip to content

Commit ea74ec6

Browse files
committed
Add TODO for lookup table
1 parent 47b145d commit ea74ec6

2 files changed

Lines changed: 21 additions & 1 deletion

File tree

ggml/src/ggml-cpu/arch/x86/repack.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -764,6 +764,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
764764
std::is_same_v<block_tx8, block_iq4_nlx8>) {
765765
col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
766766
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
767+
//TODO: simd-ify
767768
col_scale_f32 = _mm512_set_ps(
768769
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[7]),
769770
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[6]),
@@ -974,6 +975,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
974975
std::is_same_v<block_tx8, block_iq4_nlx8>) {
975976
col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
976977
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
978+
//TODO: simd-ify
977979
col_scale_f32 = _mm512_set_ps(
978980
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[7]),
979981
GGML_CPU_E8M0_TO_FP32_HALF(b_ptr_1[b].e[6]),

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2974,7 +2974,25 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
29742974
}
29752975

29762976
if (node_n + 1 < cgraph->n_nodes) {
2977-
ggml_barrier(state->threadpool);
2977+
// Skip the barrier when both the current and next computable node have n_tasks == 1,
2978+
// since only thread 0 does work and no inter-thread synchronization is needed.
2979+
bool need_barrier = (ggml_get_n_tasks(node, params.nth) > 1);
2980+
if (!need_barrier) {
2981+
for (int j = node_n + 1; j < cgraph->n_nodes; j++) {
2982+
struct ggml_tensor * next = cgraph->nodes[j];
2983+
if (ggml_op_is_empty(next->op)) {
2984+
continue;
2985+
}
2986+
if ((next->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
2987+
continue;
2988+
}
2989+
need_barrier = (ggml_get_n_tasks(next, params.nth) > 1);
2990+
break;
2991+
}
2992+
}
2993+
if (need_barrier) {
2994+
ggml_barrier(state->threadpool);
2995+
}
29782996
}
29792997
}
29802998

0 commit comments

Comments
 (0)