Skip to content

Commit c159213

Browse files
Merge pull request #499 from janhq/update-dev-from-master-2026-04-28-01-03
Sync master with upstream release b8953
2 parents 4d0038c + 434b2a1 commit c159213

29 files changed

Lines changed: 936 additions & 132 deletions

common/debug.cpp

Lines changed: 40 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,38 @@
11
#include "debug.h"
22

3+
#include "common.h"
34
#include "log.h"
45

56
#include <cmath>
7+
#include <regex>
68
#include <string>
9+
#include <vector>
10+
11+
struct common_debug_cb_user_data::impl {
12+
std::vector<uint8_t> data;
13+
std::vector<std::regex> tensor_filters;
14+
bool abort_on_nan{false};
15+
};
16+
17+
common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
18+
common_debug_cb_user_data::~common_debug_cb_user_data() = default;
19+
20+
common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
21+
: pimpl(std::make_unique<impl>())
22+
{
23+
for (const auto & pattern : filter_patterns) {
24+
try {
25+
std::string anchored_pattern = "^" + pattern;
26+
pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
27+
} catch (const std::regex_error & e) {
28+
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
29+
}
30+
}
31+
pimpl->abort_on_nan = abort_on_nan;
32+
33+
params.cb_eval = common_debug_cb_eval;
34+
params.cb_eval_user_data = this;
35+
}
736

837
static std::string common_ggml_ne_string(const ggml_tensor * t) {
938
std::string str;
@@ -47,8 +76,7 @@ static float common_ggml_get_float_value(const uint8_t * data,
4776

4877
#define INDENT " "
4978

50-
template <bool abort>
51-
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
79+
static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
5280
GGML_ASSERT(n > 0);
5381
float sum = 0;
5482
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -94,7 +122,7 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
94122
LOG(INDENT "sum = %f\n", sum);
95123
}
96124

97-
if constexpr (abort) {
125+
if (abort_on_nan) {
98126
if (std::isnan(sum)) {
99127
LOG("encountered NaN - aborting\n");
100128
exit(0);
@@ -112,8 +140,9 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
112140
* @param user_data user data to pass at each call back
113141
* @return true to receive data or continue the graph, false otherwise
114142
*/
115-
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
116-
auto * cb_data = (base_callback_data *) user_data;
143+
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
144+
auto * cb_data = (common_debug_cb_user_data *) user_data;
145+
auto * pimpl = cb_data->pimpl.get();
117146

118147
const struct ggml_tensor * src0 = t->src[0];
119148
const struct ggml_tensor * src1 = t->src[1];
@@ -122,10 +151,10 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
122151
return true; // Always retrieve data
123152
}
124153

125-
bool matches_filter = cb_data->tensor_filters.empty();
154+
bool matches_filter = pimpl->tensor_filters.empty();
126155

127156
if (!matches_filter) {
128-
for (const auto & filter : cb_data->tensor_filters) {
157+
for (const auto & filter : pimpl->tensor_filters) {
129158
if (std::regex_search(t->name, filter)) {
130159
matches_filter = true;
131160
break;
@@ -148,20 +177,14 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
148177

149178
if (!is_host) {
150179
auto n_bytes = ggml_nbytes(t);
151-
cb_data->data.resize(n_bytes);
152-
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
180+
pimpl->data.resize(n_bytes);
181+
ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
153182
}
154183

155184
if (!ggml_is_quantized(t->type) && matches_filter) {
156-
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
157-
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
185+
uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
186+
common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
158187
}
159188

160189
return true;
161190
}
162-
163-
// Explicit template instantiations
164-
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
165-
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
166-
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
167-
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);

common/debug.h

Lines changed: 19 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,43 +1,31 @@
11
#pragma once
2-
#include "common.h"
2+
3+
#include <memory>
34
#include <string>
45
#include <vector>
5-
#include <regex>
66

77
// common debug functions and structs
88

9-
// Print a tensor's detailed data
10-
// data - the tensor's data in byte format
11-
// type - the tensor's quantization type
12-
// ne - the tensor dimensions array
13-
// nb - the tensor strides array
14-
// n - the number of rows/columns to fully print
15-
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
9+
struct common_params;
1610

1711
// Intended to use as callback for ggml_backend_sched_eval_callback
1812
// prints tensors that are processed in the computation graph
19-
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
20-
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
21-
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
13+
// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
14+
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
15+
// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
2216
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
2317
// The callback data will be passed as the third parameter (user_data)
24-
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
25-
struct base_callback_data {
26-
std::vector<uint8_t> data;
27-
std::vector<std::regex> tensor_filters;
28-
29-
base_callback_data() = default;
30-
31-
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
32-
for (const auto & pattern : filter_patterns) {
33-
try {
34-
std::string anchored_pattern = "^" + pattern;
35-
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
36-
} catch (const std::regex_error & e) {
37-
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
38-
}
39-
}
40-
params.cb_eval = common_debug_cb_eval<false>;
41-
params.cb_eval_user_data = this;
42-
}
18+
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
19+
20+
struct common_debug_cb_user_data {
21+
struct impl;
22+
std::unique_ptr<impl> pimpl;
23+
24+
common_debug_cb_user_data();
25+
~common_debug_cb_user_data();
26+
27+
common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
28+
common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
29+
30+
common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
4331
};

common/download.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -627,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
627627
if (!tag.empty()) {
628628
tags.push_back(tag);
629629
} else {
630-
tags = {"Q4_K_M", "Q4_0"};
630+
tags = {"Q4_K_M", "Q8_0"};
631631
}
632632

633633
for (const auto & t : tags) {

common/fit.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -856,7 +856,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
856856
ggml_backend_dev_memory(dev, &free, &total);
857857

858858
const size_t self = mb.model + mb.context + mb.compute;
859-
const size_t unaccounted = total - self - free;
859+
const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
860860

861861
table_data.push_back({
862862
template_gpu,
@@ -867,7 +867,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
867867
std::to_string(mb.model / MiB),
868868
std::to_string(mb.context / MiB),
869869
std::to_string(mb.compute / MiB),
870-
std::to_string(unaccounted / MiB)});
870+
std::to_string(unaccounted / static_cast<int64_t>(MiB))});
871871
}
872872

873873
// print memory breakdown for host:

convert_hf_to_gguf.py

Lines changed: 21 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,22 @@ def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Call
272272

273273
return tensors
274274

275+
@staticmethod
276+
def _scale_is_trivial(scale: Tensor) -> bool:
277+
return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
278+
279+
def _write_scale_tensor(self, scale_name: str, scale: Tensor):
280+
if not self._scale_is_trivial(scale):
281+
scale_f32 = scale.float().numpy().flatten()
282+
logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
283+
self.gguf_writer.add_tensor(scale_name, scale_f32)
284+
285+
def _write_scales_tensor(self, scale_name: str, scales: list[float]):
286+
if not np.allclose(scales, 1.0, atol=1e-6):
287+
scale_vals = np.array(scales, dtype=np.float32)
288+
logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
289+
self.gguf_writer.add_tensor(scale_name, scale_vals)
290+
275291
def dequant_model(self):
276292
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
277293
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T
494510
s = self.model_tensors[name]
495511
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
496512
tensors_to_remove.append(name)
497-
if name.endswith((".k_scale", ".v_scale")):
513+
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
498514
tensors_to_remove.append(name)
499515
elif quant_method is not None:
500516
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
602618
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
603619
return raw, [out_features, n_super * 64]
604620

605-
@staticmethod
606-
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
607-
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
608-
609621
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
610622
if "language_model." in name:
611623
name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor
616628
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
617629
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
618630

619-
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
620-
if not self._nvfp4_scale2_is_trivial(scale2):
621-
scale2_f32 = scale2.float().numpy().flatten()
622-
scale_name = new_name.replace(".weight", ".scale")
623-
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
624-
self.gguf_writer.add_tensor(scale_name, scale2_f32)
625-
626-
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
627-
if not self._nvfp4_scale2_is_trivial(input_scale):
628-
input_scale_f32 = input_scale.float().numpy().flatten()
629-
input_scale_name = new_name.replace(".weight", ".input_scale")
630-
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
631-
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
631+
self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
632+
self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
632633

633634
def _generate_nvfp4_tensors(self):
634635
# Per-layer expert merging to avoid holding all experts in memory
@@ -719,21 +720,11 @@ def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_s
719720
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
720721
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
721722

722-
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
723723
scales.sort(key=lambda x: x[0])
724-
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
725-
if not np.allclose(scale_vals, 1.0, atol=1e-6):
726-
scale_name = new_name.replace(".weight", ".scale")
727-
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
728-
self.gguf_writer.add_tensor(scale_name, scale_vals)
724+
self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
729725

730-
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
731726
input_scales.sort(key=lambda x: x[0])
732-
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
733-
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
734-
input_scale_name = new_name.replace(".weight", ".input_scale")
735-
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
736-
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
727+
self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
737728

738729
del experts, merged
739730

examples/debug/debug.cpp

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -202,10 +202,14 @@ static bool run(llama_context * ctx, const common_params & params) {
202202
print_tokenized_prompt(ctx, tokens, params.prompt);
203203

204204
if (params.save_logits) {
205-
output_data output {ctx, model, params};
206-
std::filesystem::path model_path{params.model.path};
207-
std::string model_name{model_path.stem().string()};
208-
save_output_data(output, model_name, params.logits_output_dir);
205+
try {
206+
output_data output {ctx, model, params};
207+
std::filesystem::path model_path{params.model.path};
208+
std::string model_name{model_path.stem().string()};
209+
save_output_data(output, model_name, params.logits_output_dir);
210+
} catch (const std::exception & e) {
211+
LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
212+
}
209213
}
210214

211215
return true;
@@ -223,7 +227,7 @@ int main(int argc, char ** argv) {
223227
llama_backend_init();
224228
llama_numa_init(params.numa);
225229

226-
std::optional<base_callback_data> cb_data;
230+
std::optional<common_debug_cb_user_data> cb_data;
227231
if (!params.save_logits) {
228232
cb_data.emplace(params, params.tensor_filter);
229233
}

examples/eval-callback/eval-callback.cpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
#include "debug.h"
44
#include "log.h"
55
#include "llama.h"
6-
#include "llama-cpp.h"
76

87
#include <clocale>
98
#include <string>
@@ -38,7 +37,7 @@ static bool run(llama_context * ctx, const common_params & params) {
3837
int main(int argc, char ** argv) {
3938
std::setlocale(LC_NUMERIC, "C");
4039

41-
base_callback_data cb_data;
40+
common_debug_cb_user_data cb_data;
4241

4342
common_params params;
4443

@@ -53,7 +52,7 @@ int main(int argc, char ** argv) {
5352

5453
// pass the callback to the backend scheduler
5554
// it will be executed for each node during the graph computation
56-
params.cb_eval = common_debug_cb_eval<false>;
55+
params.cb_eval = common_debug_cb_eval;
5756
params.cb_eval_user_data = &cb_data;
5857
params.warmup = false;
5958

ggml/src/ggml-cpu/amx/mmq.cpp

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
20052005
const int lda = KB * sizeof(TA);
20062006
//const int ldb = KB * sizeof(TB);
20072007

2008-
static thread_local packed_B_t Tile0[TILE_N * TILE_K];
2009-
static thread_local packed_B_t Tile1[TILE_N * TILE_K];
2010-
static thread_local int8_t Tile23[TILE_M * TILE_K];
2008+
alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
2009+
alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
2010+
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
20112011

2012-
static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
2013-
static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
2012+
alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
2013+
alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
20142014

20152015
// double buffering C to interleave avx512 and amx
20162016
int32_t * C_cur = TileC0;
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
21872187
const int m1 = std::max(M - TILE_M, 0);
21882188
//const int lda = KB * sizeof(TA);
21892189

2190-
static thread_local int8_t Tile0[TILE_N * TILE_K];
2191-
static thread_local int8_t Tile1[TILE_N * TILE_K];
2192-
static thread_local int8_t Tile23[TILE_M * TILE_K];
2190+
alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
2191+
alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
2192+
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
21932193

21942194
// mat mul result for each group
2195-
static thread_local int32_t Tile4[TILE_M * TILE_N];
2196-
static thread_local int32_t Tile5[TILE_M * TILE_N];
2197-
static thread_local int32_t Tile6[TILE_M * TILE_N];
2198-
static thread_local int32_t Tile7[TILE_M * TILE_N];
2195+
alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
2196+
alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
2197+
alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
2198+
alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];
21992199

22002200
// sum of each QK_K block, contains 8 groups, int32
2201-
static thread_local int32_t Sumi4[TILE_M * TILE_N];
2202-
static thread_local int32_t Sumi5[TILE_M * TILE_N];
2203-
static thread_local int32_t Sumi6[TILE_M * TILE_N];
2204-
static thread_local int32_t Sumi7[TILE_M * TILE_N];
2201+
alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
2202+
alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
2203+
alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
2204+
alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];
22052205

22062206
const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
22072207
for (int i = 0; i < KB; ++i) {

0 commit comments

Comments (0)