Skip to content

Commit 945a610

Browse files
Merge pull request #510 from janhq/update-dev-from-master-2026-05-09-01-03
Sync master with upstream release b9082
2 parents 3bfd8b3 + b46812d commit 945a610

88 files changed

Lines changed: 6618 additions & 4685 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

common/chat-auto-parser-generator.cpp

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -369,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
369369
arguments.name_suffix) +
370370
arguments.value_prefix +
371371
(schema_info.resolves_to_string(param_schema) ?
372-
p.tool_arg_string_value(p.schema(until_suffix,
373-
"tool-" + name + "-arg-" + param_name + "-schema",
374-
param_schema, true)) :
372+
p.tool_arg_string_value(until_suffix) :
375373
p.tool_arg_json_value(p.schema(
376374
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
377375
p.space()) +

common/reasoning-budget.cpp

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -158,8 +158,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
158158
for (size_t i = 0; i < cur_p->size; i++) {
159159
if (cur_p->data[i].id != forced) {
160160
cur_p->data[i].logit = -INFINITY;
161-
} else {
162-
cur_p->data[i].logit = +INFINITY; // force the token
163161
}
164162
}
165163
}

convert_hf_to_gguf.py

Lines changed: 49 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -710,15 +710,15 @@ def _generate_nvfp4_tensors(self):
710710
self._repack_nvfp4(name, weight, scale, scale2, input_scale)
711711

712712
# Flush any remaining experts (fallback if n_experts was unknown)
713-
for bid, proj_type in expert_blocks.keys():
713+
for bid, proj_type in list(expert_blocks.keys()):
714714
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
715715

716716
# Remove consumed tensors so get_tensors/modify_tensors won't see them
717717
for name in consumed:
718718
self.model_tensors.pop(name, None)
719719

720720
# Remove any remaining unused auxiliary tensors
721-
for name in self.model_tensors.keys():
721+
for name in list(self.model_tensors.keys()):
722722
if name.endswith((".k_scale", ".v_scale")):
723723
del self.model_tensors[name]
724724

@@ -7988,13 +7988,37 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
79887988
rope_freqs_full = torch.tensor(values, dtype=torch.float32)
79897989
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full)
79907990

7991+
def _generate_nvfp4_tensors(self):
7992+
# Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales
7993+
# each expert's contribution. It's mathematically equivalent to a per-expert
7994+
# scalar on the down_proj output, which is exactly where ffn_down_exps_s is
7995+
# applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the
7996+
# existing NVFP4 path produces the right scales.
7997+
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
7998+
for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]:
7999+
bid_match = re.search(r"\.layers\.(\d+)\.", name)
8000+
if bid_match is None:
8001+
continue
8002+
bid = bid_match.group(1)
8003+
prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")]
8004+
w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)]
8005+
present = [w2 in self.model_tensors for w2 in w2_targets]
8006+
if not any(present):
8007+
continue
8008+
assert all(present), f"layer {bid}: partial NVFP4 quantization across experts"
8009+
r = self.model_tensors.pop(name)
8010+
for e, w2 in enumerate(w2_targets):
8011+
s = self.model_tensors[w2]
8012+
self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i]
8013+
super()._generate_nvfp4_tensors()
8014+
79918015
@classmethod
79928016
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
79938017
name, gen = item
79948018

79958019
if name.endswith("per_dim_scale") or name.endswith("layer_scalar"):
79968020
name = name + ".weight"
7997-
if ".experts." in name and not name.endswith(".weight"):
8021+
if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
79988022
name += ".weight"
79998023

80008024
return super().filter_tensors((name, gen))
@@ -13684,6 +13708,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1368413708
yield from super().modify_tensors(data_torch, name, bid)
1368513709

1368613710

13711+
@ModelBase.register("Sarashina2VisionForCausalLM")
13712+
class Sarashina2VLTextModel(LlamaModel):
13713+
model_arch = gguf.MODEL_ARCH.LLAMA
13714+
13715+
@classmethod
13716+
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
13717+
name, gen = item
13718+
if name.startswith("llm."):
13719+
name = name.replace("llm.", "", 1)
13720+
elif name.startswith("norm."):
13721+
return None
13722+
return super().filter_tensors((name, gen))
13723+
13724+
13725+
@ModelBase.register("Sarashina2VisionForCausalLM")
13726+
class Sarashina2VLVisionModel(Qwen2VLVisionModel):
13727+
def __init__(self, *args, **kwargs):
13728+
super().__init__(*args, **kwargs)
13729+
self.global_config['model_type'] = "qwen2_vl"
13730+
13731+
1368713732
###### CONVERSION LOGIC ######
1368813733

1368913734

@@ -13940,7 +13985,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
1394013985
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
1394113986
# For text conversion we route to a dedicated text-only class.
1394213987
# TODO: refactor this later to avoid adding exception here
13943-
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
13988+
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
1394413989
return arch
1394513990

1394613991
# if "architectures" is found in the sub-config, use that instead

examples/sycl/start-svr.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -111,14 +111,14 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
111111
echo "Use $GGML_SYCL_DEVICE as main GPU"
112112
#use single GPU only
113113
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
114-
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
114+
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
115115
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
116116
else
117117
echo "Use all Intel GPUs, including iGPU & dGPU"
118118
GPUS_SETTING="-sm ${SPLIT_MODE}"
119119
fi
120120

121-
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
121+
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000"
122122
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
123123

124124

ggml/include/ggml-backend.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -169,7 +169,7 @@ extern "C" {
169169
// device type
170170
enum ggml_backend_dev_type type;
171171
// device id
172-
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
172+
// for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
173173
// if the id is unknown, this should be NULL
174174
const char * device_id;
175175
// device capabilities

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -965,7 +965,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
965965
}
966966
if (sched->debug > 1) {
967967
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
968-
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
968+
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_desc(node), node->name,
969969
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
970970
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
971971
for (int j = 0; j < GGML_MAX_SRC; j++) {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@
3939
#include "ggml-cuda/rope.cuh"
4040
#include "ggml-cuda/roll.cuh"
4141
#include "ggml-cuda/scale.cuh"
42+
#include "ggml-cuda/snake.cuh"
4243
#include "ggml-cuda/softcap.cuh"
4344
#include "ggml-cuda/softmax.cuh"
4445
#include "ggml-cuda/ssm-conv.cuh"
@@ -3757,6 +3758,35 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
37573758
return 2;
37583759
}
37593760

3761+
// Snake activation: y = x + sin(a*x)^2 * inv_b
3762+
// Naive 5-op decomposition emitted by frontends: mul -> sin -> sqr -> mul -> add
3763+
if (ggml_can_fuse_subgraph(cgraph, i,
3764+
{ GGML_OP_MUL, GGML_OP_SIN, GGML_OP_SQR, GGML_OP_MUL, GGML_OP_ADD },
3765+
{ i + 4 })) {
3766+
const ggml_tensor * mul0 = cgraph->nodes[i];
3767+
const ggml_tensor * sqr = cgraph->nodes[i + 2];
3768+
const ggml_tensor * mul1 = cgraph->nodes[i + 3];
3769+
ggml_tensor * add = cgraph->nodes[i + 4];
3770+
3771+
// x carries the full activation shape, a is the broadcast operand
3772+
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
3773+
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
3774+
3775+
// mul1 reads sqr and inv_b in either operand order
3776+
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
3777+
3778+
// closure check: the trailing add must read the same x as the leading mul
3779+
const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
3780+
3781+
const bool type_ok = (x->type == GGML_TYPE_F32 || x->type == GGML_TYPE_F16 || x->type == GGML_TYPE_BF16);
3782+
const bool shape_ok = ggml_are_same_shape(a, inv_b) && a->ne[0] == 1 && a->ne[1] == x->ne[1];
3783+
3784+
if (type_ok && shape_ok && x_in_add == x && add->type == x->type) {
3785+
ggml_cuda_op_snake_fused(*cuda_ctx, x, a, inv_b, add);
3786+
return 4;
3787+
}
3788+
}
3789+
37603790
// multi-(add or mul)
37613791
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
37623792
int n_fuse = 0;
@@ -5434,6 +5464,9 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
54345464
char pci_bus_id[32] = {};
54355465
CUDA_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i));
54365466
dev_ctx->pci_bus_id = pci_bus_id;
5467+
for (char & c : dev_ctx->pci_bus_id) {
5468+
c = std::tolower(c);
5469+
}
54375470
dev_ctx->op_offload_min_batch_size = min_batch_size;
54385471

54395472
ggml_backend_dev_t dev = new ggml_backend_device {

ggml/src/ggml-cuda/snake.cu

Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
1+
#include "snake.cuh"
2+
#include "convert.cuh"
3+
4+
// Fused Snake activation: y = x + sin^2(a * x) * inv_b
5+
// x: [T, C] (T contiguous), a: [1, C], inv_b: [1, C]
6+
// Supports F32, F16, BF16 data with F32 compute.
7+
8+
template <typename T>
9+
static __global__ void snake_kernel(
10+
const T * __restrict__ x,
11+
const float * __restrict__ a,
12+
const float * __restrict__ inv_b,
13+
T * __restrict__ dst,
14+
const int total,
15+
const uint3 T_len_fastdiv) {
16+
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
17+
if (idx >= total) return;
18+
19+
const int c = (int) fastdiv((uint32_t) idx, T_len_fastdiv);
20+
21+
const float xi = ggml_cuda_cast<float>(x[idx]);
22+
const float s = sinf(a[c] * xi);
23+
dst[idx] = ggml_cuda_cast<T>(xi + s * s * inv_b[c]);
24+
}
25+
26+
// Internal launcher with explicit x/a/inv_b/dst tensors.
27+
// Shared by the public op (reads dst->src) and the fusion path (explicit args).
28+
static void launch_snake(ggml_backend_cuda_context & ctx,
29+
const ggml_tensor * x,
30+
const ggml_tensor * a,
31+
const ggml_tensor * inv_b,
32+
ggml_tensor * dst) {
33+
const float * a_d = (const float *)a->data;
34+
const float * inv_b_d = (const float *)inv_b->data;
35+
36+
const int T = (int)x->ne[0];
37+
const int C = (int)x->ne[1];
38+
const int total = T * C;
39+
const uint3 T_len_fastdiv = init_fastdiv_values((uint64_t) T);
40+
41+
const int block_size = 256;
42+
const int grid_size = (total + block_size - 1) / block_size;
43+
44+
cudaStream_t stream = ctx.stream();
45+
46+
switch (x->type) {
47+
case GGML_TYPE_F32: {
48+
snake_kernel<<<grid_size, block_size, 0, stream>>>(
49+
(const float *)x->data, a_d, inv_b_d, (float *)dst->data, total, T_len_fastdiv);
50+
} break;
51+
case GGML_TYPE_F16: {
52+
snake_kernel<<<grid_size, block_size, 0, stream>>>(
53+
(const half *)x->data, a_d, inv_b_d, (half *)dst->data, total, T_len_fastdiv);
54+
} break;
55+
case GGML_TYPE_BF16: {
56+
snake_kernel<<<grid_size, block_size, 0, stream>>>(
57+
(const nv_bfloat16 *)x->data, a_d, inv_b_d, (nv_bfloat16 *)dst->data, total, T_len_fastdiv);
58+
} break;
59+
default:
60+
GGML_ABORT("snake: unsupported type");
61+
}
62+
}
63+
64+
// Fusion entry: caller supplies x/a/inv_b explicitly from the matched
65+
// mul -> sin -> sqr -> mul -> add pattern. The dst is the trailing add output.
66+
void ggml_cuda_op_snake_fused(ggml_backend_cuda_context & ctx,
67+
const ggml_tensor * x,
68+
const ggml_tensor * a,
69+
const ggml_tensor * inv_b,
70+
ggml_tensor * dst) {
71+
launch_snake(ctx, x, a, inv_b, dst);
72+
}

ggml/src/ggml-cuda/snake.cuh

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
#include "common.cuh"
2+
3+
// Fusion entry point. Caller supplies x/a/inv_b explicitly.
4+
void ggml_cuda_op_snake_fused(ggml_backend_cuda_context & ctx,
5+
const ggml_tensor * x,
6+
const ggml_tensor * a,
7+
const ggml_tensor * inv_b,
8+
ggml_tensor * dst);

ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2420,8 +2420,8 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
24202420
return false;
24212421
}
24222422

2423-
// TODO: add support for non-contiguous elements within a row
2424-
if (!ggml_is_contiguous_rows(src0) || !ggml_is_contiguous_rows(dst)) {
2423+
// dst must be contiguous; src0 may be non-contiguous
2424+
if (!ggml_is_contiguous(dst)) {
24252425
return false;
24262426
}
24272427

@@ -2791,6 +2791,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
27912791
case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
27922792
case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
27932793
case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
2794+
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
27942795
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
27952796
case GGML_OP_SCALE: return HTP_OP_SCALE;
27962797
case GGML_OP_SQR: return HTP_OP_SQR;
@@ -3253,6 +3254,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
32533254
supp = ggml_hexagon_supported_add_id(sess, op);
32543255
break;
32553256

3257+
case GGML_OP_L2_NORM:
3258+
supp = ggml_hexagon_supported_unary(sess, op);
3259+
break;
3260+
32563261
case GGML_OP_RMS_NORM:
32573262
case GGML_OP_SCALE:
32583263
supp = ggml_hexagon_supported_unary(sess, op);

0 commit comments

Comments
 (0)