Skip to content

Commit d3aeb79

Browse files
ehsan6sha and claude committed
runtime: update ctypes wrappers + init flow for RKLLM v1.2.3 ABI
The v1.1.4 -> v1.2.3 runtime bump (committed earlier today) made the existing struct layout incompatible with rkllm_init. The lab device hit "E rkllm: The n_batch must be between 1 and 100, but got 0" on the first call because RKLLMExtendParam in our Python wrapper still had the old "base_domain_id + 112 bytes reserved" layout. v1.2.3 split those reserved bytes into n_batch + several other required fields.

This commit ports the ctypes definitions to the v1.2.3 ABI as documented in rkllm-runtime/Linux/librkllm_api/include/rkllm.h at the release-v1.2.3 tag.

Struct changes:
- RKLLMExtendParam: added embed_flash, enabled_cpus_num, enabled_cpus_mask, n_batch, use_cross_attn; reserved shrunk 112 -> 104 bytes
- RKLLMParam: added n_keep between top_k and top_p
- RKLLMInput: restructured — role + enable_thinking + input_type now prefix the union (was just input_mode)
- RKLLMInferParam: added keep_history at the end
- RKLLMResult: added token_id + logits + perf fields; dropped legacy `size` (not in v1.2.3 C struct)
- RKLLMMultiModalInput: added n_image, image_width, image_height

Callback signature: returns int (was void in v1.1.4). _on_token now ends with `return 0` and wraps the queue puts in try/except so the callback never raises into the C side.
init_model flow updates:
- zero the full RKLLMParam via ctypes.memset before populating (defensive; with the new fields, any uninitialized bytes could be interpreted as garbage)
- set n_keep = -1 (runtime default — typically keeps the system-prompt portion of KV cache when context shifts)
- set extend_param.embed_flash = 1 (lower RAM)
- set extend_param.enabled_cpus_num = 4 and enabled_cpus_mask targeting the RK3588 big cores (4-7, Cortex-A76) for best per-token latency
- set extend_param.n_batch = 1 (single-sample; v1.2.3 rejects 0)
- set extend_param.use_cross_attn = 0
- call rkllm_set_chat_template(handle, "", "", "") AFTER init to make the runtime pass our pre-formatted ChatML through verbatim (we build the full envelope, including the <think>\n prefix, ourselves in _build_chat_prompt; without this override the runtime would double-wrap with its built-in Qwen 3 template)

generate flow updates:
- zero RKLLMInput via memset
- set role = "user" (default for our pre-formatted ChatML path)
- set enable_thinking = False (we handle thinking-mode injection in _build_chat_prompt; setting True would re-inject and double-think)
- set input_type = RKLLM_INPUT_PROMPT (was input_mode in v1.1.4)
- set infer.keep_history = 0 (we manage multi-turn history ourselves per turn; let the runtime discard its KV history between calls)

Existing unit tests 55/55 still pass — struct field additions are backward-compatible at the Python level since tests don't exercise the ctypes layout directly.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 6d48a48 commit d3aeb79

1 file changed

Lines changed: 133 additions & 22 deletions

File tree

src/runtime/rkllm_runtime.py

Lines changed: 133 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -62,13 +62,33 @@
6262

6363

6464
# ---------------------------------------------------------------------------
65-
# ctypes structures
65+
# ctypes structures — RKLLM v1.2.3 ABI
6666
# ---------------------------------------------------------------------------
67+
#
68+
# These mirror the C structs in rkllm-runtime/Linux/librkllm_api/include/rkllm.h
69+
# at the release-v1.2.3 tag. The Rockchip runtime is strict about struct
70+
# layout — any drift here causes silent corruption OR explicit init errors
71+
# ("The n_batch must be between 1 and 100, but got 0" is the canary).
72+
#
73+
# v1.1.4 → v1.2.3 ABI changes captured here:
74+
# RKLLMExtendParam: added embed_flash, enabled_cpus_num, enabled_cpus_mask,
75+
# n_batch, use_cross_attn; reserved shrunk 112 → 104
76+
# RKLLMParam: added n_keep between top_k and top_p
77+
# RKLLMInput: restructured — role + enable_thinking + input_type
78+
# prefixed BEFORE the union (previously just input_mode)
79+
# RKLLMInferParam: added keep_history
80+
# RKLLMResult: added token_id + logits + perf fields; dropped legacy `size`
81+
# Callback: returns int now (was void)
6782

6883
class RKLLMExtendParam(ctypes.Structure):
6984
_fields_ = [
7085
("base_domain_id", ctypes.c_int32),
71-
("reserved", ctypes.c_uint8 * 112),
86+
("embed_flash", ctypes.c_int8),
87+
("enabled_cpus_num", ctypes.c_int8),
88+
("enabled_cpus_mask", ctypes.c_uint32),
89+
("n_batch", ctypes.c_uint8),
90+
("use_cross_attn", ctypes.c_int8),
91+
("reserved", ctypes.c_uint8 * 104),
7292
]
7393

7494

@@ -78,6 +98,7 @@ class RKLLMParam(ctypes.Structure):
7898
("max_context_len", ctypes.c_int32),
7999
("max_new_tokens", ctypes.c_int32),
80100
("top_k", ctypes.c_int32),
101+
("n_keep", ctypes.c_int32), # NEW in v1.2.3
81102
("top_p", ctypes.c_float),
82103
("temperature", ctypes.c_float),
83104
("repeat_penalty", ctypes.c_float),
@@ -109,11 +130,15 @@ class RKLLMTokenInput(ctypes.Structure):
109130
]
110131

111132

112-
class RKLLMMultiModelInput(ctypes.Structure):
133+
class RKLLMMultiModalInput(ctypes.Structure):
134+
"""v1.2.3 added image_width, image_height + n_image fields."""
113135
_fields_ = [
114136
("prompt", ctypes.c_char_p),
115137
("image_embed", ctypes.POINTER(ctypes.c_float)),
116138
("n_image_tokens", ctypes.c_size_t),
139+
("n_image", ctypes.c_size_t),
140+
("image_width", ctypes.c_size_t),
141+
("image_height", ctypes.c_size_t),
117142
]
118143

119144

@@ -122,13 +147,17 @@ class RKLLMInputUnion(ctypes.Union):
122147
("prompt_input", ctypes.c_char_p),
123148
("embed_input", RKLLMEmbedInput),
124149
("token_input", RKLLMTokenInput),
125-
("multimodal_input", RKLLMMultiModelInput),
150+
("multimodal_input", RKLLMMultiModalInput),
126151
]
127152

128153

129154
class RKLLMInput(ctypes.Structure):
155+
"""v1.2.3 restructured: role + enable_thinking + input_type now
156+
prefix the union. Previously: just input_mode (int)."""
130157
_fields_ = [
131-
("input_mode", ctypes.c_int),
158+
("role", ctypes.c_char_p),
159+
("enable_thinking", ctypes.c_bool),
160+
("input_type", ctypes.c_int), # RKLLMInputType enum
132161
("input_data", RKLLMInputUnion),
133162
]
134163

@@ -149,6 +178,7 @@ class RKLLMInferParam(ctypes.Structure):
149178
("mode", ctypes.c_int),
150179
("lora_params", ctypes.POINTER(RKLLMLoraParam)),
151180
("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
181+
("keep_history", ctypes.c_int), # NEW in v1.2.3
152182
]
153183

154184

@@ -160,11 +190,33 @@ class RKLLMResultLastHiddenLayer(ctypes.Structure):
160190
]
161191

162192

193+
class RKLLMResultLogits(ctypes.Structure):
194+
_fields_ = [
195+
("logits", ctypes.POINTER(ctypes.c_float)),
196+
("vocab_size", ctypes.c_int),
197+
("num_tokens", ctypes.c_int),
198+
]
199+
200+
201+
class RKLLMPerfStat(ctypes.Structure):
202+
_fields_ = [
203+
("prefill_time_ms", ctypes.c_float),
204+
("prefill_tokens", ctypes.c_int),
205+
("generate_time_ms", ctypes.c_float),
206+
("generate_tokens", ctypes.c_int),
207+
("memory_usage_mb", ctypes.c_float),
208+
]
209+
210+
163211
class RKLLMResult(ctypes.Structure):
212+
"""v1.2.3 added token_id, logits, perf fields. Drop the legacy
213+
`size` field (no longer in C struct)."""
164214
_fields_ = [
165215
("text", ctypes.c_char_p),
166-
("size", ctypes.c_int),
216+
("token_id", ctypes.c_int32),
167217
("last_hidden_layer", RKLLMResultLastHiddenLayer),
218+
("logits", RKLLMResultLogits),
219+
("perf", RKLLMPerfStat),
168220
]
169221

170222

@@ -212,9 +264,10 @@ class RKLLMLoadError(RuntimeError):
212264
pass
213265

214266

215-
# C callback signature
267+
# C callback signature — v1.2.3 returns int (was void in v1.1.4).
268+
# The callback MUST return 0 to indicate normal continuation.
216269
_CALLBACK_TYPE = ctypes.CFUNCTYPE(
217-
None,
270+
ctypes.c_int,
218271
ctypes.POINTER(RKLLMResult),
219272
ctypes.c_void_p,
220273
ctypes.c_int,
@@ -268,20 +321,25 @@ def _wire_symbols(self) -> None:
268321

269322
# Callback runs on a C-spawned thread; ctypes acquires the GIL for us
270323
# before invoking. Push the token text + state to the queue; the
271-
# generate() caller drains.
324+
# generate() caller drains. v1.2.3 callback MUST return 0 (was void
325+
# in v1.1.4).
272326
def _on_token(self, result_ptr, userdata, state):
273-
if state == RKLLM_RUN_NORMAL or state == RKLLM_RUN_WAITING:
274-
try:
275-
text_bytes = result_ptr.contents.text
276-
if text_bytes:
277-
self._token_queue.put(("token", text_bytes))
278-
except Exception: # noqa: BLE001
279-
# Never raise from the C callback - would corrupt rkllm state
280-
pass
281-
elif state == RKLLM_RUN_FINISH:
282-
self._token_queue.put(("finish", None))
283-
elif state == RKLLM_RUN_ERROR:
284-
self._token_queue.put(("error", None))
327+
try:
328+
if state == RKLLM_RUN_NORMAL or state == RKLLM_RUN_WAITING:
329+
try:
330+
text_bytes = result_ptr.contents.text
331+
if text_bytes:
332+
self._token_queue.put(("token", text_bytes))
333+
except Exception: # noqa: BLE001
334+
# Never raise from the C callback - would corrupt rkllm state
335+
pass
336+
elif state == RKLLM_RUN_FINISH:
337+
self._token_queue.put(("finish", None))
338+
elif state == RKLLM_RUN_ERROR:
339+
self._token_queue.put(("error", None))
340+
except Exception: # noqa: BLE001
341+
pass
342+
return 0
285343

286344
def init_model(
287345
self,
@@ -298,21 +356,39 @@ def init_model(
298356
top_p: float = 0.8,
299357
) -> None:
300358
p = RKLLMParam()
359+
ctypes.memset(ctypes.byref(p), 0, ctypes.sizeof(p))
301360
p.model_path = self.model_path.encode("utf-8")
302361
p.max_context_len = max_context_len
303362
p.max_new_tokens = max_new_tokens
304363
p.top_k = top_k
364+
# n_keep: number of KV cache tokens to keep at the start when the
365+
# context window shifts. -1 = use runtime default (typically the
366+
# system-prompt portion). New required field in v1.2.3.
367+
p.n_keep = -1
305368
p.top_p = top_p
306369
p.temperature = temperature
307370
p.repeat_penalty = 1.1
308371
p.frequency_penalty = 0.0
309372
p.presence_penalty = 0.0
373+
p.mirostat = 0
374+
p.mirostat_tau = 5.0
375+
p.mirostat_eta = 0.1
310376
p.skip_special_token = True
311377
p.is_async = False
312378
p.img_start = b""
313379
p.img_end = b""
314380
p.img_content = b""
381+
# Extend params — v1.2.3 added required NPU configuration here.
315382
p.extend_param.base_domain_id = 0
383+
p.extend_param.embed_flash = 1 # embed from flash (lower RAM)
384+
p.extend_param.enabled_cpus_num = 4
385+
# RK3588 big cores are 4-7 (A76); little cores 0-3 (A55). Pin
386+
# inference to big cores for best per-token latency.
387+
p.extend_param.enabled_cpus_mask = (1 << 4) | (1 << 5) | (1 << 6) | (1 << 7)
388+
# n_batch=1 — single-sample inference. v1.2.3 rejects 0 explicitly
389+
# ("The n_batch must be between 1 and 100, but got 0").
390+
p.extend_param.n_batch = 1
391+
p.extend_param.use_cross_attn = 0
316392

317393
rc = self._lib.rkllm_init(
318394
ctypes.byref(self._handle),
@@ -322,6 +398,29 @@ def init_model(
322398
if rc != 0:
323399
raise RKLLMLoadError(f"rkllm_init returned {rc}")
324400

401+
# Override the runtime's built-in Qwen 3 chat template with empty
402+
# strings — our Python code already builds the full ChatML
403+
# (system+user+assistant+tool envelope, including the `<think>\n`
404+
# prefix injection for thinking mode). With empty system/prefix/
405+
# postfix, the runtime passes our pre-formatted prompt through
406+
# verbatim and does NOT double-wrap or double-inject `<think>`.
407+
try:
408+
self._lib.rkllm_set_chat_template.restype = ctypes.c_int
409+
self._lib.rkllm_set_chat_template.argtypes = [
410+
ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p,
411+
]
412+
rc = self._lib.rkllm_set_chat_template(
413+
self._handle, b"", b"", b"",
414+
)
415+
if rc != 0:
416+
logger.warning("rkllm_set_chat_template returned %d "
417+
"(non-fatal; runtime template stays default)", rc)
418+
except AttributeError:
419+
# Older runtime without the symbol — would only happen if the
420+
# Dockerfile didn't get the v1.2.3 .so. Logged + continue.
421+
logger.warning("librkllmrt.so does not export rkllm_set_chat_template; "
422+
"thinking-mode wrapping may double-wrap")
423+
325424
def generate(self, prompt: str, timeout_s: float = 90.0) -> str:
326425
"""Blocking. Run inference for `prompt`, drain the callback queue,
327426
return the full decoded text. Raises RKLLMLoadError on rkllm_run
@@ -335,11 +434,23 @@ def generate(self, prompt: str, timeout_s: float = 90.0) -> str:
335434
break
336435

337436
inp = RKLLMInput()
338-
inp.input_mode = RKLLM_INPUT_PROMPT
437+
ctypes.memset(ctypes.byref(inp), 0, ctypes.sizeof(inp))
438+
# v1.2.3 restructured RKLLMInput. We use "user" role for the
439+
# full ChatML envelope (since rkllm_set_chat_template was
440+
# called with empty wrappers during init_model, the runtime
441+
# passes our prompt through verbatim).
442+
inp.role = b"user"
443+
# We handle `<think>\n` prefix injection ourselves in
444+
# _build_chat_prompt — keep this False to avoid double-think.
445+
inp.enable_thinking = False
446+
inp.input_type = RKLLM_INPUT_PROMPT
339447
inp.input_data.prompt_input = prompt.encode("utf-8")
340448
infer = RKLLMInferParam()
341449
ctypes.memset(ctypes.byref(infer), 0, ctypes.sizeof(infer))
342450
infer.mode = RKLLM_INFER_GENERATE
451+
# We manage multi-turn history ourselves (rebuild full prompt
452+
# per turn) → tell the runtime to discard its own KV history.
453+
infer.keep_history = 0
343454

344455
# rkllm_run blocks (is_async=False). The callback fires inline
345456
# for each token. When the model finishes, the callback is

0 commit comments

Comments
 (0)