Skip to content

Commit 8ffd459

Browse files
michalharakal and claude
committed
feat(kllama-wasm): swap browser CLI to DSL path
Phase 5b consumer migration. Mirrors #122 / #123 / #125 for the wasm browser entry point. - Replaces `loadLlamaRuntimeWeights` + `LlamaRuntime` + `CpuAttentionBackend` with `DecoderGgufWeightLoader.loadToMap` (sequential `Source` variant) → `LlamaNetworkLoader.fromWeights` → `OptimizedLLMRuntime` DIRECT mode. - Wasm has no `MemorySegment`, so the converter step is skipped — the loader uses `QuantPolicy.DEQUANTIZE_TO_FP32` (no change from before; packed Q4/Q8 don't have a wasm-side fast path). - Tokenizer load via `GGUFTokenizer.fromSource(source)` is unchanged (sequential Source-friendly; not migrated to upstream byte-BPE in this PR — wasm browser is FP32 + Llama by default, byte-BPE is a separate concern). - Return type loosened from `Pair<LlamaRuntimeInterface<*>, Tokenizer>` to `Pair<InferenceRuntime<FP32>, Tokenizer>`. `:llm-runtime:kllama:compileKotlinWasmJs` clean. JVM and core tests unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a70ef93 commit 8ffd459

1 file changed

Lines changed: 33 additions & 22 deletions

File tree

  • llm-runtime/kllama/src/wasmJsMain/kotlin/sk/ainet/apps/kllama/browser

llm-runtime/kllama/src/wasmJsMain/kotlin/sk/ainet/apps/kllama/browser/Main.kt

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,25 @@ import kotlinx.browser.window
55
import kotlinx.coroutines.MainScope
66
import kotlinx.coroutines.await
77
import kotlinx.coroutines.launch
8-
import kotlinx.io.Source
9-
import kotlinx.io.buffered
108
import kotlinx.io.Buffer
119
import kotlinx.io.RawSource
10+
import kotlinx.io.Source
11+
import kotlinx.io.buffered
1212
import org.khronos.webgl.ArrayBuffer
1313
import org.khronos.webgl.DataView
1414
import org.w3c.fetch.Response
1515
import kotlin.js.Promise
16-
import sk.ainet.models.llama.LlamaRuntime
17-
import sk.ainet.models.llama.LlamaRuntimeInterface
1816
import sk.ainet.apps.kllama.GGUFTokenizer
17+
import sk.ainet.apps.llm.InferenceRuntime
18+
import sk.ainet.apps.llm.OptimizedLLMMode
19+
import sk.ainet.apps.llm.OptimizedLLMRuntime
20+
import sk.ainet.apps.llm.Tokenizer
21+
import sk.ainet.apps.llm.generate
1922
import sk.ainet.context.DirectCpuExecutionContext
2023
import sk.ainet.io.model.QuantPolicy
21-
import sk.ainet.models.llama.loadLlamaRuntimeWeights
22-
import sk.ainet.apps.llm.Tokenizer
24+
import sk.ainet.lang.types.FP32
25+
import sk.ainet.models.llama.DecoderGgufWeightLoader
26+
import sk.ainet.models.llama.LlamaNetworkLoader
2327

2428
private val scope = MainScope()
2529

@@ -47,18 +51,18 @@ fun main() {
4751
}
4852
}
4953

50-
// Run once on load
5154
runDemo()
52-
// Allow reruns
5355
runButton?.addEventListener("click", { scope.launch { runDemo() } })
5456
}
5557
}
5658

5759
@Suppress("UNCHECKED_CAST")
58-
private suspend fun loadRuntimeAndTokenizer(path: String): Pair<LlamaRuntimeInterface<*>, Tokenizer> {
60+
private suspend fun loadRuntimeAndTokenizer(path: String): Pair<InferenceRuntime<FP32>, Tokenizer> {
5961
val resp: Response = (window.fetch(path) as Promise<Response>).await()
6062
if (!resp.ok) error("Failed to fetch model: ${resp.statusText}")
61-
// On Wasm, use arrayBuffer() and feed bytes into a kotlinx-io Buffer as Source
63+
// On Wasm: arrayBuffer() → kotlinx-io Buffer → Source. Two sources are
64+
// built from the same bytes (one for weights, one for the tokenizer)
65+
// because each consumer drains its source.
6266
val buf: ArrayBuffer = (resp.arrayBuffer() as Promise<ArrayBuffer>).await()
6367
val view = DataView(buf)
6468
val length = view.byteLength
@@ -67,21 +71,28 @@ private suspend fun loadRuntimeAndTokenizer(path: String): Pair<LlamaRuntimeInte
6771
bytes[i] = view.getUint8(i).toByte()
6872
}
6973

70-
// Create source for loading weights
71-
val buffer1 = Buffer().apply { write(bytes) }
72-
val source1: Source = (buffer1 as RawSource).buffered()
7374
val ctx = DirectCpuExecutionContext()
74-
val weights = loadLlamaRuntimeWeights(
75+
76+
// Weights via the DSL path. Wasm has no MemorySegment, so we use
77+
// QuantPolicy.DEQUANTIZE_TO_FP32 — packed Q4/Q8 don't have a
78+
// wasm-side fast path anyway.
79+
val weightSource = (Buffer().apply { write(bytes) } as RawSource).buffered()
80+
val weights = DecoderGgufWeightLoader(
81+
sourceProvider = { weightSource },
82+
quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32,
83+
).loadToMap<FP32, Float>(ctx)
84+
85+
val model = LlamaNetworkLoader.fromWeights(weights)
86+
val runtime = OptimizedLLMRuntime(
87+
model = model,
7588
ctx = ctx,
76-
sourceProvider = { source1 },
77-
quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32
89+
mode = OptimizedLLMMode.DIRECT,
90+
dtype = FP32::class,
91+
bos = weights.metadata.bosTokenId,
7892
)
7993

80-
// Create source for loading tokenizer (need fresh buffer as source is consumed)
81-
val buffer2 = Buffer().apply { write(bytes) }
82-
val source2: Source = (buffer2 as RawSource).buffered()
83-
val tokenizer = GGUFTokenizer.fromSource(source2)
94+
val tokenizerSource: Source = (Buffer().apply { write(bytes) } as RawSource).buffered()
95+
val tokenizer = GGUFTokenizer.fromSource(tokenizerSource)
8496

85-
val backend = sk.ainet.apps.kllama.CpuAttentionBackend(ctx, weights, sk.ainet.lang.types.FP32::class)
86-
return LlamaRuntime(ctx, weights, backend, sk.ainet.lang.types.FP32::class) to tokenizer
97+
return runtime to tokenizer
8798
}

0 commit comments

Comments
 (0)