@@ -5,21 +5,25 @@ import kotlinx.browser.window
 import kotlinx.coroutines.MainScope
 import kotlinx.coroutines.await
 import kotlinx.coroutines.launch
-import kotlinx.io.Source
-import kotlinx.io.buffered
 import kotlinx.io.Buffer
 import kotlinx.io.RawSource
+import kotlinx.io.Source
+import kotlinx.io.buffered
 import org.khronos.webgl.ArrayBuffer
 import org.khronos.webgl.DataView
 import org.w3c.fetch.Response
 import kotlin.js.Promise
-import sk.ainet.models.llama.LlamaRuntime
-import sk.ainet.models.llama.LlamaRuntimeInterface
 import sk.ainet.apps.kllama.GGUFTokenizer
+import sk.ainet.apps.llm.InferenceRuntime
+import sk.ainet.apps.llm.OptimizedLLMMode
+import sk.ainet.apps.llm.OptimizedLLMRuntime
+import sk.ainet.apps.llm.Tokenizer
+import sk.ainet.apps.llm.generate
 import sk.ainet.context.DirectCpuExecutionContext
 import sk.ainet.io.model.QuantPolicy
-import sk.ainet.models.llama.loadLlamaRuntimeWeights
-import sk.ainet.apps.llm.Tokenizer
+import sk.ainet.lang.types.FP32
+import sk.ainet.models.llama.DecoderGgufWeightLoader
+import sk.ainet.models.llama.LlamaNetworkLoader
 
 private val scope = MainScope()
 
@@ -47,18 +51,18 @@ fun main() {
             }
         }
 
-        // Run once on load
         runDemo()
-        // Allow reruns
         runButton?.addEventListener("click", { scope.launch { runDemo() } })
     }
 }
 
 @Suppress("UNCHECKED_CAST")
-private suspend fun loadRuntimeAndTokenizer(path: String): Pair<LlamaRuntimeInterface<*>, Tokenizer> {
+private suspend fun loadRuntimeAndTokenizer(path: String): Pair<InferenceRuntime<FP32>, Tokenizer> {
     val resp: Response = (window.fetch(path) as Promise<Response>).await()
     if (!resp.ok) error("Failed to fetch model: ${resp.statusText}")
-    // On Wasm, use arrayBuffer() and feed bytes into a kotlinx-io Buffer as Source
+    // On Wasm: arrayBuffer() → kotlinx-io Buffer → Source. Two sources are
+    // built from the same bytes (one for weights, one for the tokenizer)
+    // because each consumer drains its source.
     val buf: ArrayBuffer = (resp.arrayBuffer() as Promise<ArrayBuffer>).await()
     val view = DataView(buf)
     val length = view.byteLength
@@ -67,21 +71,28 @@ private suspend fun loadRuntimeAndTokenizer(path: String): Pair<LlamaRuntimeInte
         bytes[i] = view.getUint8(i).toByte()
     }
 
-    // Create source for loading weights
-    val buffer1 = Buffer().apply { write(bytes) }
-    val source1: Source = (buffer1 as RawSource).buffered()
     val ctx = DirectCpuExecutionContext()
-    val weights = loadLlamaRuntimeWeights(
+
+    // Weights via the DSL path. Wasm has no MemorySegment, so we use
+    // QuantPolicy.DEQUANTIZE_TO_FP32; packed Q4/Q8 don't have a
+    // wasm-side fast path anyway.
+    val weightSource = (Buffer().apply { write(bytes) } as RawSource).buffered()
+    val weights = DecoderGgufWeightLoader(
+        sourceProvider = { weightSource },
+        quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32,
+    ).loadToMap<FP32, Float>(ctx)
+
+    val model = LlamaNetworkLoader.fromWeights(weights)
+    val runtime = OptimizedLLMRuntime(
+        model = model,
         ctx = ctx,
-        sourceProvider = { source1 },
-        quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32
+        mode = OptimizedLLMMode.DIRECT,
+        dtype = FP32::class,
+        bos = weights.metadata.bosTokenId,
     )
 
-    // Create source for loading tokenizer (need fresh buffer as source is consumed)
-    val buffer2 = Buffer().apply { write(bytes) }
-    val source2: Source = (buffer2 as RawSource).buffered()
-    val tokenizer = GGUFTokenizer.fromSource(source2)
+    val tokenizerSource: Source = (Buffer().apply { write(bytes) } as RawSource).buffered()
+    val tokenizer = GGUFTokenizer.fromSource(tokenizerSource)
 
-    val backend = sk.ainet.apps.kllama.CpuAttentionBackend(ctx, weights, sk.ainet.lang.types.FP32::class)
-    return LlamaRuntime(ctx, weights, backend, sk.ainet.lang.types.FP32::class) to tokenizer
+    return runtime to tokenizer
 }
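
For readers following the change, here is a minimal usage sketch of how the returned pair might be driven with the generate extension this commit imports (sk.ainet.apps.llm.generate). The diff does not show generate's signature, so runPrompt, encode, decode, the maxTokens parameter, and the token callback below are illustrative assumptions, not the project's actual API.

// Hypothetical usage sketch: generate()'s parameters and callback shape are
// assumed here, since the commit only adds the import, not a call site.
suspend fun runPrompt(prompt: String) {
    val (runtime, tokenizer) = loadRuntimeAndTokenizer("model.gguf")
    val ids = tokenizer.encode(prompt)             // assumed Tokenizer method
    runtime.generate(ids, maxTokens = 64) { id ->  // assumed generate() shape
        print(tokenizer.decode(listOf(id)))        // assumed streaming decode
    }
}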