@@ -4,56 +4,51 @@ import kotlinx.coroutines.runBlocking
 import kotlinx.io.buffered
 import kotlinx.io.files.Path
 import kotlinx.io.files.SystemFileSystem
+import kotlin.reflect.KClass
 import kotlin.time.measureTime
 import sk.ainet.apps.kllama.CpuAttentionBackend
 import sk.ainet.apps.kllama.GGUFTokenizer
-import sk.ainet.apps.kllama.LlamaIngestion
-import sk.ainet.apps.kllama.LlamaLoadConfig
+import sk.ainet.apps.kllama.Llama2DotCWeightLoader
+import sk.ainet.apps.kllama.TokenizerUtils
+import sk.ainet.apps.llm.InferenceRuntime
+import sk.ainet.apps.llm.OptimizedLLMMode
+import sk.ainet.apps.llm.OptimizedLLMRuntime
 import sk.ainet.apps.llm.Tokenizer
-import sk.ainet.apps.kllama.GpuAttentionBackend
 import sk.ainet.apps.llm.backend.BackendRegistry
 import sk.ainet.apps.llm.backend.availableNames
 import sk.ainet.apps.llm.backend.bestAvailable
 import sk.ainet.apps.llm.backend.find
-import sk.ainet.models.llama.LlamaRuntime
-import sk.ainet.models.llama.LlamaRuntimeInterface
-import sk.ainet.apps.kllama.Llama2DotCWeightLoader
-import sk.ainet.apps.kllama.TokenizerUtils
-import sk.ainet.models.llama.LlamaRuntimeWeights
-import sk.ainet.io.model.QuantPolicy
+import sk.ainet.apps.llm.generate
 import sk.ainet.context.ExecutionContext
+import sk.ainet.io.model.QuantPolicy
 import sk.ainet.lang.types.DType
 import sk.ainet.lang.types.FP16
 import sk.ainet.lang.types.FP32
-import kotlin.reflect.KClass
+import sk.ainet.models.llama.DecoderGgufWeightLoader
+import sk.ainet.models.llama.LlamaNetworkLoader
+import sk.ainet.models.llama.LlamaRuntime
+import sk.ainet.models.llama.LlamaRuntimeWeights
 
 private fun usage(): Nothing {
-    println("Usage: kllama <model> [tokenizer] <prompt> [steps=64] [temperature=0.8] [--backend=cpu] [--gpu-opt] [--dtype=fp16|fp32]")
+    println("Usage: kllama <model> [tokenizer] <prompt> [steps=64] [temperature=0.8] [--backend=cpu] [--dtype=fp16|fp32]")
     println("  <model>          Path to .gguf or .bin model")
     println("  <tokenizer>      Path to tokenizer.bin (required for .bin, optional for .gguf)")
     println("  <prompt>         Text prompt")
     println("  --backend=NAME   Execution backend (default: ${BackendRegistry.bestAvailable().name})")
-    println("  --gpu-opt        Use GPU-optimized runtime (reduces CPU roundtrips)")
-    println("  --graph          Use MPSGraph compiled execution (Metal backend only)")
     println("  --dtype=TYPE     Tensor dtype: fp16 or fp32 (default: fp32)")
     println("  --list-backends  List available backends and exit")
     println("Available backends: ${BackendRegistry.availableNames().joinToString(", ")}")
     throw IllegalArgumentException("Invalid arguments")
 }
 
 fun main(args: Array<String>) = runBlocking {
-    // Register platform-specific backends
     registerPlatformBackends()
 
     var backendName: String? = null
-    var useGpuOpt = false
-    var useGraph = false
     var dtypeStr = "fp32"
     val filteredArgs = args.filter { arg ->
         when {
             arg.startsWith("--backend=") -> { backendName = arg.substringAfter("="); false }
-            arg == "--gpu-opt" -> { useGpuOpt = true; false }
-            arg == "--graph" -> { useGraph = true; useGpuOpt = true; false }
             arg.startsWith("--dtype=") -> { dtypeStr = arg.substringAfter("=").lowercase(); false }
             arg == "--list-backends" -> {
                 val providers = BackendRegistry.providers()
@@ -107,52 +102,62 @@ fun main(args: Array<String>) = runBlocking {
     val ctx = provider.createContext()
 
     when (dtypeStr) {
-        "fp16" -> runInference<FP16>(ctx, FP16::class, isGguf, modelPathStr, modelPath, useGpuOpt, useGraph, tokenizerPathStr, prompt, steps, temperature)
-        "fp32" -> runInference<FP32>(ctx, FP32::class, isGguf, modelPathStr, modelPath, useGpuOpt, useGraph, tokenizerPathStr, prompt, steps, temperature)
+        "fp16" -> runInference<FP16>(ctx, FP16::class, isGguf, modelPathStr, modelPath, tokenizerPathStr, prompt, steps, temperature)
+        "fp32" -> runInference<FP32>(ctx, FP32::class, isGguf, modelPathStr, modelPath, tokenizerPathStr, prompt, steps, temperature)
         else -> error("Unsupported dtype: $dtypeStr. Use fp16 or fp32.")
     }
 }
 
-private suspend fun <T : DType> runInference(
+// Reified so we can call `LlamaNetworkLoader.fromWeights<T, V>` and
+// `DecoderGgufWeightLoader.loadToMap<T, V>` (both `inline reified T`).
+// The legacy `LlamaRuntime<T>` ctor doesn't need reification — only the
+// DSL path does.
+@Suppress("DuplicatedCode")
+private suspend inline fun <reified T : DType> runInference(
     ctx: ExecutionContext,
     dtype: KClass<T>,
     isGguf: Boolean,
     modelPathStr: String,
     modelPath: Path,
-    useGpuOpt: Boolean,
-    useGraph: Boolean,
     tokenizerPathStr: String?,
     prompt: String,
     steps: Int,
-    temperature: Float
+    temperature: Float,
 ) {
-    val runtimeWeights = if (isGguf) {
-        val ingestion = LlamaIngestion<T>(
+    val runtime: InferenceRuntime<T>
+    val vocabSize: Int
+
+    if (isGguf) {
+        // DSL path. Native has no MemorySegment, so QuantPolicy.DEQUANTIZE_TO_FP32
+        // is the only viable choice.
+        println("Loading GGUF model from $modelPathStr (Llama, DSL streaming, dtype=${dtype.simpleName})...")
+        val weights = DecoderGgufWeightLoader(
+            sourceProvider = { SystemFileSystem.source(modelPath).buffered() },
+            quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32,
+        ).loadToMap<T, Float>(ctx)
+        val model = LlamaNetworkLoader.fromWeights(weights)
+        runtime = OptimizedLLMRuntime(
+            model = model,
             ctx = ctx,
+            mode = OptimizedLLMMode.DIRECT,
             dtype = dtype,
-            config = LlamaLoadConfig(
-                quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32,
-                allowQuantized = false
-            )
+            bos = weights.metadata.bosTokenId,
         )
-        println("Loading GGUF model from $modelPathStr (dtype=${dtype.simpleName})...")
-        ingestion.load {
-            SystemFileSystem.source(modelPath).buffered()
-        }
+        vocabSize = weights.metadata.vocabSize
     } else {
+        // BIN (Karpathy llama2.c format) — kept on legacy LlamaRuntime; the
+        // .bin loader returns LlamaRuntimeWeights directly. Migrating .bin
+        // to the DSL path requires a converter and isn't in scope here.
         println("Loading Karpathy .bin model from $modelPathStr...")
         @Suppress("UNCHECKED_CAST")
-        Llama2DotCWeightLoader.load(ctx, SystemFileSystem.source(modelPath).buffered()) as LlamaRuntimeWeights<T>
+        val runtimeWeights = Llama2DotCWeightLoader.load(ctx, SystemFileSystem.source(modelPath).buffered())
+            as LlamaRuntimeWeights<T>
+        val cpuBackend = CpuAttentionBackend<T>(ctx, runtimeWeights, dtype)
+        @Suppress("DEPRECATION")
+        runtime = LlamaRuntime<T>(ctx, runtimeWeights, cpuBackend, dtype)
+        vocabSize = runtimeWeights.metadata.vocabSize
     }
 
-    val graphAccelerator = if (useGraph) {
-        println("Compiling MPSGraph layer graphs...")
-        createGraphAccelerator(ctx, runtimeWeights, dtype, 1e-5f)
-    } else null
-
-    val cpuBackend = CpuAttentionBackend<T>(ctx, runtimeWeights, dtype)
-    val runtime = LlamaRuntime<T>(ctx, runtimeWeights, cpuBackend, dtype, graphAccelerator = graphAccelerator)
-
     val tokenizer: Tokenizer = if (isGguf && tokenizerPathStr == null) {
         println("Loading embedded GGUF tokenizer...")
         GGUFTokenizer.fromSource(SystemFileSystem.source(modelPath).buffered())
@@ -161,7 +166,7 @@ private suspend fun <T : DType> runInference(
         val tPath = Path(tPathStr)
         if (!SystemFileSystem.exists(tPath)) error("Tokenizer not found: $tPathStr")
         println("Loading tokenizer from $tPathStr...")
-        TokenizerUtils.buildTokenizer(SystemFileSystem.source(tPath).buffered(), runtimeWeights.metadata.vocabSize)
+        TokenizerUtils.buildTokenizer(SystemFileSystem.source(tPath).buffered(), vocabSize)
     }
 
     val promptTokens = tokenizer.encode(prompt)
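
For reference, the new GGUF branch condenses to the sketch below, specialized to FP32. It uses only constructors and calls that appear verbatim in this diff (`DecoderGgufWeightLoader`, `loadToMap`, `LlamaNetworkLoader.fromWeights`, `OptimizedLLMRuntime`); the wrapper function itself and its `ctx`/`modelPath` parameters are illustrative assumptions, not part of the commit.

    // Sketch of the DSL loading path added in this commit, FP32 only.
    // The function wrapper is hypothetical; the body mirrors runInference.
    private suspend fun loadGgufFp32(ctx: ExecutionContext, modelPath: Path): InferenceRuntime<FP32> {
        val weights = DecoderGgufWeightLoader(
            sourceProvider = { SystemFileSystem.source(modelPath).buffered() },
            quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32, // native has no MemorySegment
        ).loadToMap<FP32, Float>(ctx)
        return OptimizedLLMRuntime(
            model = LlamaNetworkLoader.fromWeights(weights),
            ctx = ctx,
            mode = OptimizedLLMMode.DIRECT,
            dtype = FP32::class,
            bos = weights.metadata.bosTokenId,
        )
    }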