|
| 1 | +package sk.ainet.models.apertus |
| 2 | + |
| 3 | +import kotlinx.coroutines.runBlocking |
| 4 | +import sk.ainet.apps.llm.ModelFamily |
| 5 | +import sk.ainet.apps.llm.UnifiedModelLoader |
| 6 | +import sk.ainet.context.DirectCpuExecutionContext |
| 7 | +import sk.ainet.io.JvmRandomAccessSource |
| 8 | +import sk.ainet.io.gguf.StreamingGGUFReader |
| 9 | +import sk.ainet.io.model.QuantPolicy |
| 10 | +import java.io.File |
| 11 | +import kotlin.test.Test |
| 12 | +import kotlin.test.assertEquals |
| 13 | +import kotlin.test.assertNotNull |
| 14 | +import kotlin.test.assertTrue |
| 15 | + |
/**
 * Integration test against a real Apertus-8B-Instruct-2509 GGUF (Q4_K_S) downloaded
 * from `unsloth/Apertus-8B-Instruct-2509-GGUF` on Hugging Face.
 *
 * Skips silently when the GGUF is not present, so CI without network/cache stays green.
 *
 * Path resolution order:
 * - `APERTUS_GGUF_PATH` env var
 * - HF cache: `~/.cache/huggingface/hub/models--unsloth--Apertus-8B-Instruct-2509-GGUF/snapshots/.../Apertus-8B-Instruct-2509-Q4_K_S.gguf`
 */
class ApertusRealGgufLoadingTest {

    // Resolved once per test-class instance; null means "model unavailable, skip tests".
    private val modelFile: File? = locateModel()

    @Test
    fun `peek detects apertus architecture and reads metadata fields`() {
        val file = modelOrSkip(
            "[skip] Apertus GGUF not found; set APERTUS_GGUF_PATH or download Q4_K_S into HF cache."
        ) ?: return

        val info = UnifiedModelLoader.peek { JvmRandomAccessSource.open(file) }

        assertEquals("apertus", info.architecture, "GGUF should report apertus arch")
        assertEquals(ModelFamily.APERTUS, info.family, "ModelRegistry must classify as APERTUS")

        // Apertus-8B-Instruct-2509: 32 layers, 4096 hidden, 32k context, 131k vocab.
        assertTrue(info.blockCount > 0, "blockCount must be populated (got ${info.blockCount})")
        assertTrue(info.embeddingLength > 0, "embeddingLength must be populated (got ${info.embeddingLength})")
        assertTrue(info.contextLength > 0, "contextLength must be populated (got ${info.contextLength})")
        assertTrue(info.vocabSize > 0, "vocabSize must be populated (got ${info.vocabSize})")

        println("[real-load peek] arch=${info.architecture} layers=${info.blockCount} dim=${info.embeddingLength} ctx=${info.contextLength} vocab=${info.vocabSize}")
    }

    @Test
    fun `streaming reader exposes every tensor required by the apertus loader`() {
        val file = modelOrSkip("[skip] Apertus GGUF not found.") ?: return

        val source = JvmRandomAccessSource.open(file)
        StreamingGGUFReader.open(source).use { reader ->
            val present = reader.tensors.map { it.name }.toSet()
            val blockCount = blockCountField(reader.fields["apertus.block_count"])

            // Mirror of the tensor inventory ApertusWeightLoader reads: three global
            // tensors plus ten per-layer tensors for every transformer block.
            val required = buildList {
                add(ApertusTensorNames.TOKEN_EMBEDDINGS)
                add(ApertusTensorNames.OUTPUT_NORM)
                add(ApertusTensorNames.OUTPUT_WEIGHT)
                repeat(blockCount) { layer ->
                    add(ApertusTensorNames.attnNorm(layer))
                    add(ApertusTensorNames.attnQ(layer))
                    add(ApertusTensorNames.attnK(layer))
                    add(ApertusTensorNames.attnV(layer))
                    add(ApertusTensorNames.attnOut(layer))
                    add(ApertusTensorNames.attnQNorm(layer))
                    add(ApertusTensorNames.attnKNorm(layer))
                    add(ApertusTensorNames.ffnNorm(layer))
                    add(ApertusTensorNames.ffnDown(layer))
                    add(ApertusTensorNames.ffnUp(layer))
                }
            }

            val missing = required.filter { it !in present }
            assertTrue(missing.isEmpty(), "Tensors required by ApertusWeightLoader are absent from real GGUF:\n ${missing.joinToString("\n ")}")
        }
    }

    @Test
    fun `loadQuantized fully populates ApertusQuantizedWeights from real GGUF`() = runBlocking {
        val file = modelOrSkip("[skip] Apertus GGUF not found.") ?: return@runBlocking
        // Token-embedding dequant to FP32 alone is ~2 GB (4096 × 131072 floats); the
        // raw quant bytes for the rest add another ~5 GB. Need ≥ 8 GB heap to fit.
        val maxHeapGb = Runtime.getRuntime().maxMemory() / (1024L * 1024L * 1024L)
        if (maxHeapGb < 8) {
            println("[skip] heap=$maxHeapGb GB < 8 GB; rerun with -PapertusTestMaxHeap=12g")
            return@runBlocking
        }

        val ctx = DirectCpuExecutionContext.create()
        val loader = ApertusWeightLoader.fromRandomAccess(
            randomAccessProvider = { JvmRandomAccessSource.open(file) },
            quantPolicy = QuantPolicy.RAW_BYTES
        )

        val weights = loader.loadQuantized(ctx)
        val md = weights.metadata

        // Apertus-8B reference dimensions (from HF config.json).
        assertTrue(md.blockCount in 24..40, "Unexpected blockCount=${md.blockCount}")
        assertEquals(4096, md.embeddingLength, "Unexpected embeddingLength=${md.embeddingLength}")
        assertTrue(md.headCount > 0, "headCount=${md.headCount}")
        assertTrue(md.kvHeadCount in 1..md.headCount, "kvHeadCount=${md.kvHeadCount}")
        assertTrue(md.vocabSize > 100_000, "vocabSize=${md.vocabSize}")

        // FP32 small tensors (norms, token embedding) must be present.
        assertNotNull(weights.fp32Tensors[ApertusTensorNames.TOKEN_EMBEDDINGS],
            "${ApertusTensorNames.TOKEN_EMBEDDINGS} must be loaded as FP32")
        assertNotNull(weights.fp32Tensors[ApertusTensorNames.OUTPUT_NORM],
            "${ApertusTensorNames.OUTPUT_NORM} must be loaded as FP32")
        repeat(md.blockCount) { layer ->
            assertNotNull(weights.fp32Tensors[ApertusTensorNames.attnNorm(layer)],
                "${ApertusTensorNames.attnNorm(layer)} must be FP32")
            assertNotNull(weights.fp32Tensors[ApertusTensorNames.ffnNorm(layer)],
                "${ApertusTensorNames.ffnNorm(layer)} must be FP32")
            assertNotNull(weights.fp32Tensors[ApertusTensorNames.attnQNorm(layer)],
                "${ApertusTensorNames.attnQNorm(layer)} must be FP32")
            assertNotNull(weights.fp32Tensors[ApertusTensorNames.attnKNorm(layer)],
                "${ApertusTensorNames.attnKNorm(layer)} must be FP32")
        }

        // Large quantized projection matrices must be present.
        repeat(md.blockCount) { layer ->
            assertNotNull(weights.quantizedTensors[ApertusTensorNames.attnQ(layer)],
                "${ApertusTensorNames.attnQ(layer)} must be quantized")
            assertNotNull(weights.quantizedTensors[ApertusTensorNames.ffnDown(layer)],
                "${ApertusTensorNames.ffnDown(layer)} must be quantized")
        }

        // xIELU params must be populated for every layer.
        assertEquals(md.blockCount, weights.xieluParams.size,
            "xieluParams (${weights.xieluParams.size}) must match blockCount (${md.blockCount})")

        println("[real-load loadQuantized] fp32=${weights.fp32Tensors.size} quant=${weights.quantizedTensors.size} xielu-layers=${weights.xieluParams.size}")
    }

    /*
     * End-to-end network construction is intentionally NOT exercised here.
     * (Plain block comment, not KDoc, so doc tooling does not attach it to locateModel.)
     *
     * `apertusNetwork(metadata)` (DSL inside skainet-lang-core) pre-allocates FP32
     * zero-tensors for every Linear layer at construction time — independent of the
     * `quantPolicy` chosen in the loader. For Apertus-8B (32 layers, 4096 hidden,
     * ~14k FFN, 131k vocab) that's ~27 GB of FP32 zeros before WeightMapper has a
     * chance to substitute in the loaded tensors, which OOMs anything under 32 GB
     * of heap. The cleanup PR (commit 8a7e0ff) also removed `ApertusQuantizedRuntime`,
     * which was the only memory-efficient runtime path for quantized Apertus models.
     *
     * Tracking issue: see follow-up to be filed; the fix is in the DSL builder
     * (NetworkBuilder.kt:652 in skainet-lang-core) which calls `zeros(shape)` to
     * initialize Linear weights eagerly. Loader-level correctness up to
     * [ApertusWeightLoader.loadQuantized] is verified by the tests above.
     */

    /**
     * Returns [modelFile] when a GGUF was located; otherwise prints [message]
     * (the per-test skip notice) and returns null so the caller can bail out.
     */
    private fun modelOrSkip(message: String): File? {
        if (modelFile == null) println(message)
        return modelFile
    }

    /**
     * Extracts `apertus.block_count` from a raw GGUF field value.
     *
     * GGUF integer fields may surface as a boxed [Number] or as Kotlin [UInt] —
     * and `UInt` is NOT a `Number` subtype, so both representations must be
     * handled explicitly before failing.
     */
    private fun blockCountField(raw: Any?): Int = when (raw) {
        is Number -> raw.toInt()
        is UInt -> raw.toInt()
        else -> error("apertus.block_count missing")
    }

    /**
     * Locates the test GGUF: the `APERTUS_GGUF_PATH` env var wins when it points at a
     * regular file; otherwise the default Hugging Face hub cache is scanned one level
     * below `snapshots/` (one directory per revision) for the Q4_K_S artifact.
     * Returns null when neither source yields a file.
     */
    private fun locateModel(): File? {
        System.getenv("APERTUS_GGUF_PATH")
            ?.let { File(it) }
            ?.takeIf { it.isFile }
            ?.let { return it }
        val home = System.getProperty("user.home")
        val snapshotsDir = File("$home/.cache/huggingface/hub/models--unsloth--Apertus-8B-Instruct-2509-GGUF/snapshots")
        if (!snapshotsDir.isDirectory) return null
        return snapshotsDir.listFiles()?.asSequence()
            ?.flatMap { it.listFiles()?.asSequence() ?: emptySequence() }
            ?.firstOrNull { it.name == "Apertus-8B-Instruct-2509-Q4_K_S.gguf" }
    }
}
0 commit comments