@@ -745,6 +745,100 @@ pub const GGUFModel = struct {
745745 }
746746};
747747
748+ // ═══════════════════════════════════════════════════════════════════════════════
749+ // MMAP GGUF MODEL - Near-instant loading via memory mapping
750+ // ═══════════════════════════════════════════════════════════════════════════════
751+
/// GGUF model backed by `gguf.MmapGGUFReader` (memory-mapped file access).
///
/// Raw tensor bytes are returned directly from the reader by `getTensorData`
/// without copying. The `f32` weight buffers below are separate heap
/// allocations that only exist after `loadEmbeddings` has been called.
///
/// Typical use: `var m = try MmapGGUFModel.init(a, path); defer m.deinit();`
pub const MmapGGUFModel = struct {
    allocator: std.mem.Allocator,
    reader: gguf.MmapGGUFReader,
    config: ModelConfig,

    // Dequantized weights (loaded on demand by `loadEmbeddings`).
    // Each non-null slice was allocated with `allocator` and is released by `deinit`.
    token_embedding: ?[]f32,
    output_weight: ?[]f32,
    output_norm: ?[]f32,

    /// Open the GGUF file at `path` and build the model configuration from
    /// its metadata. No weights are dequantized here.
    ///
    /// Metadata keys are looked up under the file's `general.architecture`
    /// prefix (default "llama"); a missing key falls back to the hard-coded
    /// default given next to it below.
    ///
    /// Errors: whatever `gguf.MmapGGUFReader.init` returns, plus
    /// `error.InvalidModelConfig` when the file reports zero attention heads
    /// (the head dimension could not be computed). The reader is closed again
    /// on every error path.
    pub fn init(allocator: std.mem.Allocator, path: []const u8) !MmapGGUFModel {
        var reader = try gguf.MmapGGUFReader.init(allocator, path);
        errdefer reader.deinit();

        const arch = reader.getMetadataString("general.architecture") orelse "llama";

        // Scratch space for "<arch>.<key>" strings. It is reused for every
        // lookup, so each key must be consumed before the next one is built.
        var key_buf: [64]u8 = undefined;

        // Vocabulary size is taken from the second dimension of
        // "output.weight"; 32000 is used when that tensor is absent.
        const vocab_size = blk: {
            if (reader.getTensor("output.weight")) |t| {
                break :blk @as(u32, @intCast(t.dims[1]));
            }
            break :blk @as(u32, 32000);
        };

        var config = ModelConfig{
            .vocab_size = vocab_size,
            .hidden_size = @intCast(reader.getMetadataU32(metadataKey(&key_buf, arch, "embedding_length")) orelse 2048),
            .intermediate_size = @intCast(reader.getMetadataU32(metadataKey(&key_buf, arch, "feed_forward_length")) orelse 5632),
            .num_layers = @intCast(reader.getMetadataU32(metadataKey(&key_buf, arch, "block_count")) orelse 22),
            .num_heads = @intCast(reader.getMetadataU32(metadataKey(&key_buf, arch, "attention.head_count")) orelse 32),
            .num_kv_heads = @intCast(reader.getMetadataU32(metadataKey(&key_buf, arch, "attention.head_count_kv")) orelse 4),
            .head_dim = 0, // derived below once num_heads has been validated
            .context_length = @intCast(reader.getMetadataU32(metadataKey(&key_buf, arch, "context_length")) orelse 2048),
            .rope_theta = reader.getMetadataF32(metadataKey(&key_buf, arch, "rope.freq_base")) orelse 10000.0,
            .rms_norm_eps = reader.getMetadataF32(metadataKey(&key_buf, arch, "attention.layer_norm_rms_epsilon")) orelse 1e-5,
        };

        // num_heads comes from the file; a zero would otherwise be a
        // division by zero on the next line.
        if (config.num_heads == 0) return error.InvalidModelConfig;
        config.head_dim = config.hidden_size / config.num_heads;

        return MmapGGUFModel{
            .allocator = allocator,
            .reader = reader,
            .config = config,
            .token_embedding = null,
            .output_weight = null,
            .output_norm = null,
        };
    }

    /// Free any dequantized weight buffers and release the reader.
    pub fn deinit(self: *MmapGGUFModel) void {
        if (self.token_embedding) |e| self.allocator.free(e);
        if (self.output_weight) |w| self.allocator.free(w);
        if (self.output_norm) |n| self.allocator.free(n);
        self.reader.deinit();
    }

    /// Dequantize "token_embd.weight", "output.weight" and
    /// "output_norm.weight" into owned `f32` buffers. The source bytes are
    /// read from the reader without copying; only the output is allocated.
    ///
    /// A tensor that is not present in the file leaves its field unchanged.
    /// Calling this again replaces (and frees) previously loaded buffers.
    /// On error, tensors loaded before the failure stay populated and are
    /// still released by `deinit`.
    pub fn loadEmbeddings(self: *MmapGGUFModel) !void {
        try self.reloadWeight(&self.token_embedding, "token_embd.weight");
        try self.reloadWeight(&self.output_weight, "output.weight");
        try self.reloadWeight(&self.output_norm, "output_norm.weight");
    }

    /// Get tensor data directly from the reader (zero-copy), or null when no
    /// tensor called `name` exists. The returned slice is not owned by the
    /// caller.
    pub fn getTensorData(self: *const MmapGGUFModel, name: []const u8) ?[]const u8 {
        const info = self.reader.getTensor(name) orelse return null;
        return self.reader.getTensorData(info);
    }

    /// Format "<arch>.<suffix>" into `buf`. If it does not fit, fall back to
    /// the static "llama.<suffix>" key.
    fn metadataKey(buf: []u8, arch: []const u8, comptime suffix: []const u8) []const u8 {
        return std.fmt.bufPrint(buf, "{s}." ++ suffix, .{arch}) catch ("llama." ++ suffix);
    }

    /// Dequantize tensor `name` into `slot.*`. The new buffer is allocated
    /// first and the old one freed only afterwards, so a failed allocation
    /// leaves the previous contents intact and nothing leaks on reload.
    fn reloadWeight(self: *MmapGGUFModel, slot: *?[]f32, name: []const u8) !void {
        const info = self.reader.getTensor(name) orelse return;
        const data = self.reader.getTensorData(info); // zero-copy view
        const fresh = try dequantizeTensor(self.allocator, data, info.tensor_type, info.numElements());
        if (slot.*) |old| self.allocator.free(old);
        slot.* = fresh;
    }
};
841+
748842// Tests
749843test "dequantize_q8_0" {
750844 const allocator = std .testing .allocator ;
0 commit comments