Add temperature + top-p (nucleus) sampling

gHashTag · ona-agent · gHashTag · commit e8265c97e7bf · 2026-02-01T15:53:04.000Z
- Temperature scaling for logits diversity
- Top-p nucleus sampling (sorted probability cutoff)
- CLI params: --temperature (default 0.7), --top-p (default 0.9)
- SamplingParams struct with temperature, top_p, top_k, repeat_penalty (top_k and repeat_penalty are stored but not yet applied by the sampler)

Result: Model now gives diverse, relevant responses instead of
repetitive 'You are a person...' output.

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/bin/vibee b/bin/vibee
diff --git a/src/vibeec/gen_cmd.zig b/src/vibeec/gen_cmd.zig
@@ -44,6 +44,8 @@ pub fn main() !void {
         var model_path: ?[]const u8 = null;
         var prompt: ?[]const u8 = null;
         var max_tokens: u32 = 100;
+        var temperature: f32 = 0.7;
+        var top_p: f32 = 0.9;
 
         var i: usize = 2;
         while (i < args.len) : (i += 1) {
@@ -56,6 +58,12 @@ pub fn main() !void {
             } else if (std.mem.eql(u8, args[i], "--max-tokens") and i + 1 < args.len) {
                 max_tokens = std.fmt.parseInt(u32, args[i + 1], 10) catch 100;
                 i += 1;
+            } else if (std.mem.eql(u8, args[i], "--temperature") and i + 1 < args.len) {
+                temperature = std.fmt.parseFloat(f32, args[i + 1]) catch 0.7;
+                i += 1;
+            } else if (std.mem.eql(u8, args[i], "--top-p") and i + 1 < args.len) {
+                top_p = std.fmt.parseFloat(f32, args[i + 1]) catch 0.9;
+                i += 1;
             }
         }
 
@@ -64,7 +72,7 @@ pub fn main() !void {
             return;
         }
 
-        try gguf_chat.runChat(allocator, model_path.?, prompt, max_tokens);
+        try gguf_chat.runChat(allocator, model_path.?, prompt, max_tokens, temperature, top_p);
     } else if (std.mem.eql(u8, command, "help") or std.mem.eql(u8, command, "--help")) {
         printUsage();
     } else {
@@ -86,6 +94,8 @@ fn printUsage() void {
         \\  vibeec chat --model <path.gguf> [options]   Chat with GGUF model (SIMD optimized)
         \\    --prompt "text"                           Initial prompt
         \\    --max-tokens N                            Max tokens to generate (default: 100)
+        \\    --temperature F                           Sampling temperature (default: 0.7)
+        \\    --top-p F                                 Top-p nucleus sampling (default: 0.9)
         \\  vibeec help                                 Show this help
         \\
     , .{});
diff --git a/src/vibeec/gguf_chat.zig b/src/vibeec/gguf_chat.zig
@@ -12,17 +12,28 @@ const inference = @import("gguf_inference.zig");
 // Chat template for formatting prompts
 const ChatTemplate = tokenizer_mod.ChatTemplate;
 
+// Sampling parameters struct
+const SamplingParams = inference.SamplingParams;
+
 // Entry point for CLI chat command
-pub fn runChat(allocator: std.mem.Allocator, model_path: []const u8, initial_prompt: ?[]const u8, max_tokens: u32) !void {
+pub fn runChat(allocator: std.mem.Allocator, model_path: []const u8, initial_prompt: ?[]const u8, max_tokens: u32, temperature: f32, top_p: f32) !void {
     const stdout = std.io.getStdOut().writer();
 
     try stdout.print("\n", .{});
     try stdout.print("╔══════════════════════════════════════════════════════════════╗\n", .{});
     try stdout.print("║           TRINITY CHAT - SIMD Optimized LLM                  ║\n", .{});
-    try stdout.print("║           Chat Template + Streaming Output                   ║\n", .{});
+    try stdout.print("║           Temperature + Top-p Sampling                       ║\n", .{});
     try stdout.print("╚══════════════════════════════════════════════════════════════╝\n", .{});
     try stdout.print("\n", .{});
 
+    // Create sampling params
+    const sampling_params = SamplingParams{
+        .temperature = temperature,
+        .top_p = top_p,
+        .top_k = 40,
+        .repeat_penalty = 1.1,
+    };
+
     // Load model
     std.debug.print("Loading model: {s}\n", .{model_path});
     var model = model_mod.FullModel.init(allocator, model_path) catch |err| {
@@ -56,11 +67,12 @@ pub fn runChat(allocator: std.mem.Allocator, model_path: []const u8, initial_pro
 
     std.debug.print("Chat template: TinyLlama (ChatML format)\n", .{});
     std.debug.print("System: {s}\n", .{system_prompt});
+    std.debug.print("Sampling: temperature={d:.2}, top_p={d:.2}\n", .{sampling_params.temperature, sampling_params.top_p});
     std.debug.print("\nReady! Type your message (or 'quit' to exit):\n\n", .{});
 
     // Handle initial prompt if provided
     if (initial_prompt) |prompt| {
-        try generateWithTemplate(allocator, stdout, &model, &tokenizer, &template, system_prompt, prompt, max_tokens);
+        try generateWithTemplate(allocator, stdout, &model, &tokenizer, &template, system_prompt, prompt, max_tokens, sampling_params);
     }
 
     // Interactive loop
@@ -75,7 +87,7 @@ pub fn runChat(allocator: std.mem.Allocator, model_path: []const u8, initial_pro
         if (trimmed.len == 0) continue;
         if (std.mem.eql(u8, trimmed, "quit") or std.mem.eql(u8, trimmed, "exit")) break;
 
-        try generateWithTemplate(allocator, stdout, &model, &tokenizer, &template, system_prompt, trimmed, max_tokens);
+        try generateWithTemplate(allocator, stdout, &model, &tokenizer, &template, system_prompt, trimmed, max_tokens, sampling_params);
     }
 
     try stdout.print("Goodbye!\n", .{});
@@ -91,6 +103,7 @@ fn generateWithTemplate(
     system: []const u8,
     user_input: []const u8,
     max_tokens: u32,
+    params: SamplingParams,
 ) !void {
     // Format prompt with chat template
     const formatted = try template.formatPrompt(allocator, system, user_input);
@@ -128,24 +141,28 @@ fn generateWithTemplate(
     var last_token: u32 = 0;
 
     while (generated < max_tokens) : (generated += 1) {
-        // Sample next token (greedy)
-        var max_idx: u32 = 0;
-        var max_val: f32 = current_logits[0];
-        for (current_logits[1..], 1..) |l, i| {
-            if (l > max_val) {
-                max_val = l;
-                max_idx = @intCast(i);
+        // Sample next token with temperature + top-p
+        const sampled_token = inference.sampleWithParams(allocator, current_logits, params) catch blk: {
+            // Fallback to greedy on error
+            var max_idx: u32 = 0;
+            var max_val: f32 = current_logits[0];
+            for (current_logits[1..], 1..) |l, i| {
+                if (l > max_val) {
+                    max_val = l;
+                    max_idx = @intCast(i);
+                }
             }
-        }
+            break :blk max_idx;
+        };
 
         // Free current logits
         allocator.free(current_logits);
 
         // Check for EOS
-        if (max_idx == tokenizer.eos_token) break;
+        if (sampled_token == tokenizer.eos_token) break;
 
         // Decode and stream output immediately
-        const decoded = tokenizer.decode(allocator, &[_]u32{max_idx}) catch " ";
+        const decoded = tokenizer.decode(allocator, &[_]u32{sampled_token}) catch " ";
         defer if (decoded.len > 0) allocator.free(decoded);
         
         // Stream: print immediately without buffering
@@ -156,7 +173,7 @@ fn generateWithTemplate(
         if (std.mem.indexOf(u8, decoded, "<|") != null) break;
 
         // Get next logits
-        last_token = max_idx;
+        last_token = sampled_token;
         current_logits = model.forward(last_token, current_pos) catch break;
         current_pos += 1;
     }
diff --git a/src/vibeec/gguf_inference.zig b/src/vibeec/gguf_inference.zig
@@ -144,7 +144,7 @@ pub fn softmax(output: []f32, input: []const f32) void {
     }
 }
 
-// Sample from probability distribution
+// Sample from probability distribution (basic)
 pub fn sample(probs: []const f32, temperature: f32) u32 {
     if (temperature == 0.0) {
         // Greedy sampling
@@ -174,6 +174,130 @@ pub fn sample(probs: []const f32, temperature: f32) u32 {
     return @intCast(probs.len - 1);
 }
 
+// ═══════════════════════════════════════════════════════════════════════════════
+// ADVANCED SAMPLING - Temperature + Top-p (Nucleus) Sampling
+// ═══════════════════════════════════════════════════════════════════════════════
+
+/// Sampling knobs passed to `sampleWithParams`; every field has a default.
+pub const SamplingParams = struct {
+    temperature: f32 = 0.7, // logits are scaled by 1/temperature; 0.0 selects greedy decoding
+    top_p: f32 = 0.9, // nucleus probability mass to keep; values >= 1.0 skip the cutoff
+    top_k: u32 = 40, // carried in the struct but not read by sampleWithParams
+    repeat_penalty: f32 = 1.1, // carried in the struct but not read by sampleWithParams
+};
+
+/// Scale `logits` in place by `1 / temperature` (softmax temperature scaling).
+/// No-op when `temperature` is NaN, zero, negative, or exactly 1.0.
+pub fn applyTemperature(logits: []f32, temperature: f32) void {
+    // NaN fails `> 0.0` as well, so it returns here instead of turning every logit into NaN.
+    if (!(temperature > 0.0) or temperature == 1.0) return;
+
+    const inv_temp = 1.0 / temperature;
+    for (logits) |*l| l.* *= inv_temp;
+}
+
+/// Draw the next token index from `logits` using temperature scaling followed
+/// by top-p (nucleus) filtering.
+///
+/// `logits` doubles as scratch space: unless the greedy path is taken it holds
+/// the filtered, renormalized probabilities when the function returns.
+///
+/// Parameters:
+///   - `allocator`: backs a temporary index buffer of `logits.len` entries; it
+///     is only used when `params.top_p < 1.0` and is released before returning.
+///   - `logits`: raw scores, one per token index. Must not be empty.
+///   - `params`: only `temperature` and `top_p` are read here; `top_k` and
+///     `repeat_penalty` are not applied by this function.
+///
+/// Behavior:
+///   - A `temperature` that is 0, negative or NaN selects greedy decoding
+///     (argmax, lowest index wins ties) and leaves `logits` untouched.
+///   - `top_p` keeps the smallest prefix of the probability-sorted tokens whose
+///     cumulative probability reaches `top_p`; at least one token is kept.
+///
+/// Errors: `error.EmptyLogits` for an empty slice, `error.OutOfMemory` when the
+/// index buffer cannot be allocated.
+pub fn sampleWithParams(allocator: std.mem.Allocator, logits: []f32, params: SamplingParams) !u32 {
+    const n = logits.len;
+    if (n == 0) return error.EmptyLogits;
+
+    // Greedy decoding. `!(t > 0)` is also true for negative and NaN values,
+    // which would otherwise be softmaxed as if the temperature were 1.
+    if (!(params.temperature > 0.0)) {
+        var best: usize = 0;
+        for (logits, 0..) |l, i| {
+            if (l > logits[best]) best = i;
+        }
+        return @intCast(best);
+    }
+
+    applyTemperature(logits, params.temperature);
+
+    // Softmax in place. Subtracting the maximum first keeps `@exp` from
+    // overflowing on large logits.
+    var max_logit: f32 = logits[0];
+    for (logits[1..]) |l| {
+        if (l > max_logit) max_logit = l;
+    }
+
+    var sum: f32 = 0.0;
+    for (logits) |*l| {
+        l.* = @exp(l.* - max_logit);
+        sum += l.*;
+    }
+    const inv_sum = 1.0 / sum;
+    for (logits) |*l| l.* *= inv_sum;
+
+    // Top-p (nucleus) filtering.
+    if (params.top_p < 1.0) {
+        const indices = try allocator.alloc(u32, n);
+        defer allocator.free(indices);
+        for (indices, 0..) |*idx, i| idx.* = @intCast(i);
+
+        // Order the token indices by descending probability.
+        std.mem.sort(u32, indices, logits, struct {
+            fn moreLikely(probs: []f32, a: u32, b: u32) bool {
+                return probs[a] > probs[b];
+            }
+        }.moreLikely);
+
+        // Extend the nucleus until its mass reaches top_p. The first token is
+        // always taken, so the nucleus is never empty.
+        var kept: usize = 0;
+        var kept_mass: f32 = 0.0;
+        while (kept < n) {
+            kept_mass += logits[indices[kept]];
+            kept += 1;
+            if (kept_mass >= params.top_p) break;
+        }
+
+        // Drop the tail, then rescale the survivors so they sum to 1 again.
+        for (indices[kept..]) |idx| logits[idx] = 0.0;
+        if (kept_mass > 0.0) {
+            const rescale = 1.0 / kept_mass;
+            for (indices[0..kept]) |idx| logits[idx] *= rescale;
+        }
+    }
+
+    // Draw from `std.crypto.random` (thread-local CSPRNG). A PRNG re-seeded from
+    // the millisecond clock on every call repeats the same draw for calls that
+    // land in the same millisecond, and cannot take a negative timestamp.
+    const r = std.crypto.random.float(f32);
+
+    var cumsum: f32 = 0.0;
+    var last_kept: usize = 0;
+    for (logits, 0..) |p, i| {
+        if (!(p > 0.0)) continue; // removed by the nucleus filter (or NaN)
+        cumsum += p;
+        last_kept = i;
+        if (r < cumsum) return @intCast(i);
+    }
+
+    // Rounding left the running total at or below `r`: return the last token
+    // that still carries probability, never one the nucleus filter removed.
+    return @intCast(last_kept);
+}
+
 // GGUF Model for inference
 pub const GGUFModel = struct {
     allocator: std.mem.Allocator,