|
| 1 | +# speculative_decoding.vibee |
| 2 | +# Speculative Decoding for faster autoregressive generation |
| 3 | +# Generate multiple tokens per target model forward pass |
| 4 | + |
| 5 | +name: speculative_decoding |
| 6 | +version: "1.0.0" |
| 7 | +language: zig |
| 8 | +module: speculative_decoding |
| 9 | + |
| 10 | +types: |
| 11 | + SpeculativeConfig: |
| 12 | + description: "Configuration for speculative decoding" |
| 13 | + fields: |
| 14 | + speculation_length: Int # K: number of tokens to speculate |
| 15 | + temperature: Float # Sampling temperature |
| 16 | + use_tree_attention: Bool # Enable tree-based speculation |
| 17 | + |
| 18 | + DraftResult: |
| 19 | + description: "Result from draft model speculation" |
| 20 | + fields: |
| 21 | + tokens: List<Int> # K speculated tokens |
| 22 | + probs: List<Float> # Draft probabilities for each token |
| 23 | + |
| 24 | + VerificationResult: |
| 25 | + description: "Result from target model verification" |
| 26 | + fields: |
| 27 | + accepted_count: Int # Number of accepted tokens |
| 28 | + accepted_tokens: List<Int> # Accepted token sequence |
| 29 | + next_token: Int # Token sampled after rejection |
| 30 | + acceptance_rate: Float # Running acceptance rate |
| 31 | + |
| 32 | +behaviors: |
| 33 | + - name: draft_speculate |
| 34 | + given: draft model, input token, position, K |
| 35 | + when: generating K candidate tokens |
| 36 | + then: returns DraftResult with tokens and probabilities |
| 37 | + |
| 38 | + - name: target_verify |
| 39 | + given: target model, input sequence, draft tokens |
| 40 | + when: verifying draft tokens in parallel |
| 41 | + then: returns logits for all K+1 positions |
| 42 | + |
| 43 | + - name: speculative_sample |
| 44 | + given: draft probs, target probs, draft token |
| 45 | + when: deciding to accept or reject |
| 46 | + then: accepts with prob min(1, p_target/p_draft), else samples correction |
| 47 | + |
| 48 | + - name: speculative_generate |
| 49 | + given: target model, draft model, prompt, max_tokens |
| 50 | + when: generating with speculation |
| 51 | + then: returns generated tokens with speedup |
| 52 | + |
| 53 | +# Algorithm: |
| 54 | +# |
| 55 | +# ┌─────────────────────────────────────────────────────────────┐ |
| 56 | +# │ SPECULATIVE DECODING │ |
| 57 | +# ├─────────────────────────────────────────────────────────────┤ |
| 58 | +# │ │ |
| 59 | +# │ 1. DRAFT: Generate K tokens with small model │ |
| 60 | +# │ draft_tokens = [t1, t2, t3, t4] (fast, ~10ms) │ |
| 61 | +# │ draft_probs = [p1, p2, p3, p4] │ |
| 62 | +# │ │ |
| 63 | +# │ 2. VERIFY: Run target on t0 + K draft tokens (parallel) │ |
| 64 | +# │ target_logits = target.forward([t0, t1, t2, t3, t4]) │ |
| 65 | +# │ (single forward pass, ~100ms) │ |
| 66 | +# │ │ |
| 67 | +# │ 3. ACCEPT/REJECT: For each position i: │ |
| 68 | +# │ r = uniform(0, 1) │ |
| 69 | +# │ if r < min(1, target_prob[i] / draft_prob[i]): │ |
| 70 | +# │ ACCEPT token i │ |
| 71 | +# │ else: │ |
| 72 | +# │ REJECT: sample from norm(max(0, target - draft)) │ |
| 73 | +# │ STOP speculation │ |
| 74 | +# │ │ |
| 75 | +# │ 4. BONUS: If all K accepted, sample token K+1 from target │ |
| 76 | +# │ │ |
| 77 | +# └─────────────────────────────────────────────────────────────┘ |
| 78 | +# |
| 79 | +# Speedup Analysis: |
| 80 | +# Without speculation: 1 token per forward pass |
| 81 | +# With speculation (K=4, α=0.8): |
| 82 | +# Expected tokens = 1 + α + α² + α³ + α⁴ = 3.36 |
| 83 | +# Cost = 1 target + K draft ≈ 1.1 target (if all K draft steps together take ~1/10 of one target pass) |
| 84 | +# Speedup = 3.36 / 1.1 ≈ 3x |
| 85 | +# |
| 86 | +# Self-Speculation (no draft model): |
| 87 | +# Use early exit from target model as draft |
| 88 | +# Or use same model with reduced layers |
0 commit comments