|
//! Codec provenance map: which real codec each primitive comes from.
//!
//! Every primitive in this audio stack was stolen from a production codec.
//! Nothing invented — only transcoded and compressed to fit the HHTL cascade.
//!
//! ```text
//! ┌─────────────┬──────────┬─────────┬────────┬─────────┬──────┬───────────┐
//! │ Our type    │ Opus     │ Whisper │ MP3    │ Vorbis  │ Bark │ ElevenLabs│
//! ├─────────────┼──────────┼─────────┼────────┼─────────┼──────┼───────────┤
//! │ MDCT        │ CELT     │         │ hybrid │ ✓       │      │           │
//! │ 21 bands    │ eBands48 │         │ 32 sub │ ✓       │      │           │
//! │ PVQ shape   │ CELT PVQ │         │        │ residue │      │           │
//! │ Mel 80ch    │          │ frontend│        │         │      │           │
//! │ Phase 4B    │          │ STFT ∠  │        │         │      │           │
//! │ VoiceArch   │          │         │        │         │ spk  │ embedding │
//! │ RvqFrame    │          │         │        │         │ 3stg │           │
//! │ OctaveBand  │          │         │ ✓      │ floor   │      │           │
//! │ Mode        │          │         │        │         │      │ emotion   │
//! │ HHTL skip   │          │         │ mask   │ floor   │      │           │
//! │ CompLinear  │          │         │        │ VQ cb   │ RVQ  │           │
//! │ Qualia17D   │ (QPL)    │         │        │         │ sem  │ emotion   │
//! └─────────────┴──────────┴─────────┴────────┴─────────┴──────┴───────────┘
//! ```
//!
//! The architecture replaces neural inference with graph search at every stage:
//!
//! - MP3's psychoacoustic model → HHTL cascade (`RouteAction::Skip`)
//! - Whisper's transformer → phoneme graph shortest path
//! - Bark's 3 GPT-2 stages → 3 HHTL levels (HEEL/HIP/TWIG)
//! - Vorbis's codebook VQ → CompiledLinear VNNI palette lookup
//! - ElevenLabs' voice cloning → VoiceArchetype 16-byte embedding
| 31 | +
|
/// Production codec (or model) that an audio primitive was transcoded from.
///
/// This is the value stored in the `source` field of a provenance record.
/// The variants are plain tags with no payload, so the type is cheap to
/// copy, compare, and use as a map or set key.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CodecSource {
    /// Opus (the CELT layer: MDCT, band energies, PVQ).
    Opus,
    /// Whisper (mel-spectrogram frontend).
    Whisper,
    /// MP3 (psychoacoustic masking, subband filterbank).
    Mp3,
    /// Ogg Vorbis (VQ codebooks).
    OggVorbis,
    /// Bark (three-stage token generation).
    Bark,
    /// ElevenLabs (speaker embedding for voice cloning).
    ElevenLabs,
}
| 45 | + |
/// What aspect of audio each primitive captures.
///
/// Each provenance record is tagged with exactly one aspect. The variants
/// carry no payload, so the type is cheap to copy, compare, and use as a
/// map or set key.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum AudioAspect {
    /// Spectral energy distribution (WHAT frequencies).
    SpectralEnvelope,
    /// Fine spectral shape within bands (HOW the energy is distributed).
    SpectralShape,
    /// Perceptual frequency mapping (WHERE in human hearing).
    PerceptualMapping,
    /// Temporal phase relationships (WHEN harmonics align).
    PhaseRelationship,
    /// Speaker identity (WHO is speaking).
    SpeakerIdentity,
    /// Semantic/emotional content (WHY it sounds that way).
    SemanticContent,
    /// Psychoacoustic masking (WHAT to skip).
    MaskingDecision,
    /// Codebook lookup (HOW to decompress).
    CodebookLookup,
}
| 66 | + |
| 67 | +/// Complete provenance record for one primitive. |
| 68 | +pub struct Provenance { |
| 69 | + pub our_type: &'static str, |
| 70 | + pub byte_size: usize, |
| 71 | + pub source: CodecSource, |
| 72 | + pub aspect: AudioAspect, |
| 73 | + pub source_concept: &'static str, |
| 74 | + pub what_it_replaces: &'static str, |
| 75 | +} |
| 76 | + |
| 77 | +/// Full provenance table for every audio primitive. |
| 78 | +/// |
| 79 | +/// This IS the design document. If a new primitive doesn't appear here, |
| 80 | +/// it wasn't stolen from a real codec and shouldn't exist. |
| 81 | +pub const PROVENANCE: &[Provenance] = &[ |
| 82 | + // ═══ From Opus CELT ═══ |
| 83 | + Provenance { |
| 84 | + our_type: "AudioFrame.band_energies", |
| 85 | + byte_size: 42, |
| 86 | + source: CodecSource::Opus, |
| 87 | + aspect: AudioAspect::SpectralEnvelope, |
| 88 | + source_concept: "eBands48 critical bands, gain in gain-shape split", |
| 89 | + what_it_replaces: "Per-coefficient quantization (MP3/Vorbis)", |
| 90 | + }, |
| 91 | + Provenance { |
| 92 | + our_type: "AudioFrame.pvq_summary", |
| 93 | + byte_size: 6, |
| 94 | + source: CodecSource::Opus, |
| 95 | + aspect: AudioAspect::SpectralShape, |
| 96 | + source_concept: "PVQ (Pyramid Vector Quantization) pulse allocation", |
| 97 | + what_it_replaces: "Huffman-coded residuals (MP3) / VQ codebook (Vorbis)", |
| 98 | + }, |
| 99 | + Provenance { |
| 100 | + our_type: "mdct_forward / mdct_backward", |
| 101 | + byte_size: 0, // transform, not stored |
| 102 | + source: CodecSource::Opus, |
| 103 | + aspect: AudioAspect::SpectralEnvelope, |
| 104 | + source_concept: "CELT MDCT: 960-sample window → 480 frequency bins", |
| 105 | + what_it_replaces: "FFT+windowing (all codecs use some form)", |
| 106 | + }, |
| 107 | + |
| 108 | + // ═══ From Whisper ═══ |
| 109 | + Provenance { |
| 110 | + our_type: "mel::log_mel_spectrogram", |
| 111 | + byte_size: 160, // 80 × BF16 per frame |
| 112 | + source: CodecSource::Whisper, |
| 113 | + aspect: AudioAspect::PerceptualMapping, |
| 114 | + source_concept: "80-channel mel filterbank at 16kHz, Hann STFT", |
| 115 | + what_it_replaces: "Transformer encoder (150M params → 80 f32 per frame)", |
| 116 | + }, |
| 117 | + |
| 118 | + // ═══ From MP3 ═══ |
| 119 | + Provenance { |
| 120 | + our_type: "HhtlCache::route() → Skip", |
| 121 | + byte_size: 0, // decision, not stored |
| 122 | + source: CodecSource::Mp3, |
| 123 | + aspect: AudioAspect::MaskingDecision, |
| 124 | + source_concept: "Psychoacoustic masking model (simultaneous + temporal)", |
| 125 | + what_it_replaces: "ISO 11172-3 psychoacoustic model 1/2 (iterative bit allocation)", |
| 126 | + }, |
| 127 | + Provenance { |
| 128 | + our_type: "OctaveBand", |
| 129 | + byte_size: 13, // 3×f32 + u8 |
| 130 | + source: CodecSource::Mp3, |
| 131 | + aspect: AudioAspect::SpectralEnvelope, |
| 132 | + source_concept: "32-subband polyphase filterbank (octave-spaced)", |
| 133 | + what_it_replaces: "Per-subband quantization + Huffman (MP3 granules)", |
| 134 | + }, |
| 135 | + |
| 136 | + // ═══ From Ogg Vorbis ═══ |
| 137 | + Provenance { |
| 138 | + our_type: "CompiledLinear (ndarray burn)", |
| 139 | + byte_size: 65536, // 256 centroids × 256 dim |
| 140 | + source: CodecSource::OggVorbis, |
| 141 | + aspect: AudioAspect::CodebookLookup, |
| 142 | + source_concept: "VQ codebook: precomputed centroids, lookup-based decode", |
| 143 | + what_it_replaces: "Huffman trees (MP3) / arithmetic coding (Opus range coder)", |
| 144 | + }, |
| 145 | + |
| 146 | + // ═══ From Bark (Suno) ═══ |
| 147 | + Provenance { |
| 148 | + our_type: "RvqFrame.archetype (HEEL)", |
| 149 | + byte_size: 1, |
| 150 | + source: CodecSource::Bark, |
| 151 | + aspect: AudioAspect::SemanticContent, |
| 152 | + source_concept: "Stage 1: GPT-2 semantic tokens (coarse meaning)", |
| 153 | + what_it_replaces: "350M-param GPT-2 autoregressive generation", |
| 154 | + }, |
| 155 | + Provenance { |
| 156 | + our_type: "RvqFrame.coarse (HIP)", |
| 157 | + byte_size: 8, |
| 158 | + source: CodecSource::Bark, |
| 159 | + aspect: AudioAspect::SpectralEnvelope, |
| 160 | + source_concept: "Stage 2: GPT-2 coarse acoustic tokens (spectral envelope)", |
| 161 | + what_it_replaces: "350M-param GPT-2 conditioned on semantic tokens", |
| 162 | + }, |
| 163 | + Provenance { |
| 164 | + our_type: "RvqFrame.fine (TWIG)", |
| 165 | + byte_size: 8, |
| 166 | + source: CodecSource::Bark, |
| 167 | + aspect: AudioAspect::SpectralShape, |
| 168 | + source_concept: "Stage 3: non-autoregressive fine acoustic tokens", |
| 169 | + what_it_replaces: "Fine model (smaller network, fills spectral detail)", |
| 170 | + }, |
| 171 | + |
| 172 | + // ═══ From ElevenLabs ═══ |
| 173 | + Provenance { |
| 174 | + our_type: "VoiceArchetype", |
| 175 | + byte_size: 16, |
| 176 | + source: CodecSource::ElevenLabs, |
| 177 | + aspect: AudioAspect::SpeakerIdentity, |
| 178 | + source_concept: "Speaker embedding (voice cloning conditioning vector)", |
| 179 | + what_it_replaces: "512-dim speaker embedding (2KB → 16 bytes)", |
| 180 | + }, |
| 181 | + |
| 182 | + // ═══ Phase (novel — no codec stores this) ═══ |
| 183 | + Provenance { |
| 184 | + our_type: "PhaseDescriptor", |
| 185 | + byte_size: 4, |
| 186 | + source: CodecSource::Whisper, // closest: Whisper STFT preserves phase internally |
| 187 | + aspect: AudioAspect::PhaseRelationship, |
| 188 | + source_concept: "STFT phase (discarded by all codecs except Griffin-Lim)", |
| 189 | + what_it_replaces: "Nothing — all codecs discard phase. We keep it as relative pressure.", |
| 190 | + }, |
| 191 | + |
| 192 | + // ═══ Qualia (novel — derived from QPL musical calibration) ═══ |
| 193 | + Provenance { |
| 194 | + our_type: "Qualia17D", |
| 195 | + byte_size: 68, |
| 196 | + source: CodecSource::Bark, // closest: Bark semantic tokens carry meaning |
| 197 | + aspect: AudioAspect::SemanticContent, |
| 198 | + source_concept: "QPL: Octave→arousal, Fifth→valence, Third→warmth, Tritone→tension", |
| 199 | + what_it_replaces: "No codec captures nonverbal meaning explicitly. This is the grid.", |
| 200 | + }, |
| 201 | +]; |
| 202 | + |
/// Total bytes for one complete frame (all per-frame primitives combined).
///
/// AudioFrame (48) + PhaseDescriptor (4) = 52 bytes per frame for complete
/// nonverbal characterization. VoiceArchetype (16 bytes) is treated as
/// amortized and contributes nothing to this per-frame total.
///
/// Compare (approximate):
///
/// ```text
/// MP3 128kbps:  ~417 bytes per 26ms frame
/// Opus 64kbps:  ~160 bytes per 20ms frame
/// Bark tokens:  ~128 bytes per frame
/// Ours:         52-69 bytes per frame (complete, including phase + identity)
/// ```
pub const FRAME_BUDGET: usize = 48 + 4;

/// Bytes per frame when the HHTL-compressed TTS output is included:
/// [`FRAME_BUDGET`] (52) + RvqFrame (17) = 69 bytes.
pub const FRAME_BUDGET_WITH_TTS: usize = FRAME_BUDGET + 17;
| 216 | + |
/// Codec comparison: bits per second at comparable quality.
///
/// Each row is `(label, bits per second, how that bitrate is achieved)`.
///
/// These are approximate — our codec is lossy in a fundamentally
/// different way (palette quantization, not psychoacoustic masking).
/// The two "Ours" rows are 52 bytes per frame at 50 and 25 frames per
/// second respectively, as spelled out in their notes.
pub const BITRATE_COMPARISON: &[(&str, u32, &str)] = &[
    ("MP3 128k", 128_000, "psychoacoustic masking, Huffman"),
    ("Opus 64k", 64_000, "CELT+SILK hybrid, range coder"),
    ("Vorbis 128k", 128_000, "MDCT, floor+residue, VQ codebook"),
    ("Bark tokens", 25_600, "3-stage RVQ, ~100 tokens/sec × 256 bits"),
    ("Ours (48kHz)", 20_800, "52 bytes × 50 fps × 8 bits = 20.8 kbps"),
    ("Ours (24kHz)", 10_400, "52 bytes × 25 fps × 8 bits = 10.4 kbps"),
];
| 229 | + |
| 230 | +/// Verify every AudioAspect is covered by at least one primitive. |
| 231 | +/// If an aspect is missing, we have a hole in our codec design. |
| 232 | +pub fn verify_aspect_coverage() -> Vec<AudioAspect> { |
| 233 | + use AudioAspect::*; |
| 234 | + let all = [SpectralEnvelope, SpectralShape, PerceptualMapping, |
| 235 | + PhaseRelationship, SpeakerIdentity, SemanticContent, |
| 236 | + MaskingDecision, CodebookLookup]; |
| 237 | + |
| 238 | + all.iter() |
| 239 | + .filter(|&&aspect| !PROVENANCE.iter().any(|p| p.aspect == aspect)) |
| 240 | + .copied() |
| 241 | + .collect() |
| 242 | +} |
| 243 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Byte size recorded for the provenance entry named `our_type`.
    /// Panics with the missing name if the table has no such entry.
    fn byte_size_of(our_type: &str) -> usize {
        PROVENANCE
            .iter()
            .find(|entry| entry.our_type == our_type)
            .unwrap_or_else(|| panic!("no provenance entry named {:?}", our_type))
            .byte_size
    }

    #[test]
    fn all_aspects_covered() {
        let missing = verify_aspect_coverage();
        assert!(missing.is_empty(), "Missing audio aspects: {:?}", missing);
    }

    #[test]
    fn frame_budget_correct() {
        // AudioFrame (48) + PhaseDescriptor (4) = 52
        assert_eq!(FRAME_BUDGET, 48 + 4);
        // + RvqFrame (17) = 69
        assert_eq!(FRAME_BUDGET_WITH_TTS, 48 + 4 + 17);
    }

    #[test]
    fn provenance_byte_sizes_consistent() {
        // AudioFrame = 42 (energies) + 6 (pvq) = 48
        let audio_frame =
            byte_size_of("AudioFrame.band_energies") + byte_size_of("AudioFrame.pvq_summary");
        assert_eq!(audio_frame, 48);

        // RvqFrame = 1 (HEEL) + 8 (HIP) + 8 (TWIG) = 17
        let rvq_frame = byte_size_of("RvqFrame.archetype (HEEL)")
            + byte_size_of("RvqFrame.coarse (HIP)")
            + byte_size_of("RvqFrame.fine (TWIG)");
        assert_eq!(rvq_frame, 17);
    }

    #[test]
    fn every_source_codec_represented() {
        // All 6 source codecs should appear at least once
        for source in [
            CodecSource::Opus,
            CodecSource::Whisper,
            CodecSource::Mp3,
            CodecSource::OggVorbis,
            CodecSource::Bark,
            CodecSource::ElevenLabs,
        ] {
            assert!(
                PROVENANCE.iter().any(|p| p.source == source),
                "Codec {:?} not represented in provenance table",
                source
            );
        }
    }

    #[test]
    fn our_bitrate_competitive() {
        // Every "Ours" row must be below every reference row, not just
        // one hand-picked pair.
        let is_ours = |name: &str| name.starts_with("Ours");

        let ours_rows = BITRATE_COMPARISON.iter().filter(|row| is_ours(row.0)).count();
        let reference_rows = BITRATE_COMPARISON.len() - ours_rows;
        assert!(ours_rows > 0, "No \"Ours\" rows in BITRATE_COMPARISON");
        assert!(reference_rows > 0, "No reference codecs in BITRATE_COMPARISON");

        for &(our_name, our_bps, _) in BITRATE_COMPARISON.iter().filter(|row| is_ours(row.0)) {
            for &(name, bps, _) in BITRATE_COMPARISON.iter().filter(|row| !is_ours(row.0)) {
                assert!(
                    our_bps < bps,
                    "{} ({} bps) should be lower bitrate than {} ({} bps)",
                    our_name,
                    our_bps,
                    name,
                    bps
                );
            }
        }
    }
}
0 commit comments