Skip to content

Commit 2155ed9

Browse files
authored
Merge pull request #102 from AdaWorldAPI/claude/continue-lance-graph-ndarray-Ld786
feat(audio): complete TTS pipeline — mel, voice, modes, phase, synth + AMX SIGILL fix
2 parents a42d999 + 84dfae0 commit 2155ed9

9 files changed

Lines changed: 2308 additions & 22 deletions

File tree

.claude/AMX_GOTCHAS.md

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,18 +66,41 @@ For CPUID leaf 7 (AMX detection): use `__cpuid_count()`, not inline asm.
6666

6767
---
6868

69-
## Gotcha 4: OS must enable AMX via XSETBV
69+
## Gotcha 4: OS must enable AMX via XSETBV + process must request permission
7070

71-
AMX tiles are large (8 KB of state). The OS must opt in via XCR0 bits 17+18.
72-
Linux 5.19+ enables AMX by default. Older kernels: SIGILL on tile instructions.
71+
AMX tiles are large (8 KB of state). Two levels of OS enablement required:
72+
73+
1. **Kernel enables tile state in XCR0** (bits 17+18). Linux 5.19+ does this.
74+
2. **Process requests XCOMP_PERM** via `arch_prctl(ARCH_REQ_XCOMP_PERM, 18)`.
75+
Without this, LDTILECFG will SIGILL even if XCR0 bits are set.
7376

7477
**Detection (stable)**:
7578
```rust
76-
let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0);
77-
let tilecfg = (xcr0.eax >> 17) & 1; // bit 17 = XTILECFG
78-
let tiledata = (xcr0.eax >> 18) & 1; // bit 18 = XTILEDATA
79-
// Both must be 1
80-
```
79+
// Step 1: CPUID — does CPU support AMX?
80+
let cpuid = core::arch::x86_64::__cpuid_count(7, 0);
81+
let amx_tile = (cpuid.edx >> 24) & 1;
82+
let amx_int8 = (cpuid.edx >> 25) & 1;
83+
84+
// Step 2: OSXSAVE — does OS support XSAVE?
85+
let cpuid_01 = core::arch::x86_64::__cpuid(1);
86+
let osxsave = (cpuid_01.ecx >> 27) & 1;
87+
88+
// Step 3: _xgetbv(0) — did OS ACTUALLY enable tile state?
89+
// ⚠ Do NOT use __cpuid_count(0xD, 0) — that reports what CPU SUPPORTS,
90+
// not what the OS ENABLED. _xgetbv(0) reads the actual XCR0 register.
91+
let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) };
92+
let tilecfg = (xcr0 >> 17) & 1; // bit 17 = XTILECFG
93+
let tiledata = (xcr0 >> 18) & 1; // bit 18 = XTILEDATA
94+
95+
// Step 4: arch_prctl — request tile permission for this process
96+
// SYS_arch_prctl = 158 (x86_64), ARCH_REQ_XCOMP_PERM = 0x1023, XFEATURE_XTILEDATA = 18
97+
// Returns 0 on success, -errno on failure. Idempotent.
98+
```
99+
100+
**Previous bug**: `__cpuid_count(0xD, 0)` reports XSAVE state component bitmap
101+
(what the CPU *supports*), NOT the actual XCR0 value (what the OS *enabled*).
102+
On hypervisors that advertise AMX in CPUID but don't enable tile state,
103+
the old check returned `true` → SIGILL on LDTILECFG.
81104

82105
---
83106

src/hpc/audio/codec_map.rs

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
//! Codec provenance map: which real codec each primitive comes from.
2+
//!
3+
//! Every primitive in this audio stack was stolen from a production codec.
4+
//! Nothing invented — only transcoded and compressed to fit the HHTL cascade.
5+
//!
6+
//! ```text
7+
//! ┌─────────────┬──────────┬─────────┬────────┬─────────┬──────┬───────────┐
8+
//! │ Our type │ Opus │ Whisper │ MP3 │ Vorbis │ Bark │ ElevenLabs│
9+
//! ├─────────────┼──────────┼─────────┼────────┼─────────┼──────┼───────────┤
10+
//! │ MDCT │ CELT │ │ hybrid │ ✓ │ │ │
11+
//! │ 21 bands │ eBands48 │ │ 32 sub │ ✓ │ │ │
12+
//! │ PVQ shape │ CELT PVQ │ │ │ residue │ │ │
13+
//! │ Mel 80ch │ │ frontend│ │ │ │ │
14+
//! │ Phase 4B │ │ STFT ∠ │ │ │ │ │
15+
//! │ VoiceArch │ │ │ │ │ spk │ embedding │
16+
//! │ RvqFrame │ │ │ │ │ 3stg │ │
17+
//! │ OctaveBand │ │ │ ✓ │ floor │ │ │
18+
//! │ Mode │ │ │ │ │ │ emotion │
19+
//! │ HHTL skip │ │ │ mask │ floor │ │ │
20+
//! │ CompLinear │ │ │ │ VQ cb │ RVQ │ │
21+
//! │ Qualia17D │ (QPL) │ │ │ │ sem │ emotion │
22+
//! └─────────────┴──────────┴─────────┴────────┴─────────┴──────┴───────────┘
23+
//! ```
24+
//!
25+
//! The architecture replaces neural inference with graph search at every stage:
26+
//! MP3's psychoacoustic model → HHTL cascade (RouteAction::Skip)
27+
//! Whisper's transformer → phoneme graph shortest path
28+
//! Bark's 3 GPT-2 stages → 3 HHTL levels (HEEL/HIP/TWIG)
29+
//! Vorbis's codebook VQ → CompiledLinear VNNI palette lookup
30+
//! ElevenLabs' voice cloning → VoiceArchetype 16-byte embedding
31+
32+
/// Production codec (or model) that an audio primitive was derived from.
///
/// Used as the `source` field of a provenance record to say where a given
/// primitive's idea came from. The enum is fieldless and cheap to copy; it
/// also implements `Hash` so it can key a `HashMap`/`HashSet` when grouping
/// primitives by origin.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CodecSource {
    /// Opus (CELT layer: MDCT, eBands48 band layout, PVQ).
    Opus,
    /// Whisper (mel frontend, STFT).
    Whisper,
    /// MP3 (subband filterbank, psychoacoustic masking).
    Mp3,
    /// Ogg Vorbis (floor/residue, VQ codebooks).
    OggVorbis,
    /// Bark (three-stage token pipeline).
    Bark,
    /// ElevenLabs (speaker embedding, emotion).
    ElevenLabs,
}
45+
46+
/// The facet of an audio signal that a primitive is responsible for.
///
/// Each provenance record carries exactly one aspect. The enum is fieldless
/// and cheap to copy; it also implements `Hash` so aspects can be collected
/// into a `HashSet` or used as map keys (for example when checking coverage).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum AudioAspect {
    /// Spectral energy distribution (WHAT frequencies).
    SpectralEnvelope,
    /// Fine spectral shape within bands (HOW the energy is distributed).
    SpectralShape,
    /// Perceptual frequency mapping (WHERE in human hearing).
    PerceptualMapping,
    /// Temporal phase relationships (WHEN harmonics align).
    PhaseRelationship,
    /// Speaker identity (WHO is speaking).
    SpeakerIdentity,
    /// Semantic/emotional content (WHY it sounds that way).
    SemanticContent,
    /// Psychoacoustic masking (WHAT to skip).
    MaskingDecision,
    /// Codebook lookup (HOW to decompress).
    CodebookLookup,
}
66+
67+
/// Complete provenance record for one primitive.
68+
pub struct Provenance {
69+
pub our_type: &'static str,
70+
pub byte_size: usize,
71+
pub source: CodecSource,
72+
pub aspect: AudioAspect,
73+
pub source_concept: &'static str,
74+
pub what_it_replaces: &'static str,
75+
}
76+
77+
/// Full provenance table for every audio primitive.
78+
///
79+
/// This IS the design document. If a new primitive doesn't appear here,
80+
/// it wasn't stolen from a real codec and shouldn't exist.
81+
pub const PROVENANCE: &[Provenance] = &[
82+
// ═══ From Opus CELT ═══
83+
Provenance {
84+
our_type: "AudioFrame.band_energies",
85+
byte_size: 42,
86+
source: CodecSource::Opus,
87+
aspect: AudioAspect::SpectralEnvelope,
88+
source_concept: "eBands48 critical bands, gain in gain-shape split",
89+
what_it_replaces: "Per-coefficient quantization (MP3/Vorbis)",
90+
},
91+
Provenance {
92+
our_type: "AudioFrame.pvq_summary",
93+
byte_size: 6,
94+
source: CodecSource::Opus,
95+
aspect: AudioAspect::SpectralShape,
96+
source_concept: "PVQ (Pyramid Vector Quantization) pulse allocation",
97+
what_it_replaces: "Huffman-coded residuals (MP3) / VQ codebook (Vorbis)",
98+
},
99+
Provenance {
100+
our_type: "mdct_forward / mdct_backward",
101+
byte_size: 0, // transform, not stored
102+
source: CodecSource::Opus,
103+
aspect: AudioAspect::SpectralEnvelope,
104+
source_concept: "CELT MDCT: 960-sample window → 480 frequency bins",
105+
what_it_replaces: "FFT+windowing (all codecs use some form)",
106+
},
107+
108+
// ═══ From Whisper ═══
109+
Provenance {
110+
our_type: "mel::log_mel_spectrogram",
111+
byte_size: 160, // 80 × BF16 per frame
112+
source: CodecSource::Whisper,
113+
aspect: AudioAspect::PerceptualMapping,
114+
source_concept: "80-channel mel filterbank at 16kHz, Hann STFT",
115+
what_it_replaces: "Transformer encoder (150M params → 80 f32 per frame)",
116+
},
117+
118+
// ═══ From MP3 ═══
119+
Provenance {
120+
our_type: "HhtlCache::route() → Skip",
121+
byte_size: 0, // decision, not stored
122+
source: CodecSource::Mp3,
123+
aspect: AudioAspect::MaskingDecision,
124+
source_concept: "Psychoacoustic masking model (simultaneous + temporal)",
125+
what_it_replaces: "ISO 11172-3 psychoacoustic model 1/2 (iterative bit allocation)",
126+
},
127+
Provenance {
128+
our_type: "OctaveBand",
129+
byte_size: 13, // 3×f32 + u8
130+
source: CodecSource::Mp3,
131+
aspect: AudioAspect::SpectralEnvelope,
132+
source_concept: "32-subband polyphase filterbank (octave-spaced)",
133+
what_it_replaces: "Per-subband quantization + Huffman (MP3 granules)",
134+
},
135+
136+
// ═══ From Ogg Vorbis ═══
137+
Provenance {
138+
our_type: "CompiledLinear (ndarray burn)",
139+
byte_size: 65536, // 256 centroids × 256 dim
140+
source: CodecSource::OggVorbis,
141+
aspect: AudioAspect::CodebookLookup,
142+
source_concept: "VQ codebook: precomputed centroids, lookup-based decode",
143+
what_it_replaces: "Huffman trees (MP3) / arithmetic coding (Opus range coder)",
144+
},
145+
146+
// ═══ From Bark (Suno) ═══
147+
Provenance {
148+
our_type: "RvqFrame.archetype (HEEL)",
149+
byte_size: 1,
150+
source: CodecSource::Bark,
151+
aspect: AudioAspect::SemanticContent,
152+
source_concept: "Stage 1: GPT-2 semantic tokens (coarse meaning)",
153+
what_it_replaces: "350M-param GPT-2 autoregressive generation",
154+
},
155+
Provenance {
156+
our_type: "RvqFrame.coarse (HIP)",
157+
byte_size: 8,
158+
source: CodecSource::Bark,
159+
aspect: AudioAspect::SpectralEnvelope,
160+
source_concept: "Stage 2: GPT-2 coarse acoustic tokens (spectral envelope)",
161+
what_it_replaces: "350M-param GPT-2 conditioned on semantic tokens",
162+
},
163+
Provenance {
164+
our_type: "RvqFrame.fine (TWIG)",
165+
byte_size: 8,
166+
source: CodecSource::Bark,
167+
aspect: AudioAspect::SpectralShape,
168+
source_concept: "Stage 3: non-autoregressive fine acoustic tokens",
169+
what_it_replaces: "Fine model (smaller network, fills spectral detail)",
170+
},
171+
172+
// ═══ From ElevenLabs ═══
173+
Provenance {
174+
our_type: "VoiceArchetype",
175+
byte_size: 16,
176+
source: CodecSource::ElevenLabs,
177+
aspect: AudioAspect::SpeakerIdentity,
178+
source_concept: "Speaker embedding (voice cloning conditioning vector)",
179+
what_it_replaces: "512-dim speaker embedding (2KB → 16 bytes)",
180+
},
181+
182+
// ═══ Phase (novel — no codec stores this) ═══
183+
Provenance {
184+
our_type: "PhaseDescriptor",
185+
byte_size: 4,
186+
source: CodecSource::Whisper, // closest: Whisper STFT preserves phase internally
187+
aspect: AudioAspect::PhaseRelationship,
188+
source_concept: "STFT phase (discarded by all codecs except Griffin-Lim)",
189+
what_it_replaces: "Nothing — all codecs discard phase. We keep it as relative pressure.",
190+
},
191+
192+
// ═══ Qualia (novel — derived from QPL musical calibration) ═══
193+
Provenance {
194+
our_type: "Qualia17D",
195+
byte_size: 68,
196+
source: CodecSource::Bark, // closest: Bark semantic tokens carry meaning
197+
aspect: AudioAspect::SemanticContent,
198+
source_concept: "QPL: Octave→arousal, Fifth→valence, Third→warmth, Tritone→tension",
199+
what_it_replaces: "No codec captures nonverbal meaning explicitly. This is the grid.",
200+
},
201+
];
202+
203+
/// Bytes in one `AudioFrame`: 42 (band energies) + 6 (PVQ summary).
const AUDIO_FRAME_BYTES: usize = 42 + 6;
/// Bytes in one `PhaseDescriptor`.
const PHASE_DESCRIPTOR_BYTES: usize = 4;
/// Bytes in one `RvqFrame`: 1 (HEEL) + 8 (HIP) + 8 (TWIG).
const RVQ_FRAME_BYTES: usize = 1 + 8 + 8;

/// Total bytes for one complete frame (all per-frame primitives combined).
///
/// AudioFrame (48) + PhaseDescriptor (4) = 52 bytes per frame for complete
/// nonverbal characterization. The 16-byte VoiceArchetype is amortized
/// across frames and is therefore not part of the per-frame total.
///
/// Compare (bitrate × frame duration ÷ 8):
///   MP3 128kbps:  ~417 bytes per 26ms frame
///   Opus 64kbps:  ~160 bytes per 20ms frame
///   Bark tokens:  ~128 bytes per frame
///   Ours:         52-69 bytes per frame (complete, including phase + identity)
pub const FRAME_BUDGET: usize = AUDIO_FRAME_BYTES + PHASE_DESCRIPTOR_BYTES;

/// Per-frame budget including the HHTL-compressed TTS output:
/// `FRAME_BUDGET` (52) + RvqFrame (17) = 69 bytes.
pub const FRAME_BUDGET_WITH_TTS: usize = FRAME_BUDGET + RVQ_FRAME_BYTES;
216+
217+
/// Rough bitrate comparison between reference codecs and this stack.
///
/// Every row is `(label, bits_per_second, note)`. The numbers are only
/// approximate: this codec loses information in a fundamentally different
/// way (palette quantization) than the psychoacoustic-masking codecs it is
/// listed next to.
pub const BITRATE_COMPARISON: &[(&str, u32, &str)] = &[
    // Reference codecs.
    (
        "MP3 128k",
        128_000,
        "psychoacoustic masking, Huffman",
    ),
    (
        "Opus 64k",
        64_000,
        "CELT+SILK hybrid, range coder",
    ),
    (
        "Vorbis 128k",
        128_000,
        "MDCT, floor+residue, VQ codebook",
    ),
    (
        "Bark tokens",
        25_600,
        "3-stage RVQ, ~100 tokens/sec × 256 bits",
    ),
    // This stack, at two frame rates.
    (
        "Ours (48kHz)",
        20_800,
        "52 bytes × 50 fps × 8 bits = 20.8 kbps",
    ),
    (
        "Ours (24kHz)",
        10_400,
        "52 bytes × 25 fps × 8 bits = 10.4 kbps",
    ),
];
229+
230+
/// Verify every AudioAspect is covered by at least one primitive.
231+
/// If an aspect is missing, we have a hole in our codec design.
232+
pub fn verify_aspect_coverage() -> Vec<AudioAspect> {
233+
use AudioAspect::*;
234+
let all = [SpectralEnvelope, SpectralShape, PerceptualMapping,
235+
PhaseRelationship, SpeakerIdentity, SemanticContent,
236+
MaskingDecision, CodebookLookup];
237+
238+
all.iter()
239+
.filter(|&&aspect| !PROVENANCE.iter().any(|p| p.aspect == aspect))
240+
.copied()
241+
.collect()
242+
}
243+
244+
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the `byte_size` of the provenance row labelled `our_type`,
    /// panicking with the missing label if the table has no such row.
    fn byte_size_of(our_type: &str) -> usize {
        PROVENANCE
            .iter()
            .find(|p| p.our_type == our_type)
            .unwrap_or_else(|| panic!("no provenance entry named {:?}", our_type))
            .byte_size
    }

    /// True for the rows of `BITRATE_COMPARISON` that describe this codec.
    fn is_ours(label: &str) -> bool {
        label.starts_with("Ours")
    }

    #[test]
    fn all_aspects_covered() {
        let missing = verify_aspect_coverage();
        assert!(missing.is_empty(), "Missing audio aspects: {:?}", missing);
    }

    #[test]
    fn frame_budget_correct() {
        // AudioFrame (48) + PhaseDescriptor (4) = 52
        assert_eq!(FRAME_BUDGET, 48 + 4);
        // + RvqFrame (17) = 69
        assert_eq!(FRAME_BUDGET_WITH_TTS, 48 + 4 + 17);
    }

    #[test]
    fn provenance_byte_sizes_consistent() {
        // AudioFrame = 42 (energies) + 6 (pvq) = 48
        let audio_frame =
            byte_size_of("AudioFrame.band_energies") + byte_size_of("AudioFrame.pvq_summary");
        assert_eq!(audio_frame, 48);

        // RvqFrame = 1 (HEEL) + 8 (HIP) + 8 (TWIG) = 17
        let rvq_frame = byte_size_of("RvqFrame.archetype (HEEL)")
            + byte_size_of("RvqFrame.coarse (HIP)")
            + byte_size_of("RvqFrame.fine (TWIG)");
        assert_eq!(rvq_frame, 17);
    }

    #[test]
    fn every_source_codec_represented() {
        // All 6 source codecs should appear at least once.
        for source in [
            CodecSource::Opus,
            CodecSource::Whisper,
            CodecSource::Mp3,
            CodecSource::OggVorbis,
            CodecSource::Bark,
            CodecSource::ElevenLabs,
        ] {
            assert!(
                PROVENANCE.iter().any(|p| p.source == source),
                "Codec {:?} not represented in provenance table",
                source
            );
        }
    }

    #[test]
    fn our_bitrate_competitive() {
        // Every "Ours" row must be lower bitrate than every other row in the
        // comparison table, not just the 24 kHz row against MP3.
        let ours_count = BITRATE_COMPARISON.iter().filter(|row| is_ours(row.0)).count();
        let others_count = BITRATE_COMPARISON.len() - ours_count;
        assert!(ours_count > 0, "No \"Ours\" rows in BITRATE_COMPARISON");
        assert!(others_count > 0, "No reference codecs in BITRATE_COMPARISON");

        for &(our_name, our_bps, _) in BITRATE_COMPARISON.iter().filter(|row| is_ours(row.0)) {
            for &(other_name, other_bps, _) in
                BITRATE_COMPARISON.iter().filter(|row| !is_ours(row.0))
            {
                assert!(
                    our_bps < other_bps,
                    "{} ({} bps) should be lower bitrate than {} ({} bps)",
                    our_name,
                    our_bps,
                    other_name,
                    other_bps
                );
            }
        }
    }
}

0 commit comments

Comments
 (0)