Skip to content

Commit 1b0c10b

Browse files
larryliu0820 and claude authored
docs(voxtral_realtime): document CUDA Windows workflow (#17993)
Add CUDA-Windows instructions to the Voxtral Realtime README, including export prerequisites and an example command. Document Windows build steps via CMake workflow presets and add PowerShell run examples with and without the .ptd data file. Note recommended CUDA architectures for int4 kernels, and reformat voxtral_realtime CMake presets without changing behavior.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 89b938a commit 1b0c10b

3 files changed

Lines changed: 141 additions & 27 deletions

File tree

.ci/scripts/test_model_e2e_windows.ps1

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,8 @@ param(
1313
[Parameter(Mandatory = $true)]
1414
[string]$QuantName,
1515
[string]$ModelDir = ".",
16-
[string]$ExpectedCudaVersion = ""
16+
[string]$ExpectedCudaVersion = "",
17+
[string]$Mode = ""
1718
)
1819

1920
Set-StrictMode -Version Latest
@@ -25,6 +26,15 @@ if ($Device -ne "cuda-windows") {
2526
throw "Unsupported device '$Device'. Expected 'cuda-windows'."
2627
}
2728

29+
if ($Mode -ne "") {
30+
if ($Mode -notin @("vr-streaming", "vr-offline")) {
31+
throw "Unsupported mode '$Mode'. Supported modes: vr-streaming, vr-offline"
32+
}
33+
if ($HfModel -ne "mistralai/Voxtral-Mini-4B-Realtime-2602") {
34+
throw "Mode '$Mode' can only be used with Voxtral Realtime model"
35+
}
36+
}
37+
2838
Write-Host "Testing model: $HfModel (quantization: $QuantName)"
2939

3040
$resolvedModelDir = (Resolve-Path -Path $ModelDir).Path
@@ -79,7 +89,7 @@ switch ($HfModel) {
7989
$runnerTarget = "voxtral_realtime_runner"
8090
$runnerPath = "voxtral_realtime"
8191
$runnerPreset = "voxtral-realtime-cuda"
82-
$expectedOutput = "Loading audio from"
92+
$expectedOutput = "Quilter"
8393
$preprocessor = "preprocessor.pte"
8494
$tokenizerUrl = ""
8595
$tokenizerFile = "tekken.json"
@@ -207,6 +217,9 @@ try {
207217
"--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
208218
"--preprocessor_path", (Join-Path -Path $resolvedModelDir -ChildPath $preprocessor)
209219
)
220+
if ($Mode -ne "vr-offline") {
221+
$runnerArgs += "--streaming"
222+
}
210223
}
211224
}
212225

examples/models/voxtral_realtime/CMakePresets.json

Lines changed: 23 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,16 @@
1414
{
1515
"name": "voxtral-realtime-cpu",
1616
"displayName": "Voxtral Realtime runner (CPU)",
17-
"inherits": ["voxtral-realtime-base"]
17+
"inherits": [
18+
"voxtral-realtime-base"
19+
]
1820
},
1921
{
2022
"name": "voxtral-realtime-metal",
2123
"displayName": "Voxtral Realtime runner (Metal)",
22-
"inherits": ["voxtral-realtime-base"],
24+
"inherits": [
25+
"voxtral-realtime-base"
26+
],
2327
"cacheVariables": {
2428
"EXECUTORCH_BUILD_METAL": "ON"
2529
},
@@ -32,14 +36,19 @@
3236
{
3337
"name": "voxtral-realtime-cuda",
3438
"displayName": "Voxtral Realtime runner (CUDA)",
35-
"inherits": ["voxtral-realtime-base"],
39+
"inherits": [
40+
"voxtral-realtime-base"
41+
],
3642
"cacheVariables": {
3743
"EXECUTORCH_BUILD_CUDA": "ON"
3844
},
3945
"condition": {
4046
"type": "inList",
4147
"string": "${hostSystemName}",
42-
"list": ["Linux", "Windows"]
48+
"list": [
49+
"Linux",
50+
"Windows"
51+
]
4352
}
4453
}
4554
],
@@ -48,20 +57,26 @@
4857
"name": "voxtral-realtime-cpu",
4958
"displayName": "Build Voxtral Realtime runner (CPU)",
5059
"configurePreset": "voxtral-realtime-cpu",
51-
"targets": ["voxtral_realtime_runner"]
60+
"targets": [
61+
"voxtral_realtime_runner"
62+
]
5263
},
5364
{
5465
"name": "voxtral-realtime-metal",
5566
"displayName": "Build Voxtral Realtime runner (Metal)",
5667
"configurePreset": "voxtral-realtime-metal",
5768
"configuration": "Release",
58-
"targets": ["voxtral_realtime_runner"]
69+
"targets": [
70+
"voxtral_realtime_runner"
71+
]
5972
},
6073
{
6174
"name": "voxtral-realtime-cuda",
6275
"displayName": "Build Voxtral Realtime runner (CUDA)",
6376
"configurePreset": "voxtral-realtime-cuda",
64-
"targets": ["voxtral_realtime_runner"]
77+
"targets": [
78+
"voxtral_realtime_runner"
79+
]
6580
}
6681
],
6782
"workflowPresets": [
@@ -108,4 +123,4 @@
108123
]
109124
}
110125
]
111-
}
126+
}

examples/models/voxtral_realtime/README.md

Lines changed: 103 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ python export_voxtral_rt.py \
9797
| `xnnpack` ||| `4w`, `8w`, `8da4w`, `8da8w` |
9898
| `metal` ||| none (fp32) or `fpa4w` (Metal-specific 4-bit) |
9999
| `cuda` ||| `4w`, `8w` |
100+
| `cuda-windows` ||| `4w`, `8w` |
100101

101102
Metal backend provides Apple GPU acceleration. CUDA backend provides NVIDIA GPU
102103
acceleration via AOTInductor.
@@ -171,6 +172,54 @@ Alternatively, you can build torchao with Metal support while installing ExecuTo
171172
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
172173
```
173174

175+
### CUDA-Windows Export
176+
177+
Before running `cuda-windows` export, make sure these requirements are set up:
178+
- `x86_64-w64-mingw32-g++` is installed and on `PATH` (mingw-w64 cross-compiler).
179+
- `WINDOWS_CUDA_HOME` points to the extracted Windows CUDA package directory.
180+
181+
Example setup on Ubuntu (refer to [Parakeet README](../parakeet/README.md) for detailed extraction steps):
182+
183+
```bash
184+
# Ensure the WINDOWS_CUDA_HOME environment variable is set
185+
export WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
186+
```
187+
188+
Export the model for Windows CUDA (example with int4 quantization):
189+
190+
```bash
191+
python export_voxtral_rt.py \
192+
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
193+
--backend cuda-windows \
194+
--dtype bf16 \
195+
--output-dir ./voxtral_rt_exports \
196+
--qlinear-encoder 4w \
197+
--qlinear-encoder-packing-format tile_packed_to_4d \
198+
--qlinear 4w \
199+
--qlinear-packing-format tile_packed_to_4d \
200+
--qembedding 8w
201+
```
202+
203+
For streaming, add `--streaming`:
204+
205+
```bash
206+
python export_voxtral_rt.py \
207+
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
208+
--backend cuda-windows \
209+
--dtype bf16 \
210+
--streaming \
211+
--output-dir ./voxtral_rt_exports \
212+
--qlinear-encoder 4w \
213+
--qlinear-encoder-packing-format tile_packed_to_4d \
214+
--qlinear 4w \
215+
--qlinear-packing-format tile_packed_to_4d \
216+
--qembedding 8w
217+
```
218+
219+
Both offline and streaming exports generate:
220+
- `model.pte`
221+
- `aoti_cuda_blob.ptd`
222+
174223
### Options
175224

176225
| Flag | Default | Description |
@@ -228,6 +277,18 @@ make voxtral_realtime-metal
228277
This builds ExecuTorch with Metal backend support. The runner binary is at
229278
the same path as above. Metal exports can only run on macOS with Apple Silicon.
230279

280+
### CUDA-Windows
281+
282+
On Windows (PowerShell), use CMake workflow presets directly from the executorch root directory. If you exported the model with 4-bit quantization, you may need to set `CMAKE_CUDA_ARCHITECTURES` to include your GPU's compute capability before building (e.g., `80;86;89;90;120` covers Ampere, Ada Lovelace, Hopper, and Blackwell). The `int4mm` kernels require SM 80 or newer, and a build that does not target your GPU's architecture fails at runtime with "invalid device function" errors.
283+
284+
```powershell
285+
$env:CMAKE_CUDA_ARCHITECTURES="80;86;89;90;120"
286+
cmake --workflow --preset llm-release-cuda
287+
Push-Location examples/models/voxtral_realtime
288+
cmake --workflow --preset voxtral-realtime-cuda
289+
Pop-Location
290+
```
291+
231292
## Run
232293

233294
The runner requires:
@@ -237,35 +298,49 @@ The runner requires:
237298
- A 16kHz mono WAV audio file (or live audio via `--mic`)
238299
- For CUDA: `aoti_cuda_blob.ptd` — delegate data file (pass via `--data_path`)
239300

240-
```bash
241-
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
242-
--model_path voxtral_rt_exports/model.pte \
243-
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
244-
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
301+
### Windows (PowerShell)
302+
303+
```powershell
304+
.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe `
305+
--model_path voxtral_rt_exports\model.pte `
306+
--tokenizer_path C:\path\to\tekken.json `
307+
--preprocessor_path voxtral_rt_exports\preprocessor.pte `
245308
--audio_path input.wav
246309
```
247310

248311
For CUDA, include the `.ptd` data file:
249312

250-
```bash
251-
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
252-
--model_path voxtral_rt_exports/model.pte \
253-
--data_path voxtral_rt_exports/aoti_cuda_blob.ptd \
254-
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
255-
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
313+
```powershell
314+
.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe `
315+
--model_path voxtral_rt_exports\model.pte `
316+
--data_path voxtral_rt_exports\aoti_cuda_blob.ptd `
317+
--tokenizer_path C:\path\to\tekken.json `
318+
--preprocessor_path voxtral_rt_exports\preprocessor.pte `
256319
--audio_path input.wav
257320
```
258321

259322
For streaming, add `--streaming`. This requires a model exported with
260323
`--streaming`. The runner processes audio in 80ms steps, computing mel
261324
and running the encoder+decoder incrementally.
262325

263-
```bash
264-
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
265-
--model_path voxtral_rt_exports/model.pte \
266-
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
267-
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
268-
--audio_path input.wav \
326+
```powershell
327+
.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe `
328+
--model_path voxtral_rt_exports\model.pte `
329+
--tokenizer_path C:\path\to\tekken.json `
330+
--preprocessor_path voxtral_rt_exports\preprocessor.pte `
331+
--audio_path input.wav `
332+
--streaming
333+
```
334+
335+
For CUDA streaming, include the `.ptd` data file:
336+
337+
```powershell
338+
.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe `
339+
--model_path voxtral_rt_exports\model.pte `
340+
--data_path voxtral_rt_exports\aoti_cuda_blob.ptd `
341+
--tokenizer_path C:\path\to\tekken.json `
342+
--preprocessor_path voxtral_rt_exports\preprocessor.pte `
343+
--audio_path input.wav `
269344
--streaming
270345
```
271346

@@ -285,6 +360,17 @@ ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f f32le -nostats -loglevel error
285360

286361
Ctrl+C stops recording and flushes remaining text.
287362

363+
**Windows (PowerShell):**
364+
365+
```powershell
366+
.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe `
367+
--model_path C:\path\to\voxtral_rt_exports\model.pte `
368+
--data_path C:\path\to\voxtral_rt_exports\aoti_cuda_blob.ptd `
369+
--tokenizer_path C:\path\to\tekken.json `
370+
--preprocessor_path C:\path\to\voxtral_rt_exports\preprocessor.pte `
371+
--audio_path C:\path\to\input.wav
372+
```
373+
288374
**CUDA:** Add `--data_path voxtral_rt_exports/aoti_cuda_blob.ptd` to all
289375
run commands above when using the CUDA backend.
290376

0 commit comments

Comments
 (0)