Skip to content

Commit 079545e

Browse files
committed
Add Voxtral Realtime Windows WPF application
1 parent 43eda77 commit 079545e

59 files changed

Lines changed: 4818 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# System files
22
.DS_Store
3+
*.png
4+
5+
# IDE / editor settings
6+
.claude/
7+
.vs/
38

49
# Python environment
510
.venv/
@@ -37,3 +42,30 @@ local.properties
3742
node_modules/
3843
__pycache__/
3944
.cursor/
45+
46+
# .NET / C# / Visual Studio
47+
[Bb]in/
48+
[Oo]bj/
49+
[Dd]ebug/
50+
[Rr]elease/
51+
*.dll
52+
*.exe
53+
*.pdb
54+
*.cache
55+
*.baml
56+
*.resources
57+
*.user
58+
*.suo
59+
*.nupkg
60+
*.snupkg
61+
*.g.cs
62+
*.g.resources
63+
*.AssemblyInfo.cs
64+
*.editorconfig
65+
project.assets.json
66+
project.nuget.cache
67+
*.nuget.dgspec.json
68+
*.nuget.g.props
69+
*.nuget.g.targets
70+
publish/
71+
.vs/

WINDOWS_APP_SKILLS.md

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
Windows App Developer
2+
Purpose
3+
Provides expertise in building modern Windows desktop applications using WinUI 3, WPF, and Windows App SDK. Specializes in XAML-based UI development, MVVM architecture, native Windows integration, and modern packaging with MSIX.
4+
5+
When to Use
6+
Building Windows desktop applications with WinUI 3 or WPF
7+
Implementing MVVM architecture for Windows apps
8+
Creating XAML layouts and custom controls
9+
Packaging applications with MSIX
10+
Integrating with Windows features (notifications, taskbar, system tray)
11+
Migrating WPF applications to WinUI 3
12+
Implementing Windows-specific features (jump lists, live tiles)
13+
Building Microsoft Store-ready applications
14+
Quick Start
15+
Invoke this skill when:
16+
17+
Building Windows desktop applications with WinUI 3 or WPF
18+
Implementing MVVM architecture for Windows apps
19+
Creating XAML layouts and custom controls
20+
Packaging applications with MSIX
21+
Integrating with Windows features (notifications, taskbar)
22+
Do NOT invoke when:
23+
24+
Building cross-platform apps → use mobile-developer or electron-pro
25+
Console applications → use appropriate language skill
26+
PowerShell GUI → use powershell-ui-architect
27+
Web applications → use appropriate web skill
28+
Decision Framework
29+
Windows App Task?
30+
├── New Modern App → WinUI 3 with Windows App SDK
31+
├── Existing WPF App → Maintain or migrate to WinUI 3
32+
├── Cross-Platform Priority → Consider .NET MAUI
33+
├── Enterprise Internal → WPF with proven patterns
34+
├── Store Distribution → MSIX packaging required
35+
└── System Integration → P/Invoke or Windows SDK APIs
36+
Core Workflows
37+
1. WinUI 3 Application Setup
38+
Create project using Windows App SDK template
39+
Configure Package.appxmanifest for capabilities
40+
Set up MVVM infrastructure (CommunityToolkit.Mvvm)
41+
Implement navigation and shell structure
42+
Create reusable control library
43+
Configure MSIX packaging
44+
Set up CI/CD for Store or sideload distribution
45+
2. MVVM Implementation
46+
Define ViewModels with observable properties
47+
Implement commands for user actions
48+
Create services for data and business logic
49+
Set up dependency injection container
50+
Bind Views to ViewModels in XAML
51+
Implement navigation service
52+
Add design-time data for XAML preview
53+
3. MSIX Packaging
54+
Configure Package.appxmanifest
55+
Define application identity and capabilities
56+
Set up visual assets (icons, splash)
57+
Configure installation behavior
58+
Sign package with certificate
59+
Test installation and updates
60+
Submit to Microsoft Store or deploy internally
61+
Best Practices
62+
Use WinUI 3 for new development, WPF for legacy maintenance
63+
Implement MVVM strictly for testability and separation
64+
Use x:Bind for compile-time binding validation
65+
Leverage Community Toolkit for common patterns
66+
Package with MSIX for modern installation experience
67+
Follow Fluent Design System for consistent UX
68+
Anti-Patterns
69+
Code-behind logic → Move to ViewModels
70+
Synchronous UI operations → Use async/await for I/O
71+
Direct service calls from Views → Go through ViewModels
72+
Ignoring DPI awareness → Test at multiple scale factors
73+
Missing capabilities → Declare required capabilities in manifest

voxtral_realtime/windows/README.md

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Voxtral Realtime - Windows CUDA
2+
3+
Real-time speech transcription desktop app powered by ExecuTorch with CUDA acceleration.
4+
5+
This is the Windows equivalent of the [macOS Voxtral Realtime app](../macos/).
6+
7+
## Quick Start (Pre-built Release)
8+
9+
Download `VoxtralRealtime.exe` from the [Releases](https://github.com/meta-pytorch/executorch-examples/releases) page and run it directly. No installation required.
10+
11+
You also need:
12+
- The `voxtral_realtime_runner.exe` (built from ExecuTorch with CUDA support)
13+
- Model files from HuggingFace (see [Model Files](#model-files) below)
14+
15+
## Prerequisites
16+
17+
- Windows 10/11 with NVIDIA GPU (CUDA-capable)
18+
- CUDA Toolkit installed (auto-detected from `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\`)
19+
- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) (only for building from source)
20+
21+
## Model Files
22+
23+
Download from HuggingFace:
24+
25+
```powershell
26+
pip install huggingface_hub
27+
huggingface-cli download younghan-meta/Voxtral-Mini-4B-Realtime-2602-ExecuTorch-CUDA --local-dir voxtral_rt_exports
28+
```
29+
30+
This downloads: `model.pte`, `preprocessor.pte`, `aoti_cuda_blob.ptd`
31+
32+
You also need the tokenizer from the base model:
33+
34+
```powershell
35+
huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602 tekken.json --local-dir voxtral_tokenizer
36+
```
37+
38+
## Building the Runner
39+
40+
Build the `voxtral_realtime_runner.exe` from the ExecuTorch repo:
41+
42+
```bash
43+
cd executorch
44+
cmake --preset voxtral-realtime-cuda
45+
cmake --build --preset voxtral-realtime-cuda
46+
```
47+
48+
The runner will be at `cmake-out/examples/models/voxtral_realtime/Release/voxtral_realtime_runner.exe`.
49+
50+
## Build from Source
51+
52+
```powershell
53+
# Install .NET SDK if not already installed
54+
winget install Microsoft.DotNet.SDK.8
55+
56+
# Build
57+
cd VoxtralRealtime
58+
dotnet restore
59+
dotnet build --configuration Release
60+
61+
# Run
62+
dotnet run --project VoxtralRealtime --configuration Release
63+
```
64+
65+
## Publish Standalone Executable
66+
67+
Create a single self-contained exe (no .NET runtime required on target machine):
68+
69+
```powershell
70+
cd VoxtralRealtime
71+
dotnet publish VoxtralRealtime --configuration Release --runtime win-x64 --self-contained true /p:PublishSingleFile=true /p:IncludeNativeLibrariesForSelfExtract=true /p:DebugType=none -o publish
72+
```
73+
74+
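The output `publish\VoxtralRealtime.exe` can be distributed and run on any Windows x64 machine without installing the .NET runtime. The machine still needs a CUDA-capable NVIDIA GPU, the `voxtral_realtime_runner.exe` binary, and the model files described above.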
75+
76+
## Configuration
77+
78+
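On first launch, the app auto-loads the model from default paths. Note that the defaults below do not match the `--local-dir` values used in the download commands above (`voxtral_rt_exports` and `voxtral_tokenizer`), so either place the files at the default locations or update the paths. All paths are configurable in Settings: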
79+
80+
| File | Default Path |
81+
|------|-------------|
82+
| Runner | `cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe` |
83+
| Model | `voxtral_rt_exports_wsl\model.pte` |
84+
| Preprocessor | `voxtral_rt_exports_wsl\preprocessor.pte` |
85+
| CUDA blob | `voxtral_rt_exports_wsl\aoti_cuda_blob.ptd` |
86+
| Tokenizer | `tekken.json` |
87+
88+
## Features
89+
90+
- **Live Transcription** - Start/Pause/Resume with streaming text output
91+
- **Session Management** - Save, rename, pin, delete, and export sessions (TXT/JSON/SRT)
92+
- **Text Replacements** - Auto-correct transcription (e.g., "executorch" -> "ExecuTorch")
93+
- **Text Snippets** - Voice-triggered templates for common text blocks
94+
- **Dictation Mode** - Ctrl+Space global hotkey, floating overlay, auto-paste to any app, auto-stop on 2s silence
95+
- **Audio Level Visualization** - Real-time waveform display
96+
97+
## Keyboard Shortcuts
98+
99+
| Shortcut | Action |
100+
|----------|--------|
101+
| Ctrl+Shift+R | Start / Resume transcription |
102+
| Ctrl+. | Pause transcription |
103+
| Ctrl+Enter | End session |
104+
| Ctrl+Space | Toggle dictation mode |
105+
106+
## Architecture
107+
108+
The app wraps the `voxtral_realtime_runner.exe` C++ binary via stdin/stdout pipes:
109+
110+
```
111+
Microphone (WASAPI 48kHz) -> Resample to 16kHz mono f32le -> stdin pipe -> runner.exe -> stdout tokens -> WPF UI
112+
```
113+
114+
Platform-specific adaptations from the macOS app:
115+
- **Audio**: WASAPI (shared mode) replaces AVAudioEngine, with software resampling from native rate to 16kHz
116+
- **Hotkey**: Win32 `RegisterHotKey` replaces Carbon `RegisterEventHotKey`
117+
- **UI**: WPF (.NET 8) with MVVM replaces SwiftUI
118+
- **Backend**: `--data_path` for CUDA blob (macOS uses Metal backend)
119+
- **Stdin**: Binary mode (`_setmode`) required on Windows to prevent 0x1A byte (Ctrl+Z) from being interpreted as EOF
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.0.31903.59
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VoxtralRealtime", "VoxtralRealtime\VoxtralRealtime.csproj", "{A1B2C3D4-E5F6-7890-ABCD-EF1234567890}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{A1B2C3D4-E5F6-7890-ABCD-EF1234567890}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{A1B2C3D4-E5F6-7890-ABCD-EF1234567890}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{A1B2C3D4-E5F6-7890-ABCD-EF1234567890}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{A1B2C3D4-E5F6-7890-ABCD-EF1234567890}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
EndGlobal
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<!-- Copyright (c) Meta Platforms, Inc. and affiliates. -->
2+
<!-- All rights reserved. -->
3+
<!-- This source code is licensed under the BSD-style license found in the -->
4+
<!-- LICENSE file in the root directory of this source tree. -->
5+
6+
<Application x:Class="VoxtralRealtime.App"
7+
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
8+
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
9+
StartupUri="Views/MainWindow.xaml">
10+
<Application.Resources>
11+
<ResourceDictionary>
12+
<ResourceDictionary.MergedDictionaries>
13+
<ResourceDictionary Source="Resources/Styles.xaml"/>
14+
</ResourceDictionary.MergedDictionaries>
15+
</ResourceDictionary>
16+
</Application.Resources>
17+
</Application>
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
using System.Runtime.InteropServices;
7+
using System.Windows;
8+
using VoxtralRealtime.Services;
9+
using VoxtralRealtime.ViewModels;
10+
11+
namespace VoxtralRealtime;
12+
13+
/// <summary>
/// WPF application class. At startup it constructs the application-wide view models
/// and services and publishes them through static properties; at exit it tears
/// them down again.
/// </summary>
public partial class App : Application
{
    [DllImport("kernel32.dll")]
    private static extern bool AttachConsole(int dwProcessId);

    // Win32 sentinel value for AttachConsole meaning "the console of the parent process".
    private const int ATTACH_PARENT_PROCESS = -1;

    // The properties below are declared non-nullable but are only assigned in
    // OnStartup, so they are null until OnStartup has run to completion.
    public static SettingsViewModel Settings { get; private set; } = null!;
    public static ReplacementStoreViewModel ReplacementStore { get; private set; } = null!;
    public static SnippetStoreViewModel SnippetStore { get; private set; } = null!;
    public static TextPipeline TextPipeline { get; private set; } = null!;
    public static TranscriptStoreViewModel Store { get; private set; } = null!;
    public static DictationViewModel Dictation { get; private set; } = null!;
    public static GlobalHotkeyService HotkeyService { get; private set; } = null!;

    /// <summary>
    /// Attaches to the parent console, logs a startup message, and builds the shared
    /// view models and services in dependency order (each object is created before
    /// the objects whose constructors receive it).
    /// </summary>
    /// <param name="e">Startup arguments supplied by WPF.</param>
    protected override void OnStartup(StartupEventArgs e)
    {
        base.OnStartup(e);

        // Attach to parent console so Console.WriteLine shows in the terminal.
        // The return value is not checked, so a failed attach is silently tolerated.
        AttachConsole(ATTACH_PARENT_PROCESS);
        AppLogger.Log("App", "Voxtral Realtime starting up");

        Settings = new SettingsViewModel();
        ReplacementStore = new ReplacementStoreViewModel();
        SnippetStore = new SnippetStoreViewModel();
        TextPipeline = new TextPipeline(ReplacementStore, SnippetStore);
        Store = new TranscriptStoreViewModel(Settings, TextPipeline);
        HotkeyService = new GlobalHotkeyService();
        Dictation = new DictationViewModel(Store, Settings, HotkeyService);
    }

    /// <summary>
    /// Cleans up the dictation view model and shuts down the transcript store, then
    /// forwards to the base implementation.
    /// </summary>
    /// <param name="e">Exit arguments supplied by WPF.</param>
    protected override void OnExit(ExitEventArgs e)
    {
        // Each step runs even if the previous one throws, and a partially completed
        // startup (properties still null) does not turn shutdown into a
        // NullReferenceException. Any exception from a cleanup step still propagates.
        try
        {
            try
            {
                Dictation?.Cleanup();
            }
            finally
            {
                Store?.Shutdown();
            }
        }
        finally
        {
            base.OnExit(e);
        }
    }
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
using System.Globalization;
7+
using System.Windows;
8+
using System.Windows.Data;
9+
10+
namespace VoxtralRealtime.Converters;
11+
12+
/// <summary>
/// Converts a boolean to a <see cref="Visibility"/>: <c>true</c> maps to
/// <see cref="Visibility.Visible"/> and anything else to <see cref="Visibility.Collapsed"/>.
/// Passing the converter parameter <c>"Invert"</c> flips the mapping in both directions.
/// </summary>
public class BoolToVisibilityConverter : IValueConverter
{
    /// <summary>
    /// Maps a boolean source value to a visibility.
    /// </summary>
    /// <param name="value">Source value; any non-boolean value is treated as <c>false</c>.</param>
    /// <param name="targetType">Unused.</param>
    /// <param name="parameter">Pass <c>"Invert"</c> to flip the result; any other value is ignored.</param>
    /// <param name="culture">Unused.</param>
    /// <returns>A boxed <see cref="Visibility"/> (Visible or Collapsed).</returns>
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        bool visible = value is true;
        if (IsInverted(parameter))
        {
            visible = !visible;
        }

        return visible ? Visibility.Visible : Visibility.Collapsed;
    }

    /// <summary>
    /// Maps a visibility back to a boolean, honouring the same <c>"Invert"</c>
    /// parameter as <see cref="Convert"/> so that a two-way binding round-trips.
    /// </summary>
    /// <param name="value">Target value; anything other than <see cref="Visibility.Visible"/> counts as not visible.</param>
    /// <param name="targetType">Unused.</param>
    /// <param name="parameter">Pass <c>"Invert"</c> to flip the result; any other value is ignored.</param>
    /// <param name="culture">Unused.</param>
    /// <returns>A boxed <see cref="bool"/>.</returns>
    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
    {
        bool visible = value is Visibility.Visible;

        // Without this, an inverted two-way binding would write the un-inverted
        // value back to the source.
        return IsInverted(parameter) ? !visible : visible;
    }

    // True when the converter parameter's string form is exactly "Invert".
    private static bool IsInverted(object parameter) => parameter?.ToString() == "Invert";
}
27+
28+
/// <summary>
/// Negates a boolean in both binding directions. A value that is not a boolean
/// yields <c>false</c>.
/// </summary>
public class InverseBoolConverter : IValueConverter
{
    /// <summary>Returns <c>true</c> only when <paramref name="value"/> is the boolean <c>false</c>.</summary>
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
        => Negate(value);

    /// <summary>Same mapping as <see cref="Convert"/>; negation is its own inverse.</summary>
    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
        => Negate(value);

    // A boxed 'false' is the only input that produces 'true'.
    private static object Negate(object value) => value is false;
}
40+
41+
/// <summary>
/// Shows an element when the bound value is present and collapses it when the value
/// is <c>null</c> or an empty string. The converter parameter <c>"Invert"</c>
/// reverses this. One-way only.
/// </summary>
public class NullToVisibilityConverter : IValueConverter
{
    /// <summary>
    /// Maps presence or absence of <paramref name="value"/> to a visibility.
    /// </summary>
    /// <param name="value">Source value; <c>null</c> and <c>""</c> both count as missing.</param>
    /// <param name="targetType">Unused.</param>
    /// <param name="parameter">Pass <c>"Invert"</c> to show the element when the value is missing instead.</param>
    /// <param name="culture">Unused.</param>
    /// <returns>A boxed <see cref="Visibility"/> (Visible or Collapsed).</returns>
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        bool showWhenMissing = parameter?.ToString() == "Invert";

        bool missing = value switch
        {
            null => true,
            string text => text.Length == 0,
            _ => false,
        };

        // Visible exactly when the value's state matches the state we want to show.
        return missing == showWhenMissing ? Visibility.Visible : Visibility.Collapsed;
    }

    /// <summary>Not supported; always throws <see cref="NotImplementedException"/>.</summary>
    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
        => throw new NotImplementedException();
}
57+
58+
/// <summary>
/// Shows an element only when the string form of the bound value equals the string
/// form of the converter parameter. One-way only.
/// </summary>
public class EnumToVisibilityConverter : IValueConverter
{
    /// <summary>
    /// Compares <paramref name="value"/> and <paramref name="parameter"/> by their
    /// <c>ToString()</c> results.
    /// </summary>
    /// <param name="value">Source value.</param>
    /// <param name="targetType">Unused.</param>
    /// <param name="parameter">The value to match against.</param>
    /// <param name="culture">Unused.</param>
    /// <returns>
    /// <see cref="Visibility.Visible"/> on a match; <see cref="Visibility.Collapsed"/>
    /// on a mismatch or when either argument is <c>null</c>.
    /// </returns>
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        bool matches = value is not null
            && parameter is not null
            && value.ToString() == parameter.ToString();

        if (matches)
        {
            return Visibility.Visible;
        }

        return Visibility.Collapsed;
    }

    /// <summary>Not supported; always throws <see cref="NotImplementedException"/>.</summary>
    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
        => throw new NotImplementedException();
}

0 commit comments

Comments
 (0)