Skip to content

Commit df517d3

Browse files
author
jack
committed
feat: add voice dictation with OpenAI Whisper
- Add microphone button that appears when OpenAI is configured
- Implement real-time waveform visualization during recording
- Add backend /audio/transcribe endpoint with security measures:
  - 25MB file size limit with 413 status code
  - 30-second timeout for API calls
  - Proper authentication via X-Secret-Key
- Add visual feedback during transcription
- Show recording duration and estimated file size
- Warn users when approaching 25MB limit
- Auto-stop recording at 10 minutes or 25MB
- Add comprehensive integration tests
- Fix ESLint configuration and MessageCopyLink warning

Security: API keys remain backend-only, no frontend exposure
1 parent 19befb6 commit df517d3

12 files changed

Lines changed: 834 additions & 40 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/goose-server/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ serde_yaml = "0.9.34"
3737
axum-extra = "0.10.0"
3838
utoipa = { version = "4.1", features = ["axum_extras", "chrono"] }
3939
dirs = "6.0.0"
40-
reqwest = { version = "0.12.9", features = ["json", "rustls-tls", "blocking"], default-features = false }
40+
reqwest = { version = "0.12.9", features = ["json", "rustls-tls", "blocking", "multipart"], default-features = false }
4141

4242
[[bin]]
4343
name = "goosed"
Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
/// Audio transcription route handler
2+
///
3+
/// This module provides endpoints for audio transcription using OpenAI's Whisper API.
4+
/// The OpenAI API key must be configured in the backend for this to work.
5+
use super::utils::verify_secret_key;
6+
use crate::state::AppState;
7+
use axum::{
8+
extract::State,
9+
http::{HeaderMap, StatusCode},
10+
routing::post,
11+
Json, Router,
12+
};
13+
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
14+
use reqwest::Client;
15+
use serde::{Deserialize, Serialize};
16+
use std::sync::Arc;
17+
use std::time::Duration;
18+
19+
// Constants
// 25MB cap, enforced before forwarding to OpenAI (matches Whisper's documented upload limit);
// oversized payloads are rejected with 413.
const MAX_AUDIO_SIZE_BYTES: usize = 25 * 1024 * 1024; // 25MB
// Upper bound on the round-trip to the OpenAI API; a timeout maps to 504.
const OPENAI_TIMEOUT_SECONDS: u64 = 30;

/// JSON body accepted by POST /audio/transcribe.
#[derive(Debug, Deserialize)]
struct TranscribeRequest {
    audio: String, // Base64 encoded audio data
    mime_type: String, // e.g. "audio/webm" — selects the file extension sent to Whisper
}

/// JSON body returned to the client on successful transcription.
#[derive(Debug, Serialize)]
struct TranscribeResponse {
    text: String, // transcribed text from Whisper
}

/// The subset of the OpenAI Whisper API response we deserialize
/// (response_format=json returns at least a `text` field).
#[derive(Debug, Deserialize)]
struct WhisperResponse {
    text: String,
}
38+
39+
/// Transcribe audio using OpenAI's Whisper API
40+
///
41+
/// # Request
42+
/// - `audio`: Base64 encoded audio data
43+
/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
44+
///
45+
/// # Response
46+
/// - `text`: Transcribed text from the audio
47+
///
48+
/// # Errors
49+
/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
50+
/// - 412: Precondition Failed (OpenAI API key not configured)
51+
/// - 400: Bad Request (invalid base64 audio data)
52+
/// - 413: Payload Too Large (audio file exceeds 25MB limit)
53+
/// - 415: Unsupported Media Type (unsupported audio format)
54+
/// - 502: Bad Gateway (OpenAI API error)
55+
/// - 503: Service Unavailable (network error)
56+
async fn transcribe_handler(
57+
State(state): State<Arc<AppState>>,
58+
headers: HeaderMap,
59+
Json(request): Json<TranscribeRequest>,
60+
) -> Result<Json<TranscribeResponse>, StatusCode> {
61+
verify_secret_key(&headers, &state)?;
62+
63+
// Get the OpenAI API key from config
64+
let config = goose::config::Config::global();
65+
let api_key: String = config
66+
.get_secret("OPENAI_API_KEY")
67+
.map_err(|_| StatusCode::PRECONDITION_FAILED)?;
68+
69+
// Decode the base64 audio data
70+
let audio_bytes = BASE64
71+
.decode(&request.audio)
72+
.map_err(|_| StatusCode::BAD_REQUEST)?;
73+
74+
// Check file size
75+
if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
76+
tracing::warn!(
77+
"Audio file too large: {} bytes (max: {} bytes)",
78+
audio_bytes.len(),
79+
MAX_AUDIO_SIZE_BYTES
80+
);
81+
return Err(StatusCode::PAYLOAD_TOO_LARGE);
82+
}
83+
84+
// Determine file extension based on MIME type
85+
let file_extension = match request.mime_type.as_str() {
86+
"audio/webm" => "webm",
87+
"audio/mp4" => "mp4",
88+
"audio/mpeg" => "mp3",
89+
"audio/mpga" => "mpga",
90+
"audio/m4a" => "m4a",
91+
"audio/wav" => "wav",
92+
"audio/x-wav" => "wav",
93+
_ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
94+
};
95+
96+
// Create a multipart form with the audio file
97+
let part = reqwest::multipart::Part::bytes(audio_bytes)
98+
.file_name(format!("audio.{}", file_extension))
99+
.mime_str(&request.mime_type)
100+
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
101+
102+
let form = reqwest::multipart::Form::new()
103+
.part("file", part)
104+
.text("model", "whisper-1")
105+
.text("response_format", "json");
106+
107+
// Make request to OpenAI Whisper API
108+
let client = Client::builder()
109+
.timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
110+
.build()
111+
.map_err(|e| {
112+
tracing::error!("Failed to create HTTP client: {}", e);
113+
StatusCode::INTERNAL_SERVER_ERROR
114+
})?;
115+
116+
let response = client
117+
.post("https://api.openai.com/v1/audio/transcriptions")
118+
.header("Authorization", format!("Bearer {}", api_key))
119+
.multipart(form)
120+
.send()
121+
.await
122+
.map_err(|e| {
123+
if e.is_timeout() {
124+
tracing::error!(
125+
"OpenAI API request timed out after {}s",
126+
OPENAI_TIMEOUT_SECONDS
127+
);
128+
StatusCode::GATEWAY_TIMEOUT
129+
} else {
130+
tracing::error!("Failed to send request to OpenAI: {}", e);
131+
StatusCode::SERVICE_UNAVAILABLE
132+
}
133+
})?;
134+
135+
if !response.status().is_success() {
136+
let error_text = response.text().await.unwrap_or_default();
137+
tracing::error!("OpenAI API error: {}", error_text);
138+
return Err(StatusCode::BAD_GATEWAY);
139+
}
140+
141+
let whisper_response: WhisperResponse = response.json().await.map_err(|e| {
142+
tracing::error!("Failed to parse OpenAI response: {}", e);
143+
StatusCode::INTERNAL_SERVER_ERROR
144+
})?;
145+
146+
Ok(Json(TranscribeResponse {
147+
text: whisper_response.text,
148+
}))
149+
}
150+
151+
pub fn routes(state: Arc<AppState>) -> Router {
152+
Router::new()
153+
.route("/audio/transcribe", post(transcribe_handler))
154+
.with_state(state)
155+
}
156+
157+
#[cfg(test)]
mod tests {
    use super::*;
    use axum::{body::Body, http::Request};
    use tower::ServiceExt;

    /// Build a router backed by a fresh test state whose secret is "test-secret".
    async fn test_app() -> Router {
        let state = AppState::new(
            Arc::new(goose::agents::Agent::new()),
            "test-secret".to_string(),
        )
        .await;
        routes(state)
    }

    /// Build a JSON POST to /audio/transcribe. `secret` controls whether the
    /// x-secret-key header is attached (None simulates an unauthenticated caller).
    fn transcribe_request(audio: &str, mime_type: &str, secret: Option<&str>) -> Request<Body> {
        let mut builder = Request::builder()
            .uri("/audio/transcribe")
            .method("POST")
            .header("content-type", "application/json");
        if let Some(key) = secret {
            builder = builder.header("x-secret-key", key);
        }
        builder
            .body(Body::from(
                serde_json::to_string(&serde_json::json!({
                    "audio": audio,
                    "mime_type": mime_type
                }))
                .unwrap(),
            ))
            .unwrap()
    }

    #[tokio::test]
    async fn test_transcribe_endpoint_requires_auth() {
        let app = test_app().await;

        // Test without auth header
        let response = app
            .oneshot(transcribe_request("dGVzdA==", "audio/webm", None))
            .await
            .unwrap();
        assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
    }

    #[tokio::test]
    async fn test_transcribe_endpoint_validates_size() {
        let app = test_app().await;

        // Create a large base64 string (simulating > 25MB audio)
        let large_audio = BASE64.encode(vec![0u8; MAX_AUDIO_SIZE_BYTES + 1]);

        let response = app
            .oneshot(transcribe_request(
                &large_audio,
                "audio/webm",
                Some("test-secret"),
            ))
            .await
            .unwrap();
        assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
    }

    #[tokio::test]
    async fn test_transcribe_endpoint_validates_mime_type() {
        let app = test_app().await;

        // application/pdf is not an accepted audio format
        let response = app
            .oneshot(transcribe_request(
                "dGVzdA==",
                "application/pdf",
                Some("test-secret"),
            ))
            .await
            .unwrap();
        assert_eq!(response.status(), StatusCode::UNSUPPORTED_MEDIA_TYPE);
    }

    #[tokio::test]
    async fn test_transcribe_endpoint_handles_invalid_base64() {
        let app = test_app().await;

        let response = app
            .oneshot(transcribe_request(
                "invalid-base64-!@#$%",
                "audio/webm",
                Some("test-secret"),
            ))
            .await
            .unwrap();
        assert_eq!(response.status(), StatusCode::BAD_REQUEST);
    }
}

crates/goose-server/src/routes/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Export route modules
22
pub mod agent;
3+
pub mod audio;
34
pub mod config_management;
45
pub mod context;
56
pub mod extension;
@@ -19,6 +20,7 @@ pub fn configure(state: Arc<crate::state::AppState>) -> Router {
1920
.merge(health::routes())
2021
.merge(reply::routes(state.clone()))
2122
.merge(agent::routes(state.clone()))
23+
.merge(audio::routes(state.clone()))
2224
.merge(context::routes(state.clone()))
2325
.merge(extension::routes(state.clone()))
2426
.merge(config_management::routes(state.clone()))

ui/desktop/.eslintrc.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"root": true,
33
"env": {
44
"browser": true,
5-
"es2020": true
5+
"es2020": true,
6+
"node": true
67
},
78
"extends": [
89
"eslint:recommended",

ui/desktop/eslint.config.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ module.exports = [
7070
HTMLTextAreaElement: 'readonly',
7171
HTMLButtonElement: 'readonly',
7272
HTMLDivElement: 'readonly',
73+
HTMLCanvasElement: 'readonly',
7374
File: 'readonly',
7475
FileList: 'readonly',
7576
FileReader: 'readonly',
@@ -87,10 +88,17 @@ module.exports = [
8788
React: 'readonly',
8889
handleAction: 'readonly',
8990
requestAnimationFrame: 'readonly',
91+
cancelAnimationFrame: 'readonly',
9092
ResizeObserver: 'readonly',
9193
MutationObserver: 'readonly',
9294
NodeFilter: 'readonly',
9395
Text: 'readonly',
96+
AudioContext: 'readonly',
97+
AnalyserNode: 'readonly',
98+
MediaRecorder: 'readonly',
99+
MediaStream: 'readonly',
100+
Blob: 'readonly',
101+
FormData: 'readonly',
94102
},
95103
},
96104
plugins: {

0 commit comments

Comments
 (0)