Skip to content

Commit ba8eaed

Browse files
Voice to text support!
1 parent ca6032d commit ba8eaed

12 files changed

Lines changed: 334 additions & 69 deletions

main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from fastapi.staticfiles import StaticFiles
88
from fastapi.templating import Jinja2Templates
99
from fastapi.responses import RedirectResponse, Response, HTMLResponse
10-
from routers import chat, files, setup
10+
from routers import audio, chat, files, setup
1111
from utils.conversations import create_conversation
1212
from fastapi.exceptions import HTTPException, RequestValidationError
1313

@@ -37,6 +37,7 @@ async def lifespan(app: FastAPI):
3737
app = FastAPI(lifespan=lifespan)
3838

3939
# Mount routers
40+
app.include_router(audio.router)
4041
app.include_router(chat.router)
4142
app.include_router(files.router)
4243
app.include_router(setup.router)

routers/audio.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import logging
2+
import os
3+
4+
from dotenv import load_dotenv
5+
from fastapi import APIRouter, File, UploadFile
6+
from fastapi.responses import PlainTextResponse
7+
from openai import AsyncOpenAI
8+
9+
logger = logging.getLogger("uvicorn.error")
10+
11+
router = APIRouter(prefix="/audio", tags=["audio"])
12+
13+
14+
@router.post("/transcribe")
async def transcribe_audio(audio: UploadFile = File(...)) -> PlainTextResponse:
    """Transcribe an uploaded audio file using OpenAI Whisper.

    Args:
        audio: The recording, sent as the multipart form field ``audio``.

    Returns:
        The transcribed text as a plain-text response.

    Raises:
        HTTPException: 400 when the uploaded file is empty; 502 when the
            OpenAI client cannot be created or the transcription request fails.
    """
    # Local imports keep this handler self-contained; both packages are
    # already dependencies of this module.
    from fastapi import HTTPException
    from openai import OpenAIError

    payload = await audio.read()
    if not payload:
        # Reject early instead of spending an API call on an empty recording.
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty.")

    # override=True lets values from .env replace variables already set in the
    # process environment.
    load_dotenv(override=True)

    try:
        # Constructing the client raises OpenAIError when no API key is
        # configured, so it lives inside the same try block as the request.
        client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        try:
            transcription = await client.audio.transcriptions.create(
                model="whisper-1",
                file=(audio.filename, payload, audio.content_type),
            )
        finally:
            # A client is created per request; close it so its HTTP
            # connections are released promptly.
            await client.close()
    except OpenAIError as err:
        logger.exception("Audio transcription failed")
        raise HTTPException(
            status_code=502,
            detail="Transcription service request failed.",
        ) from err

    return PlainTextResponse(transcription.text)

static/audio-recorder.js

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// Audio recording and transcription.
//
// Wires the #micButton element to the MediaRecorder API: one click starts
// recording, the next click stops it, and the captured audio is POSTed to
// /audio/transcribe. The returned text is appended to the #userInput textarea.
(function () {
  // The recorder for the session currently in progress (null before first use).
  let mediaRecorder = null;
  // True while getUserMedia() is pending, so repeated clicks cannot open a
  // second stream whose recorder would be orphaned with the mic still held.
  let isStarting = false;

  const MIC_SVG =
    '<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"/><path d="M19 10v2a7 7 0 0 1-14 0v-2"/><line x1="12" y1="19" x2="12" y2="23"/><line x1="8" y1="23" x2="16" y2="23"/></svg>';
  const STOP_SVG =
    '<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="currentColor" stroke="none"><rect x="4" y="4" width="16" height="16" rx="2"/></svg>';

  // Not every browser records WebM (Safari, for example, produces audio/mp4),
  // so the upload's file name is derived from the recorded MIME type instead
  // of being hard-coded.
  const EXTENSION_BY_MIME = new Map([
    ["audio/webm", "webm"],
    ["audio/mp4", "mp4"],
    ["audio/ogg", "ogg"],
    ["audio/wav", "wav"],
    ["audio/mpeg", "mp3"],
  ]);

  function getMicButton() {
    return document.getElementById("micButton");
  }

  function getTextarea() {
    return document.getElementById("userInput");
  }

  // Map a MIME type such as "audio/webm;codecs=opus" to a file extension,
  // falling back to "webm" for unknown or missing types.
  function extensionFor(mimeType) {
    const base = (mimeType || "").split(";")[0].trim().toLowerCase();
    return EXTENSION_BY_MIME.get(base) || "webm";
  }

  // Switch the mic button between its idle and recording appearance.
  function setButtonState(isRecording) {
    const btn = getMicButton();
    if (!btn) return;
    btn.classList.toggle("recording", isRecording);
    btn.innerHTML = isRecording ? STOP_SVG : MIC_SVG;
    btn.title = isRecording ? "Stop recording" : "Record audio";
  }

  // Release every track of a stream so the browser drops the mic indicator.
  function releaseStream(stream) {
    stream.getTracks().forEach((track) => track.stop());
  }

  // Ask for microphone access and begin a new recording session.
  async function startRecording() {
    if (isStarting) return;

    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      alert(
        "Microphone access requires a secure context.\n" +
          "Please access this app via http://localhost:8000 instead of http://0.0.0.0:8000."
      );
      return;
    }

    isStarting = true;
    let stream = null;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // Keep the recorder and its chunks local to this session so a later
      // session cannot overwrite what the stop handler reads.
      const recorder = new MediaRecorder(stream);
      const chunks = [];

      recorder.addEventListener("dataavailable", (e) => {
        if (e.data.size > 0) chunks.push(e.data);
      });

      recorder.addEventListener("stop", async () => {
        // Stop all tracks so the browser releases the mic
        releaseStream(stream);

        if (chunks.length === 0) {
          // Nothing was captured; skip the upload rather than send an empty file.
          console.warn("Recording contained no audio data; skipping transcription.");
          return;
        }

        const blob = new Blob(chunks, { type: recorder.mimeType });
        await transcribe(blob);
      });

      recorder.start();
      mediaRecorder = recorder;
      setButtonState(true);
    } catch (err) {
      // If the stream was opened but the recorder could not be created or
      // started, release the mic instead of leaking it.
      if (stream) releaseStream(stream);
      console.error("Microphone access denied:", err);
      alert("Could not access your microphone. Please check permissions.");
    } finally {
      isStarting = false;
    }
  }

  // Stop the active recording (if any) and restore the idle button.
  function stopRecording() {
    if (mediaRecorder && mediaRecorder.state !== "inactive") {
      mediaRecorder.stop();
    }
    setButtonState(false);
  }

  // Upload the recorded audio and append the transcription to the textarea.
  async function transcribe(blob) {
    const formData = new FormData();
    formData.append("audio", blob, "recording." + extensionFor(blob.type));

    const textarea = getTextarea();
    const btn = getMicButton();
    const savedPlaceholder = textarea ? textarea.placeholder : "";

    // Show transcribing state
    if (textarea) {
      textarea.placeholder = "Transcribing...";
      textarea.disabled = true;
    }
    if (btn) btn.disabled = true;

    try {
      const res = await fetch("/audio/transcribe", {
        method: "POST",
        body: formData,
      });

      if (!res.ok) {
        console.error("Transcription failed:", res.status);
        alert("Transcription failed. Please try again.");
        return;
      }

      const text = await res.text();
      if (textarea) {
        // Append with a space if there's already text
        const current = textarea.value.trim();
        textarea.value = current ? current + " " + text : text;
        // Trigger resize
        textarea.style.height = "auto";
        textarea.style.height = textarea.scrollHeight + "px";
        textarea.focus();
      }
    } catch (err) {
      console.error("Transcription request error:", err);
      alert("Transcription failed. Please try again.");
    } finally {
      // Restore input state
      if (textarea) {
        textarea.placeholder = savedPlaceholder;
        textarea.disabled = false;
      }
      if (btn) btn.disabled = false;
    }
  }

  // Toggle recording on mic button click. The listener is delegated from the
  // document, so it does not need the button to exist when this script loads.
  document.addEventListener("click", (e) => {
    const btn = e.target.closest("#micButton");
    if (!btn) return;

    if (mediaRecorder && mediaRecorder.state === "recording") {
      stopRecording();
    } else {
      startRecording();
    }
  });
})();

static/styles.css

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,38 @@ pre {
442442
display: none;
443443
}
444444

445+
/* Mic button: a borderless icon button sized to sit beside the input row. */
.micButton {
  display: flex;
  align-items: center;
  justify-content: center;
  width: 40px;
  height: calc(1em + 32px + 4px);
  cursor: pointer;
  color: #666;
  flex-shrink: 0;
  background: none;
  border: none;
  border-radius: 50%;
  transition: color 0.2s;
  padding: 0;
  font-size: inherit;
}

/* Only darken on hover while the button can actually be clicked. */
.micButton:hover:not(:disabled) {
  color: #000;
}

/* The recorder script disables the button while a transcription is in
   flight; make that state visible instead of leaving it looking clickable. */
.micButton:disabled {
  cursor: not-allowed;
  opacity: 0.5;
}

/* Red, pulsing icon while recording is active. */
.micButton.recording {
  color: #e53e3e;
  animation: micPulse 1.5s infinite;
}

@keyframes micPulse {
  0%, 100% { opacity: 1; }
  50% { opacity: 0.4; }
}

/* Respect the user's reduced-motion preference: keep the red colour cue but
   drop the endless pulse. */
@media (prefers-reduced-motion: reduce) {
  .micButton.recording {
    animation: none;
  }
}
476+
445477
/* Image preview above the input row */
446478
.imagePreview {
447479
display: flex;

templates/index.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@
4444
onchange="previewImages(this)"
4545
/>
4646
</label>
47+
<button type="button" id="micButton" class="micButton" title="Record audio">
48+
<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"/><path d="M19 10v2a7 7 0 0 1-14 0v-2"/><line x1="12" y1="19" x2="12" y2="23"/><line x1="8" y1="23" x2="16" y2="23"/></svg>
49+
</button>
4750
<textarea
4851
class="input"
4952
name="userInput"

templates/layout.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@
3434
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
3535
<script src="https://cdn.jsdelivr.net/npm/dompurify/dist/purify.min.js"></script>
3636
<script src="{{ url_for('static', path='stream-md.js') }}"></script>
37+
<script src="{{ url_for('static', path='audio-recorder.js') }}"></script>
3738
</body>
3839
</html>

tests/conftest.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,31 @@ def anyio_backend() -> str:
6666
}
6767

6868

69+
# ---------------------------------------------------------------------------
# Filter "Event loop is closed" RuntimeErrors out of anyio's test runner.
#
# On Python 3.13 + httpx + anyio, streaming response cleanup can race with
# event loop shutdown. anyio's TestRunner collects such errors as async
# exceptions and re-raises them after the test has already passed. They are
# treated here as harmless cleanup artifacts rather than test failures, so
# they are dropped before the runner re-raises whatever remains.
# ---------------------------------------------------------------------------
try:
    import anyio._backends._asyncio as _anyio_asyncio_backend

    _original_raise_async = _anyio_asyncio_backend.TestRunner._raise_async_exceptions

    def _filtered_raise_async(self):  # type: ignore[no-untyped-def]
        """Drop closed-loop RuntimeErrors, then defer to anyio's original hook."""
        kept = []
        for exc in self._exceptions:
            if isinstance(exc, RuntimeError) and "Event loop is closed" in str(exc):
                continue
            kept.append(exc)
        self._exceptions = kept
        _original_raise_async(self)

    _anyio_asyncio_backend.TestRunner._raise_async_exceptions = _filtered_raise_async  # type: ignore[assignment]
except (ImportError, AttributeError):
    # A different anyio version without these private names; leave it unpatched.
    pass
92+
93+
6994
@pytest.fixture(autouse=True)
7095
def _isolate_asyncio_running_loop(request: pytest.FixtureRequest) -> Iterator[None]:
7196
"""

tests/test_audio_transcription.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""Unit tests for audio transcription endpoint."""
2+
3+
import io
4+
from unittest.mock import AsyncMock, MagicMock, patch
5+
6+
import pytest
7+
from httpx import ASGITransport, AsyncClient
8+
9+
from main import app
10+
11+
12+
@pytest.fixture(params=["asyncio"])
def anyio_backend(request):
    """Return the backend name for anyio-marked tests in this module ("asyncio")."""
    backend_name = request.param
    return backend_name
15+
16+
17+
@pytest.mark.anyio
async def test_transcribe_returns_text():
    """A POST to /audio/transcribe with an audio upload returns the transcribed text."""
    # Stand-in for the OpenAI client: its transcription call resolves to an
    # object exposing only the ``text`` attribute the endpoint reads.
    fake_result = MagicMock()
    fake_result.text = "Hello, world!"
    create_mock = AsyncMock(return_value=fake_result)
    openai_stub = AsyncMock()
    openai_stub.audio.transcriptions.create = create_mock

    upload = {"audio": ("recording.webm", io.BytesIO(b"\x00" * 100), "audio/webm")}

    with patch("routers.audio.AsyncOpenAI", return_value=openai_stub):
        async with AsyncClient(
            transport=ASGITransport(app=app), base_url="http://test"
        ) as http:
            response = await http.post("/audio/transcribe", files=upload)

    assert response.status_code == 200
    assert response.text == "Hello, world!"

    # The endpoint must request the whisper-1 model.
    assert create_mock.call_args.kwargs.get("model") == "whisper-1"
40+
41+
42+
@pytest.mark.anyio
async def test_transcribe_without_file_returns_422():
    """A POST to /audio/transcribe that carries no file is rejected with 422."""
    async with AsyncClient(
        transport=ASGITransport(app=app), base_url="http://test"
    ) as http:
        result = await http.post("/audio/transcribe")

    assert result.status_code == 422
50+
51+
52+
@pytest.mark.anyio
async def test_mic_button_present_in_index():
    """The index page should contain a mic button.

    When "/" does not answer 200 (for example, it redirects to setup because
    environment variables are missing), the index template is inspected on
    disk instead.
    """
    from pathlib import Path

    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as ac:
        response = await ac.get("/")

    # May redirect to setup if env vars missing, so check both
    if response.status_code == 200:
        assert "micButton" in response.text
    else:
        # Resolve the template relative to this test file (tests/ sits next to
        # templates/), so the check does not depend on the directory pytest
        # was launched from, and read it with an explicit encoding.
        template_path = (
            Path(__file__).resolve().parent.parent / "templates" / "index.html"
        )
        assert "micButton" in template_path.read_text(encoding="utf-8")

0 commit comments

Comments
 (0)