-
Notifications
You must be signed in to change notification settings - Fork 309
Expand file tree
/
Copy pathapp.py
More file actions
174 lines (140 loc) · 4.84 KB
/
app.py
File metadata and controls
174 lines (140 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Live Audio Transcription — Foundry Local SDK Example (Python)
#
# Tries PyAudio mic capture first; falls back to synthetic PCM if unavailable.
#
# Usage:
# pip install -r requirements.txt
# python src/app.py # Live microphone
# python src/app.py --synth # Synthetic 440Hz sine wave
import math
import signal
import struct
import sys
import threading
import time
from foundry_local_sdk import Configuration, FoundryLocalManager
# --synth on the command line skips microphone capture and pushes a
# generated sine wave instead.
use_synth = "--synth" in sys.argv

print("===========================================================")
print(" Foundry Local -- Live Audio Transcription Demo (Python)")
print("===========================================================")
print()

# Initialize the SDK singleton and fetch the manager instance from it.
config = Configuration(app_name="foundry_local_samples")
FoundryLocalManager.initialize(config)
manager = FoundryLocalManager.instance

# NOTE(review): "eps" presumably means execution providers -- confirm
# against the Foundry Local SDK documentation.
manager.download_and_register_eps()

model = manager.catalog.get_model("nemotron-speech-streaming-en-0.6b")
if model is None:
    raise RuntimeError('Model "nemotron-speech-streaming-en-0.6b" not found in catalog')

# The callback rewrites the same console line (\r) with each progress update.
model.download(
    lambda progress: print(f"\rDownloading model: {progress:.2f}%", end="", flush=True)
)
print()

# flush=True: this line has no trailing newline, so without an explicit
# flush it may stay buffered and only appear together with "done.".
print(f"Loading model {model.id}...", end="", flush=True)
model.load()
print("done.")

# Open a live transcription session. The audio format configured here must
# match the PCM that is later passed to session.append() (16 kHz, mono).
audio_client = model.get_audio_client()
session = audio_client.create_live_transcription_session()
session.settings.sample_rate = 16000
session.settings.channels = 1
session.settings.language = "en"
session.start()
print("✓ Session started")
# --- Background thread reads transcription results (mirrors JS readPromise) ---
def read_results():
    """Print transcription results from the session stream as they arrive.

    Non-final results with text are echoed inline without a newline; a final
    result is printed on its own line with a ``[FINAL]`` prefix. Returns when
    the iterator from ``session.get_stream()`` is exhausted.
    """
    for update in session.get_stream():
        segments = update.content
        # An update with no content is treated as empty text.
        transcript = segments[0].text if segments else ""

        if not update.is_final:
            # Skip empty interim updates so nothing is written for them.
            if transcript:
                print(transcript, end="", flush=True)
            continue

        # Final result: end the inline interim output, then print the line.
        print()
        print(f" [FINAL] {transcript}")
# Consume transcription results on a daemon thread so it does not keep the
# interpreter alive at exit.
read_thread = threading.Thread(target=read_results)
read_thread.daemon = True
read_thread.start()

# --- Microphone capture (mirrors JS naudiodon2 / C++ PortAudio) ---
# PyAudio is tried first for mic input; synthetic PCM is the fallback.
RATE, CHANNELS = 16000, 1
CHUNK = RATE // 10  # frames per read: 100 ms of audio = 1600 frames

# Set to ask the capture loop to stop (also set by the loop itself on error).
stop_event = threading.Event()

# Capture state; filled in only if the microphone is opened successfully.
mic_active = False
pa = stream = None
if not use_synth:
    try:
        # Imported here rather than at the top of the file so that a missing
        # PyAudio install falls through to the synthetic-audio path below.
        import pyaudio

        pa = pyaudio.PyAudio()
        stream = pa.open(
            format=pyaudio.paInt16,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )
        mic_active = True

        print()
        print("===========================================================")
        print(" LIVE TRANSCRIPTION ACTIVE")
        print(" Speak into your microphone.")
        print(" Press Ctrl+C to stop.")
        print("===========================================================")
        print()

        def capture_mic():
            """Read CHUNK-frame buffers from the mic and feed them to the session.

            Runs until ``stop_event`` is set. On an unexpected error the
            failure is printed and ``stop_event`` is set so the main thread
            wakes up.
            """
            while not stop_event.is_set():
                try:
                    pcm_data = stream.read(CHUNK, exception_on_overflow=False)
                    if pcm_data:
                        session.append(pcm_data)
                except Exception as e:
                    if stop_event.is_set():
                        # A stop was already requested (e.g. shutdown() is
                        # closing the stream under us); the failed read or
                        # append is expected, so do not report it as an error.
                        break
                    print(f"\n[ERROR] Microphone capture failed: {e}")
                    stop_event.set()
                    break

        capture_thread = threading.Thread(target=capture_mic, daemon=True)
        capture_thread.start()
    except Exception as e:
        # Any failure while setting up capture (PyAudio not installed, no
        # input device, ...) switches the demo to synthetic audio.
        print(f"Could not initialize microphone: {e}")
        print("Falling back to synthetic audio test...")
        print()
        mic_active = False
        # Release whatever was opened, and clear the globals so shutdown()
        # does not try to close them a second time.
        if stream:
            stream.close()
        if pa:
            pa.terminate()
        pa = None
        stream = None
# Fallback: push synthetic PCM (440Hz sine wave) — mirrors JS catch block
if not mic_active:
    print("Pushing synthetic audio (440Hz sine, 2s)...")

    # Two seconds of a half-amplitude 440 Hz sine, as 16-bit integer samples.
    seconds = 2
    sample_count = RATE * seconds
    samples = [
        int(32767 * 0.5 * math.sin(2 * math.pi * 440 * (n / RATE)))
        for n in range(sample_count)
    ]
    # Little-endian signed 16-bit PCM, two bytes per sample.
    pcm = struct.pack(f"<{sample_count}h", *samples)

    # Feed the audio in 100 ms slices, sleeping 100 ms after each one.
    bytes_per_push = (RATE // 10) * 2
    for start in range(0, len(pcm), bytes_per_push):
        # Slicing clamps at the end of the buffer, so the last slice may be
        # shorter than bytes_per_push.
        session.append(pcm[start:start + bytes_per_push])
        time.sleep(0.1)

    print("✓ Synthetic audio pushed")
    time.sleep(3)  # Wait for remaining transcription results
# --- Graceful shutdown (mirrors JS SIGINT handler / C++ SignalHandler) ---
def shutdown(*_args):
    """Stop capture, close the session, unload the model, and exit.

    Args:
        *_args: Ignored. Accepted so the function can be called with the
            ``(signum, frame)`` arguments passed to signal handlers.

    Raises:
        SystemExit: Always, with exit code 0, once cleanup has finished.
    """
    print("\n\nStopping...")
    stop_event.set()

    # Audio-device cleanup is best-effort: a failure here must not prevent the
    # session from being stopped and the model from being unloaded below.
    try:
        if stream:
            # A truthy stream means mic setup completed, so capture_thread
            # exists. Give it a moment to notice stop_event and leave
            # stream.read() before the stream is closed underneath it.
            capture_thread.join(timeout=1)
            stream.stop_stream()
            stream.close()
        if pa:
            pa.terminate()
    except Exception as e:
        print(f"[WARN] Audio device cleanup failed: {e}")

    try:
        session.stop()
        # Let the reader thread print any remaining results, but do not hang
        # forever if the result stream never ends.
        read_thread.join(timeout=5)
    finally:
        # Unload the model even if stopping the session raised.
        model.unload()

    print("✓ Done")
    sys.exit(0)
# shutdown() accepts and ignores the (signum, frame) handler arguments, so it
# can be registered as the SIGINT handler directly.
signal.signal(signal.SIGINT, shutdown)

if mic_active:
    # Block until Ctrl+C, or until the capture thread sets stop_event after a
    # failure. A bounded wait in a loop gives pending signal handlers a chance
    # to run between iterations; an unbounded Event.wait() is reported not to
    # be interruptible by Ctrl+C on some platforms (notably Windows).
    while not stop_event.wait(timeout=0.2):
        pass

# Reached in synthetic mode, or when capture stopped without Ctrl+C (the
# SIGINT path exits inside shutdown() via sys.exit). Either way, stop the
# session and unload the model instead of falling off the end of the script.
shutdown()