vosk-api/python/example/test_gradio.py at a5ce4810dc109667e79b7995519ca0bf1e8931e4 · alphacep/vosk-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3

import json
import gradio as gr

from vosk import KaldiRecognizer, Model

model = Model(lang="en-us")

def transcribe(stream, new_chunk, transcribe_speaker, transcribe_meeting):
    """Feed one microphone chunk to the recognizer and return the transcript.

    Args:
        stream: ``None`` on the first call, otherwise the
            ``(recognizer, results)`` tuple returned by the previous call.
        new_chunk: ``(sample_rate, samples)`` tuple as delivered by the audio
            component, or ``None`` when there is no audio to process.
        transcribe_speaker: Checkbox value; accepted but currently unused.
        transcribe_meeting: Checkbox value; accepted but currently unused.

    Returns:
        A ``(state, text)`` pair: ``state`` is the updated
        ``(recognizer, results)`` tuple (or ``None`` if nothing has been
        recognised yet and no audio arrived) and ``text`` is the finished
        utterances joined by newlines, followed by the current partial result.
    """
    # The audio component can hand over None (e.g. when the input is cleared);
    # keep the existing state instead of failing on the tuple unpacking below.
    if new_chunk is None:
        if stream is None:
            return None, ""
        return stream, "\n".join(stream[1])

    sample_rate, audio_data = new_chunk

    # A 2-D array is (samples, channels). The recognizer is created for a
    # single stream at `sample_rate`, so interleaved stereo bytes would be
    # misread; average the channels down to mono and keep the sample dtype.
    # NOTE(review): samples are assumed to be 16-bit PCM -- confirm.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1).astype(audio_data.dtype)
    audio_data = audio_data.tobytes()

    if stream is None:
        rec = KaldiRecognizer(model, sample_rate)
        result = []
    else:
        rec, result = stream

    if rec.AcceptWaveform(audio_data):
        # An utterance was finalised: store it unless it is empty.
        text_result = json.loads(rec.Result())["text"]
        if text_result != "":
            result.append(text_result)
        partial_result = ""
    else:
        # Still mid-utterance: show the in-progress hypothesis.
        partial_result = json.loads(rec.PartialResult())["partial"] + " "

    return (rec, result), "\n".join(result) + "\n" + partial_result

def start_transcription():
    """Reset the session: clear the recognizer state and the transcript text."""
    cleared_state = None
    cleared_text = ""
    return cleared_state, cleared_text

def stop_transcription(stream):
    """Flush the recognizer and return the complete transcript.

    Args:
        stream: ``None`` if nothing was transcribed, otherwise the
            ``(recognizer, results)`` tuple built up by ``transcribe``.

    Returns:
        A ``(state, text)`` pair. ``state`` is always ``None`` so the next
        session starts fresh; ``text`` is every recognised utterance joined
        by newlines (empty when there was no active stream).
    """
    if stream is None:
        return None, ""

    rec, result = stream
    final_result = json.loads(rec.FinalResult())["text"]
    # Skip empty final text, as transcribe() does, so the transcript does not
    # end with a blank line when nothing was pending.
    if final_result:
        result.append(final_result)
    return None, "\n".join(result)

# Build the demo UI: two option checkboxes, start/stop buttons, a streaming
# microphone input and a textbox that shows the transcript.
with gr.Blocks() as demo:
    # NOTE(review): both checkbox values are passed to transcribe(), which
    # does not read them -- confirm whether they are placeholders.
    transcribe_speaker = gr.Checkbox(label="Transcribe Speaker's Voice")
    transcribe_meeting = gr.Checkbox(label="Transcribe Entire Meeting")
    start_button = gr.Button("Start Transcription")
    stop_button = gr.Button("Stop Transcription")
    # Holds the (recognizer, results) tuple handed from one callback to the next.
    state = gr.State()
    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text = gr.Textbox()

    # Start clears both the state and the textbox.
    start_button.click(start_transcription, inputs=[], outputs=[state, text])
    # Stop flushes the recognizer held in the state and shows the final transcript.
    stop_button.click(stop_transcription, inputs=[state], outputs=[state, text])
    # Each change of the audio value is transcribed and the textbox refreshed.
    # NOTE(review): presumably `change` fires once per streamed chunk here --
    # verify against the Gradio version in use.
    audio.change(transcribe, inputs=[state, audio, transcribe_speaker, transcribe_meeting], outputs=[state, text])

# NOTE(review): share=True presumably requests a public share link -- confirm
# this is wanted for the example.
demo.launch(share=True)