Commit 8248b05

Author: Jicheng Lu (committed)
Commit message: add audio output
1 parent 9966804 · commit 8248b05

3 files changed · 149 additions & 104 deletions


src/lib/helpers/pcmProcessor.js

Lines changed: 4 additions & 3 deletions
@@ -3,7 +3,8 @@ export const AudioRecordingWorklet = `
 class AudioProcessingWorklet extends AudioWorkletProcessor {
 
   // send and clear buffer every 2048 samples,
-  // which at 16khz is about 8 times a second
+  // which at 16khz is about 8 times a second,
+  // or at 24khz is about 11 times a second
   buffer = new Int16Array(2048);
 
   // current write index
@@ -43,12 +44,12 @@ class AudioProcessingWorklet extends AudioWorkletProcessor {
       // convert float32 -1 to 1 to int16 -32768 to 32767
       const int16Value = float32Array[i] * 32768;
       this.buffer[this.bufferWriteIndex++] = int16Value;
-      if(this.bufferWriteIndex >= this.buffer.length) {
+      if (this.bufferWriteIndex >= this.buffer.length) {
        this.sendAndClearBuffer();
      }
    }

-    if(this.bufferWriteIndex >= this.buffer.length) {
+    if (this.bufferWriteIndex >= this.buffer.length) {
      this.sendAndClearBuffer();
    }
  }
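
Note on the updated comment: the flush cadence follows directly from the 2048-sample buffer. The snippet below is illustrative only and not part of the commit; it just checks the arithmetic behind the new comment.

// The worklet flushes its Int16 buffer every 2048 samples,
// so the flush rate is simply sampleRate / 2048.
const samplesPerFlush = 2048;
console.log(16000 / samplesPerFlush); // 7.8125   -> "about 8 times a second"
console.log(24000 / samplesPerFlush); // 11.71875 -> "about 11 times a second"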
Lines changed: 145 additions & 96 deletions
@@ -1,31 +1,42 @@
+import { PUBLIC_SERVICE_URL } from "$env/static/public";
 import { AudioRecordingWorklet } from "$lib/helpers/pcmProcessor";
 
-// @ts-ignore
-const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
-
 // @ts-ignore
 const AudioContext = window.AudioContext || window.webkitAudioContext;
 
+const sampleRate = 24000;
+
+/** @type {AudioContext} */
+let audioCtx = new AudioContext();
+
+/** @type {any[]} */
+let audioQueue = [];
+
+/** @type {boolean} */
+let isPlaying = false;
+
 export const realtimeChat = {
 
     /** @type {WebSocket | null} */
     socket: null,
 
-    /** @type {MediaRecorder | null} */
-    mediaRecorder: null,
-
     /** @type {MediaStream | null} */
     mediaStream: null,
 
-    /** @type {SpeechRecognition | null} */
-    recognition: null,
+    /** @type {AudioWorkletNode | null} */
+    workletNode: null,
+
+    /** @type {MediaStreamAudioSourceNode | null} */
+    micSource: null,
 
     /**
     * @param {string} agentId
     * @param {string} conversationId
     */
    start(agentId, conversationId) {
-        this.socket = new WebSocket(`ws://localhost:5100/chat/stream/${agentId}/${conversationId}`);
+        reset();
+        const wsUrl = buildWebsocketUrl();
+        this.socket = new WebSocket(`${wsUrl}/chat/stream/${agentId}/${conversationId}`);
 
        this.socket.onopen = async () => {
            console.log("WebSocket connected");
@@ -35,17 +46,17 @@ export const realtimeChat = {
            }));
 
            this.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
-            const audioCtx = new AudioContext({ sampleRate: 16000 });
+            audioCtx = new AudioContext({ sampleRate: sampleRate });
 
            const workletName = "audio-recorder-worklet";
-            const src = createWorketFromSrc(workletName, AudioRecordingWorklet);
+            const src = createWorkletFromSrc(workletName, AudioRecordingWorklet);
            await audioCtx.audioWorklet.addModule(src);
 
-            const workletNode = new AudioWorkletNode(audioCtx, workletName);
-            const micSource = audioCtx.createMediaStreamSource(this.mediaStream);
-            micSource.connect(workletNode);
+            this.workletNode = new AudioWorkletNode(audioCtx, workletName);
+            this.micSource = audioCtx.createMediaStreamSource(this.mediaStream);
+            this.micSource.connect(this.workletNode);
 
-            workletNode.port.onmessage = event => {
+            this.workletNode.port.onmessage = event => {
                const arrayBuffer = event.data.data.int16arrayBuffer;
                if (arrayBuffer && this.socket?.readyState === WebSocket.OPEN) {
                    const arrayBufferString = arrayBufferToBase64(arrayBuffer);
@@ -55,92 +66,41 @@
                    }));
                }
            };
+        };
 
-        // this.recognition = new SpeechRecognition();
-        // this.recognition.continuous = true;
-        // this.recognition.interimResults = false;
-        // this.recognition.lang = "en-US";
-
-        // this.recognition.onresult = (/** @type { any } */ event) => {
-        //     const lastResult = event.results[event.results.length - 1];
-        //     const transcript = lastResult[0].transcript.trim();
-
-        //     console.log("Recognized:", transcript);
-
-        //     const message = {
-        //         event: "media",
-        //         payload: transcript
-        //     };
-
-        //     if (this.socket?.readyState === WebSocket.OPEN) {
-        //         this.socket.send(JSON.stringify(message));
-        //     }
-        // };
-
-        // this.recognition.onend = () => {
-        //     console.log('Speech recognition closed.');
-        // };
-        // this.recognition.start();
-
-        // navigator.mediaDevices.getUserMedia({ audio: true })
-        //     .then(stream => {
-        //         this.mediaStream = stream;
-        //         this.mediaRecorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
-        //         /** @type {any[]} */
-        //         let audioChunks = [];
-        //         this.mediaRecorder.ondataavailable = (/** @type {any} */ event) => {
-        //             if (event.data.size > 0) {
-        //                 // audioChunks.push(event.data);
-        //             }
-        //         };
-
-        //         this.mediaRecorder.onstop = async () => {
-        //             console.log('mediaRecorder stopped');
-        //             // const blob = new Blob(audioChunks, { type: 'audio/webm' });
-        //             // const arrayBuffer = await blob.arrayBuffer();
-
-        //             // // Decode audio and downsample to PCM16
-        //             // const audioCtx = new AudioContext({ sampleRate: 16000 });
-        //             // const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
-
-        //             // const channelData = audioBuffer.getChannelData(0); // mono
-        //             // const pcm16 = new Int16Array(channelData.length);
-
-        //             // for (let i = 0; i < channelData.length; i++) {
-        //             //     pcm16[i] = Math.max(-1, Math.min(1, channelData[i])) * 32767;
-        //             // }
-
-        //             // const pcmBytes = new Uint8Array(pcm16.buffer);
-        //             // const base64 = btoa(String.fromCharCode(...pcmBytes));
-        //             // console.log(base64);
-        //         };
-
-        //         this.mediaRecorder.start();
-        //     })
-        //     .catch((err) => {
-        //         console.error("Failed to access microphone", err);
-        //     });
+        this.socket.onmessage = (/** @type {MessageEvent} */ e) => {
+            try {
+                const json = JSON.parse(e.data);
+                if (json.event === 'media' && !!json.media.payload) {
+                    const data = json.media.payload;
+                    enqueueAudioChunk(data);
+                }
+            } catch {
+                // console.error('Error when parsing message');
+            }
        };
 
        this.socket.onclose = () => {
            console.log("Websocket closed");
-        }
+        };
 
-        this.socket.onerror = (/** @type {any} */ e) => console.error('WebSocket error', e);
+        this.socket.onerror = (/** @type {Event} */ e) => {
+            console.error('WebSocket error', e);
+        };
    },
 
    stop() {
-        if (this.mediaRecorder) {
-            this.mediaRecorder.stop();
-        }
-
+        reset();
+
        if (this.mediaStream) {
            this.mediaStream.getTracks().forEach(t => t.stop());
            this.mediaStream = null;
        }
 
-        if (this.recognition) {
-            this.recognition.stop();
+        if (this.workletNode) {
+            this.micSource?.disconnect(this.workletNode);
+            this.workletNode.port.close();
+            this.workletNode.disconnect();
        }
 
        if (this.socket?.readyState === WebSocket.OPEN) {
@@ -152,24 +112,66 @@ export const realtimeChat = {
    }
 };
 
+
+function buildWebsocketUrl() {
+    let url = '';
+    const host = PUBLIC_SERVICE_URL.split('://');
+
+    if (PUBLIC_SERVICE_URL.startsWith('https')) {
+        url = `wss:${host[1]}`;
+    } else if (PUBLIC_SERVICE_URL.startsWith('http')) {
+        url = `ws:${host[1]}`;
+    }
+
+    return url;
+}
+
+function reset() {
+    isPlaying = false;
+    audioQueue = [];
+}
+
 /**
- * @param {ArrayBuffer} buffer
+ * @param {string} base64Audio
  */
-function arrayBufferToBase64(buffer) {
-    var binary = "";
-    var bytes = new Uint8Array(buffer);
-    var len = bytes.byteLength;
-    for (var i = 0; i < len; i++) {
-        binary += String.fromCharCode(bytes[i]);
+function enqueueAudioChunk(base64Audio) {
+    const arrayBuffer = base64ToArrayBuffer(base64Audio);
+    const float32Data = convert16BitPCMToFloat32(arrayBuffer);
+
+    const audioBuffer = audioCtx.createBuffer(1, float32Data.length, sampleRate);
+    audioBuffer.getChannelData(0).set(float32Data);
+    audioQueue.push(audioBuffer);
+
+    if (!isPlaying) {
+        playNext();
    }
-    return window.btoa(binary);
 }
 
+function playNext() {
+    if (audioQueue.length === 0) {
+        isPlaying = false;
+        return;
+    }
+
+    isPlaying = true;
+    const buffer = audioQueue.shift();
+
+    const source = audioCtx.createBufferSource();
+    source.buffer = buffer;
+    source.connect(audioCtx.destination);
+
+    source.onended = () => {
+        playNext();
+    };
+    source.start();
+}
+
+
 /**
  * @param {string} workletName
  * @param {string} workletSrc
  */
-function createWorketFromSrc(workletName, workletSrc) {
+function createWorkletFromSrc(workletName, workletSrc) {
    const script = new Blob(
        [`registerProcessor("${workletName}", ${workletSrc})`],
        {
@@ -178,4 +180,51 @@ function createWorketFromSrc(workletName, workletSrc) {
    );
 
    return URL.createObjectURL(script);
+};
+
+
+/**
+ * @param {ArrayBuffer} buffer
+ */
+function arrayBufferToBase64(buffer) {
+    var binary = "";
+    var bytes = new Uint8Array(buffer);
+    var len = bytes.byteLength;
+    for (var i = 0; i < len; i++) {
+        binary += String.fromCharCode(bytes[i]);
+    }
+    return btoa(binary);
+};
+
+/**
+ * @param {string} base64
+ */
+function base64ToArrayBuffer(base64) {
+    const binaryStr = atob(base64);
+    const len = binaryStr.length;
+    const bytes = new Uint8Array(len);
+
+    for (let i = 0; i < len; i++) {
+        bytes[i] = binaryStr.charCodeAt(i);
+    }
+    return bytes.buffer;
+};
+
+/**
+ * @param {ArrayBuffer} buffer
+ */
+function convert16BitPCMToFloat32(buffer) {
+    const chunk = new Uint8Array(buffer);
+    const output = new Float32Array(chunk.length / 2);
+    const dataView = new DataView(chunk.buffer);
+
+    for (let i = 0; i< chunk.length / 2; i++) {
+        try {
+            const int16 = dataView.getInt16(i * 2, true);
+            output[i] = int16 / 32767;
+        } catch (e) {
+            console.error(e);
+        }
+    }
+    return output;
 };
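
Note on the playback path added above: the new onmessage handler only reads json.event and json.media.payload, so a server frame presumably looks like the sketch below. This is illustrative only and not part of the commit; the helper bodies are condensed copies of the ones in this diff so the snippet runs on its own, and the sample values are made up.

// Condensed copy of arrayBufferToBase64 from the diff.
function arrayBufferToBase64(buffer) {
    let binary = "";
    const bytes = new Uint8Array(buffer);
    for (let i = 0; i < bytes.byteLength; i++) binary += String.fromCharCode(bytes[i]);
    return btoa(binary);
}

// Condensed copy of base64ToArrayBuffer from the diff.
function base64ToArrayBuffer(base64) {
    const binaryStr = atob(base64);
    const bytes = new Uint8Array(binaryStr.length);
    for (let i = 0; i < binaryStr.length; i++) bytes[i] = binaryStr.charCodeAt(i);
    return bytes.buffer;
}

// Condensed copy of convert16BitPCMToFloat32 from the diff (little-endian PCM16 -> Float32).
function convert16BitPCMToFloat32(buffer) {
    const view = new DataView(buffer);
    const output = new Float32Array(buffer.byteLength / 2);
    for (let i = 0; i < output.length; i++) output[i] = view.getInt16(i * 2, true) / 32767;
    return output;
}

// A made-up 'media' frame carrying three PCM16 samples as its base64 payload.
const frame = JSON.stringify({
    event: "media",
    media: { payload: arrayBufferToBase64(new Int16Array([0, 16384, -16384]).buffer) },
});

// Same parse/decode path the new onmessage handler takes before queueing audio.
const json = JSON.parse(frame);
const float32 = convert16BitPCMToFloat32(base64ToArrayBuffer(json.media.payload));
console.log(float32); // Float32Array(3) [ 0, ~0.5, ~-0.5 ]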

src/routes/chat/[agentId]/[conversationId]/chat-box.svelte

Lines changed: 0 additions & 5 deletions
@@ -46,7 +46,6 @@
 } from '$env/static/public';
 import { BOT_SENDERS, LEARNER_ID, TRAINING_MODE, USER_SENDERS, ADMIN_ROLES, IMAGE_DATA_PREFIX } from '$lib/helpers/constants';
 import { signalr } from '$lib/services/signalr-service.js';
-import { llmRealtime } from '$lib/services/llm-realtime-service.js';
 import { newConversation } from '$lib/services/conversation-service';
 import DialogModal from '$lib/common/DialogModal.svelte';
 import HeadTitle from '$lib/common/HeadTitle.svelte';
@@ -674,14 +673,10 @@
        if (disableSpeech) return;

        if (!isListening) {
-            // llmRealtime.start(params.agentId, (/** @type {any} */ message) => {
-            //     console.log(message);
-            // });
            realtimeChat.start(params.agentId, params.conversationId);
            isListening = true;
            microphoneIcon = "microphone";
        } else {
-            // llmRealtime.stop();
            realtimeChat.stop();
            isListening = false;
            microphoneIcon = "microphone-off";
