Skip to content

Commit 67afe04

Browse files
pranavjoshi001Rushikesh Gavalicompulim
authored
Core speech to speech implementation (#5654)
* initial no-op s2s core implementation * minor * refactor to align close to activity structure * refactor composer to not use direct state inside effect * more implementation chunk * minor refactor * Mic Implementation and animation in fluent theme * test case added * screenshot added * refactor * increase sec to capture more outgoing event in test file * changelog updated * refactor code as per code review * remove not needed files * test case updated * refactor as per comment * review comment fixed * instead of core import from webchat internal * update screen shot * remaining review comment fixed * modify mock to try to fix html test * fixing 1 test case * increase duration in one test to eait for audio playback * left over review comment * trying to revert diff image * reverting diff image * activity spec changes and capabilities integration * fix TS issue * Apply PR suggestions * Fix InferOutput * Fix Prettier * Fix Prettier --------- Co-authored-by: Rushikesh Gavali <rgavali_linux@microsoft.com> Co-authored-by: William Wong <compulim@users.noreply.github.com>
1 parent f7a730f commit 67afe04

File tree

81 files changed

+3984
-60
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

81 files changed

+3984
-60
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ Breaking changes in this release:
150150
- The `botframework-webchat` package now uses CSS modules for styling purposes, in PR [#5666](https://github.com/microsoft/BotFramework-WebChat/pull/5666), in PR [#5677](https://github.com/microsoft/BotFramework-WebChat/pull/5677) by [@OEvgeny](https://github.com/OEvgeny)
151151
- 👷🏻 Added `npm run build-browser` script for building test harness package only, in PR [#5667](https://github.com/microsoft/BotFramework-WebChat/pull/5667), by [@compulim](https://github.com/compulim)
152152
- Added pull-based capabilities system for dynamically discovering adapter capabilities at runtime, in PR [#5679](https://github.com/microsoft/BotFramework-WebChat/pull/5679), by [@pranavjoshi001](https://github.com/pranavjoshi001)
153+
- Added Speech-to-Speech (S2S) support for real-time voice conversations, in PR [#5654](https://github.com/microsoft/BotFramework-WebChat/pull/5654), by [@pranavjoshi001](https://github.com/pranavjoshi001)
153154

154155
### Changed
155156

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/* global AudioContext */

/**
 * Mocks AudioContext.createBuffer to return buffers with a minimum duration.
 *
 * Test audio chunks are tiny (a few samples), so real playback would finish
 * before assertions can observe the "playing" state. Padding every created
 * buffer to a minimum length keeps playback observable long enough for tests.
 *
 * @param {number} [minDurationInSeconds=0.5] - Minimum duration, in seconds, each created buffer is padded to.
 * @returns {{ restore: () => void }} Handle whose `restore()` reinstates the original `createBuffer`.
 */
export function setupMockAudioPlayback(minDurationInSeconds = 0.5) {
  const originalCreateBuffer = AudioContext.prototype.createBuffer;

  AudioContext.prototype.createBuffer = function (numberOfChannels, length, sampleRate) {
    // Pad short buffers up to the minimum duration; longer buffers pass through unchanged.
    const minSamples = Math.floor(sampleRate * minDurationInSeconds);
    const actualLength = Math.max(length, minSamples);

    return originalCreateBuffer.call(this, numberOfChannels, actualLength, sampleRate);
  };

  return {
    restore: () => {
      AudioContext.prototype.createBuffer = originalCreateBuffer;
    }
  };
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/* global clearInterval, MessageChannel, navigator, setInterval, URL, window */

/**
 * Mocks browser audio APIs for speechToSpeech testing.
 *
 * - Intercepts AudioContext.audioWorklet.addModule() to prevent blob execution
 * - Mocks AudioWorkletNode for the 'audio-recorder' processor
 * - Mocks navigator.mediaDevices.getUserMedia() to return a test audio stream
 *
 * @returns {{ restore: () => void }} Handle whose `restore()` reinstates the original
 *   `window.AudioContext`, `window.AudioWorkletNode`, and `navigator.mediaDevices.getUserMedia`,
 *   consistent with `setupMockAudioPlayback()`.
 */
export function setupMockMediaDevices() {
  const OriginalAudioContext = window.AudioContext;
  const OriginalAudioWorkletNode = window.AudioWorkletNode;
  const originalGetUserMedia = navigator.mediaDevices.getUserMedia;

  // Intercept AudioContext to mock audioWorklet.addModule
  window.AudioContext = function (options) {
    const ctx = new OriginalAudioContext(options);

    ctx.audioWorklet.addModule = url => {
      // The worklet module is never executed under test; release any blob URL
      // the real code created for it so it does not leak.
      if (url.startsWith('blob:')) {
        URL.revokeObjectURL(url);
      }
      return Promise.resolve();
    };

    return ctx;
  };

  // Keep instanceof checks and static members working on the patched constructor.
  Object.setPrototypeOf(window.AudioContext, OriginalAudioContext);
  window.AudioContext.prototype = OriginalAudioContext.prototype;

  // Mock AudioWorkletNode - uses GainNode as base so source.connect() works
  window.AudioWorkletNode = function (context, name, options) {
    const node = context.createGain();
    const channel = new MessageChannel();
    let recording = false;
    let intervalId = null;

    node.port = channel.port1;

    // port1 is exposed as worklet.port to the real code
    // Real code sends to port1 → received by port2.onmessage (commands)
    // Mock sends from port2 → received by port1.onmessage (audio chunks)
    channel.port2.onmessage = ({ data }) => {
      if (data.command === 'START') {
        recording = true;
        const bufferSize = options?.processorOptions?.bufferSize || 2400;

        // Send chunks at ~100ms intervals while recording
        // Use port2.postMessage so port1.onmessage (set by real code) receives it
        // Guard so a repeated START does not leak the previous interval.
        if (intervalId === null) {
          intervalId = setInterval(() => {
            if (recording) {
              channel.port2.postMessage({ eventType: 'audio', audioData: new Float32Array(bufferSize) });
            }
          }, 100);
        }
      } else if (data.command === 'STOP') {
        recording = false;
        if (intervalId) {
          clearInterval(intervalId);
          intervalId = null;
        }
      }
    };

    return node;
  };

  // Mock getUserMedia with oscillator-based test stream
  navigator.mediaDevices.getUserMedia = constraints => {
    const sampleRate = constraints?.audio?.sampleRate || 24000;
    const ctx = new OriginalAudioContext({ sampleRate });
    const oscillator = ctx.createOscillator();
    const destination = ctx.createMediaStreamDestination();

    oscillator.connect(destination);
    oscillator.start();

    // Tear down the oscillator graph when the test stops the track, mirroring
    // how real capture releases the microphone.
    destination.stream.getTracks().forEach(track => {
      const originalStop = track.stop.bind(track);
      track.stop = () => {
        oscillator.stop();
        ctx.close();
        originalStop();
      };
    });

    return Promise.resolve(destination.stream);
  };

  return {
    restore: () => {
      window.AudioContext = OriginalAudioContext;
      window.AudioWorkletNode = OriginalAudioWorkletNode;
      navigator.mediaDevices.getUserMedia = originalGetUserMedia;
    }
  };
}
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
<!doctype html>
<html lang="en-US">
  <head>
    <link href="/assets/index.css" rel="stylesheet" type="text/css" />
    <script crossorigin="anonymous" src="https://unpkg.com/@babel/standalone@7.8.7/babel.min.js"></script>
    <script crossorigin="anonymous" src="https://unpkg.com/react@16.8.6/umd/react.production.min.js"></script>
    <script crossorigin="anonymous" src="https://unpkg.com/react-dom@16.8.6/umd/react-dom.production.min.js"></script>
    <script crossorigin="anonymous" src="/test-harness.js"></script>
    <script crossorigin="anonymous" src="/test-page-object.js"></script>
    <script crossorigin="anonymous" src="/__dist__/webchat-es5.js"></script>
    <script crossorigin="anonymous" src="/__dist__/botframework-webchat-fluent-theme.production.min.js"></script>
  </head>
  <body>
    <main id="webchat"></main>
    <!--
      Test: Barge-in scenario with full state cycle

      Flow:
      1. User starts recording → "Listening..."
      2. Bot sends audio chunks → "Talk to interrupt..." (bot speaking)
      3. User barges in (server detects) → "Listening..." (user speaking)
      4. Server processes → "Processing..."
      5. Bot responds with new audio → "Talk to interrupt..." (bot speaking again)
      6. User toggles mic off
    -->
    <script type="module">
      // Real browser audio APIs are not controllable under the harness, so
      // they are replaced with deterministic mocks before Web Chat boots.
      import { setupMockMediaDevices } from '/assets/esm/speechToSpeech/mockMediaDevices.js';
      import { setupMockAudioPlayback } from '/assets/esm/speechToSpeech/mockAudioPlayback.js';

      setupMockMediaDevices();
      setupMockAudioPlayback();
    </script>
    <script type="text/babel">
      run(async function () {
        const {
          React,
          ReactDOM: { render },
          WebChat: { FluentThemeProvider, ReactWebChat, testIds }
        } = window;

        const { directLine, store } = testHelpers.createDirectLineEmulator();

        // Set voice configuration capability to enable microphone button
        directLine.setCapability('getVoiceConfiguration', { sampleRate: 24000, chunkIntervalMs: 100 }, { emitEvent: false });

        render(
          <FluentThemeProvider variant="fluent">
            <ReactWebChat
              directLine={directLine}
              store={store}
            />
          </FluentThemeProvider>,
          document.getElementById('webchat')
        );

        await pageConditions.uiConnected();

        const micButton = document.querySelector(`[data-testid="${testIds.sendBoxMicrophoneButton}"]`);
        const textArea = document.querySelector(`[data-testid="${testIds.sendBoxTextBox}"]`);
        expect(micButton).toBeTruthy();
        expect(textArea).toBeTruthy();

        // Start recording
        await host.click(micButton);

        await pageConditions.became(
          'Recording started',
          () => micButton.getAttribute('aria-label')?.includes('Microphone on'),
          1000
        );

        // VERIFY: State is "listening"
        await pageConditions.became(
          'State: listening → Placeholder: "Listening..."',
          () => textArea.getAttribute('placeholder') === 'Listening...',
          2000
        );

        // Bot starts speaking (sends audio chunks)
        // Two chunks are sent so playback stays in-flight long enough to observe.
        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'media.chunk',
          from: { role: 'bot' },
          value: { content: 'AAAAAA==', contentType: 'audio/webm' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.chunk'
        });

        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'media.chunk',
          from: { role: 'bot' },
          value: { content: 'AAAAAA==', contentType: 'audio/webm' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.chunk'
        });

        // VERIFY: State is "bot_speaking" (isPlaying = true)
        await pageConditions.became(
          'State: bot_speaking → Placeholder: "Talk to interrupt..."',
          () => textArea.getAttribute('placeholder') === 'Talk to interrupt...',
          1000
        );

        // VERIFY: Mic button has pulse animation during bot speaking
        expect(micButton.className).toMatch(/with-pulse/);

        // User barges in (server detects user speech)
        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'request.update',
          from: { role: 'bot' },
          value: { state: 'detected', message: 'Your request is identified' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.state'
        });

        // VERIFY: State changes to "user_speaking" - bot audio stopped
        await pageConditions.became(
          'State: user_speaking → Placeholder: "Listening…" (barge-in worked)',
          () => textArea.getAttribute('placeholder') === 'Listening...',
          1000
        );

        // VERIFY: Mic button still has pulse animation during user speaking
        expect(micButton.className).toMatch(/with-pulse/);

        // Server processes the user's interrupted request
        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'request.update',
          from: { role: 'bot' },
          value: { state: 'processing', message: 'Your request is being processed' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.state'
        });

        // VERIFY: State is "processing"
        await pageConditions.became(
          'State: processing → Placeholder: "Processing…"',
          () => textArea.getAttribute('placeholder') === 'Processing...',
          1000
        );

        // User transcript arrives
        // media.end carries the finalized transcription; origin distinguishes
        // user vs. agent utterances in the transcript.
        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'media.end',
          from: { role: 'bot' },
          text: 'Stop! Change my destination.',
          value: { transcription: 'Stop! Change my destination.', origin: 'user' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.transcript'
        });

        await pageConditions.numActivitiesShown(1);

        // Bot responds with new audio
        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'media.chunk',
          from: { role: 'bot' },
          value: { content: 'AAAAAA==', contentType: 'audio/webm' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.chunk'
        });

        // VERIFY: State is "bot_speaking" again
        await pageConditions.became(
          'State: bot_speaking → Placeholder: "Talk to interrupt..." (bot responding)',
          () => textArea.getAttribute('placeholder') === 'Talk to interrupt...',
          1000
        );

        // Bot transcript arrives
        await directLine.emulateIncomingVoiceActivity({
          type: 'event',
          name: 'media.end',
          from: { role: 'bot' },
          text: 'Sure, where would you like to go instead?',
          value: { transcription: 'Sure, where would you like to go instead?', origin: 'agent' },
          valueType: 'application/vnd.microsoft.activity.azure.directline.audio.transcript'
        });

        await pageConditions.numActivitiesShown(2);

        // Verify both messages appear
        const activities = pageElements.activityContents();
        expect(activities[0]).toHaveProperty('textContent', 'Stop! Change my destination.');
        expect(activities[1]).toHaveProperty('textContent', 'Sure, where would you like to go instead?');

        // Toggle mic off
        await host.click(micButton);

        await pageConditions.became(
          'Recording stopped',
          () => micButton.getAttribute('aria-label')?.includes('Microphone off'),
          1000
        );
      });
    </script>
  </body>
</html>
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
<!doctype html>
<html lang="en-US">
  <head>
    <link href="/assets/index.css" rel="stylesheet" type="text/css" />
    <script crossorigin="anonymous" src="https://unpkg.com/@babel/standalone@7.8.7/babel.min.js"></script>
    <script crossorigin="anonymous" src="https://unpkg.com/react@16.8.6/umd/react.production.min.js"></script>
    <script crossorigin="anonymous" src="https://unpkg.com/react-dom@16.8.6/umd/react-dom.production.min.js"></script>
    <script crossorigin="anonymous" src="/test-harness.js"></script>
    <script crossorigin="anonymous" src="/test-page-object.js"></script>
    <script crossorigin="anonymous" src="/__dist__/webchat-es5.js"></script>
    <script crossorigin="anonymous" src="/__dist__/botframework-webchat-fluent-theme.production.min.js"></script>
  </head>
  <body>
    <main id="webchat"></main>
    <!--
      Test: Microphone button visibility is driven by the adapter's
      'getVoiceConfiguration' capability — shown while the capability is set,
      hidden once it is removed at runtime.
    -->
    <script type="text/babel">
      run(async function () {
        const {
          React,
          ReactDOM: { render },
          WebChat: { FluentThemeProvider, ReactWebChat, testIds }
        } = window;

        // GIVEN: Web Chat with Fluent Theme and microphone button enabled
        const { directLine, store } = testHelpers.createDirectLineEmulator();

        // Set voice configuration capability to enable microphone button
        directLine.setCapability('getVoiceConfiguration', { sampleRate: 24000, chunkIntervalMs: 100 }, { emitEvent: false });

        render(
          <FluentThemeProvider variant="fluent">
            <ReactWebChat
              directLine={directLine}
              store={store}
              styleOptions={{
                disableFileUpload: true,
                hideTelephoneKeypadButton: false,
              }}
            />
          </FluentThemeProvider>,
          document.getElementById('webchat')
        );

        await pageConditions.uiConnected();

        // THEN: Microphone button should be present
        const micButton = document.querySelector(`[data-testid="${testIds.sendBoxMicrophoneButton}"]`);
        expect(micButton).toBeTruthy();

        // THEN: Telephone keypad button should be present
        const keypadButton = document.querySelector(`[data-testid="${testIds.sendBoxTelephoneKeypadToolbarButton}"]`);
        expect(keypadButton).toBeTruthy();

        // THEN: Text counter should NOT be present
        const textCounter = document.querySelector('.sendbox__text-counter');
        expect(textCounter).toBeFalsy();

        // THEN: Send button should NOT be present
        const sendButton = document.querySelector(`[data-testid="${testIds.sendBoxSendButton}"]`);
        expect(sendButton).toBeFalsy();

        // THEN: Should show sendbox with microphone and keypad buttons
        await host.snapshot('local');

        // WHEN: Voice configuration is removed from directLine
        // (emits the capability-change event so the UI re-renders)
        directLine.setCapability('getVoiceConfiguration', undefined);

        // Wait for UI to update
        await pageConditions.became(
          'Microphone button should be hidden after removing voice configuration',
          () => !document.querySelector(`[data-testid="${testIds.sendBoxMicrophoneButton}"]`),
          1000
        );

        // THEN: Microphone button should NOT be present anymore
        const micButtonAfterRemoval = document.querySelector(`[data-testid="${testIds.sendBoxMicrophoneButton}"]`);
        expect(micButtonAfterRemoval).toBeFalsy();
      });
    </script>
  </body>
</html>
4.03 KB
Loading

0 commit comments

Comments
 (0)