Skip to content

Commit fc3c6af

Browse files
authored
Merge pull request #480 from open-edge-platform/update-branch
fix: digital avatar not splitting into sentence (#995)
2 parents 9f1cb12 + 3a0c6b3 commit fc3c6af

7 files changed

Lines changed: 434 additions & 105 deletions

File tree

usecases/ai/edge-ai-demo-studio/frontend/src/app/api/services/text-generation/chat/route.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import { getPayload } from 'payload'
1717
import { engines } from '@/engines/registry'
1818
import { logger } from '@/lib/logger'
1919
import { buildMcpTools } from '@/lib/mcp-tools'
20+
import { SentenceProcessor } from '@/lib/sentence-processor'
2021
import type { Service } from '@/payload-types'
2122
import { metaMap } from '@/services/_generated/meta'
2223
import { hermesToolMiddleware } from '@ai-sdk-tool/parser'
@@ -129,6 +130,39 @@ async function getWorkloadModel(
129130
)
130131
}
131132

133+
/**
 * Lipsync options that may accompany a chat request. Every field is passed
 * through unchanged in the JSON payload posted to the lipsync service.
 */
interface LipsyncConfig {
  /** Forwarded as `session_id`. */
  sessionId: string
  /** Forwarded as `voice`. */
  voice: string
  /** Forwarded as `speed`. */
  speed: string
}
138+
139+
/**
 * Posts a single sentence to the local lipsync service without waiting for
 * the reply.
 *
 * Returns immediately when `metaMap` has no `lipsync` entry. The request is
 * fire-and-forget: this function never throws and never delays the caller.
 * Transport failures and non-2xx HTTP replies are both reported through
 * `logger.error`.
 *
 * NOTE(review): consecutive sentences are dispatched as independent requests,
 * so arrival order at the service is not enforced here — confirm the lipsync
 * service tolerates that.
 *
 * @param sentence - Text sent as the `text` field of the payload.
 * @param lipsync - Session, voice and speed values forwarded with the sentence.
 */
function dispatchSentenceToLipsync(
  sentence: string,
  lipsync: LipsyncConfig,
): void {
  const lipsyncMeta = metaMap['lipsync']
  if (!lipsyncMeta) return

  // Derive the TTS URL server-side from trusted config to prevent SSRF.
  const ttsMeta = metaMap['text-to-speech']
  const ttsUrl = ttsMeta ? `http://localhost:${ttsMeta.port}/v1` : undefined

  // `void` marks the promise as intentionally not awaited.
  void fetch(`http://localhost:${lipsyncMeta.port}/v1/lipsync/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      text: sentence,
      session_id: lipsync.sessionId,
      chat_type: 'echo',
      voice: lipsync.voice,
      speed: lipsync.speed,
      ...(ttsUrl ? { tts_url: ttsUrl } : {}),
    }),
  })
    .then((res) => {
      // fetch() resolves for HTTP error statuses, so they must be checked
      // explicitly or a rejected sentence would disappear without a trace.
      if (!res.ok) {
        logger.error(
          `Lipsync service rejected sentence: HTTP ${res.status} ${res.statusText}`,
        )
      }
    })
    .catch((err: unknown) => {
      logger.error('Failed to dispatch sentence to lipsync:', err)
    })
}
165+
132166
export async function POST(req: Request) {
133167
let body: {
134168
messages: UIMessage[]
@@ -141,6 +175,7 @@ export async function POST(req: Request) {
141175
knowledgeBaseId?: number
142176
disableReasoning?: boolean
143177
mcpServerIds?: number[]
178+
lipsync?: LipsyncConfig
144179
}
145180

146181
try {
@@ -160,8 +195,10 @@ export async function POST(req: Request) {
160195
knowledgeBaseId,
161196
disableReasoning,
162197
mcpServerIds,
198+
lipsync,
163199
} = body
164200

201+
// Get available model
165202
let model: string
166203
const textGenerationMeta = metaMap['text-generation']
167204
try {
@@ -224,6 +261,9 @@ export async function POST(req: Request) {
224261
cleanupImageMessage(messages),
225262
)
226263

264+
// Initialize sentence processor for lipsync sentence-by-sentence streaming
265+
const sentenceProcessor = lipsync ? new SentenceProcessor() : null
266+
227267
const result = streamText({
228268
model: wrappedModel,
229269
system: systemPrompt,
@@ -235,7 +275,21 @@ export async function POST(req: Request) {
235275
...(mcpTools && Object.keys(mcpTools.tools).length > 0
236276
? { tools: mcpTools.tools, stopWhen: stepCountIs(5) }
237277
: {}),
278+
onChunk({ chunk }) {
279+
if (chunk.type === 'text-delta' && sentenceProcessor && lipsync) {
280+
const sentences = sentenceProcessor.addTextChunk(chunk.text)
281+
for (const sentence of sentences) {
282+
dispatchSentenceToLipsync(sentence, lipsync)
283+
}
284+
}
285+
},
238286
onFinish: async () => {
287+
if (sentenceProcessor && lipsync) {
288+
const finalSentences = sentenceProcessor.flush()
289+
for (const sentence of finalSentences) {
290+
dispatchSentenceToLipsync(sentence, lipsync)
291+
}
292+
}
239293
await mcpTools?.cleanup()
240294
},
241295
})
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// Copyright (C) 2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
/** Sentences are held back and merged until they reach this many words. */
const MIN_WORDS_PER_SENTENCE = 5

/**
 * Matches one emoji code point together with any variation selectors
 * (U+FE0E / U+FE0F) or zero-width joiners (U+200D) that directly follow it.
 *
 * Covers the original ranges plus Supplemental Symbols and Pictographs
 * (U+1F900–1F9FF) and Symbols and Pictographs Extended-A (U+1FA70–1FAFF).
 * Selectors and joiners are only consumed when they trail an emoji, so
 * joiners used by other scripts are left alone.
 */
const EMOJI_REGEX =
  /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F900}-\u{1F9FF}\u{1FA70}-\u{1FAFF}][\u{FE0E}\u{FE0F}\u{200D}]*/gu

/**
 * Strips emoji from `text` and collapses the runs of spaces that removal
 * leaves behind (e.g. `"a 😀 b"` becomes `"a b"`, not `"a  b"`).
 */
function removeEmojis(text: string): string {
  return text.replace(EMOJI_REGEX, '').replace(/ {2,}/g, ' ')
}

/** Counts whitespace-separated words; empty or blank input yields 0. */
function countWords(text: string): number {
  return text.match(/\S+/g)?.length ?? 0
}

/**
 * Processes streaming text into complete sentences using Intl.Segmenter.
 * Buffers short sentences together until a minimum word count is reached.
 */
export class SentenceProcessor {
  private readonly segmenter: Intl.Segmenter
  /** Raw tail of the stream not yet confirmed to be a finished sentence. */
  private accumulatedText = ''
  /** Cleaned sentences merged so far that are still under the word minimum. */
  private sentenceBuffer = ''

  /** @param locale - Locale passed to `Intl.Segmenter`; defaults to `'en'`. */
  constructor(locale: string = 'en') {
    this.segmenter = new Intl.Segmenter(locale, { granularity: 'sentence' })
  }

  /**
   * Feed a new chunk of streamed text.
   *
   * @returns Sentences completed by this chunk, in order. The final segment
   * is always retained because more text may still extend it.
   */
  addTextChunk(chunk: string): string[] {
    this.accumulatedText += chunk
    const segments = this.split(this.accumulatedText)

    // The last segment may be an unfinished sentence: keep it for next time.
    this.accumulatedText = segments.pop() ?? ''

    return this.collect(segments)
  }

  /**
   * Flush all remaining buffered text as final sentences.
   *
   * @returns Whatever is left, including a trailing fragment shorter than the
   * word minimum. The processor is empty afterwards and can be reused.
   */
  flush(): string[] {
    const remaining = this.accumulatedText
    // Always clear, even when only whitespace is left over.
    this.accumulatedText = ''

    const completedSentences = remaining.trim()
      ? this.collect(this.split(remaining))
      : []

    const finalSentence = this.flushBuffer()
    if (finalSentence) {
      completedSentences.push(finalSentence)
    }

    return completedSentences
  }

  /** Splits `text` into sentence segments, preserving their order. */
  private split(text: string): string[] {
    return Array.from(this.segmenter.segment(text), (part) => part.segment)
  }

  /** Runs each segment through the word-count buffer and gathers the output. */
  private collect(segments: readonly string[]): string[] {
    const ready: string[] = []
    for (const segment of segments) {
      const sentence = this.processSentence(segment)
      if (sentence) {
        ready.push(sentence)
      }
    }
    return ready
  }

  /**
   * Cleans one segment and appends it to the buffer. Returns the buffered
   * text once it reaches `MIN_WORDS_PER_SENTENCE` words, otherwise `null`.
   */
  private processSentence(sentence: string): string | null {
    const cleanSentence = removeEmojis(sentence).trim()
    if (!cleanSentence) return null

    this.sentenceBuffer += (this.sentenceBuffer ? ' ' : '') + cleanSentence

    if (countWords(this.sentenceBuffer) < MIN_WORDS_PER_SENTENCE) {
      return null
    }

    const result = this.sentenceBuffer
    this.sentenceBuffer = ''
    return result
  }

  /** Empties the buffer, returning its trimmed content or `null` if blank. */
  private flushBuffer(): string | null {
    const result = this.sentenceBuffer.trim()
    this.sentenceBuffer = ''
    return result ? result : null
  }
}

usecases/ai/edge-ai-demo-studio/frontend/src/samples/common/hooks/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ export { useWakeWordTrigger } from './use-wake-word-trigger'
1515
export { useRagParams } from './use-rag-params'
1616
export { useRagChatSetup } from './use-rag-chat-setup'
1717
export { useWakeWordStt } from './use-wake-word-stt'
18+
export { useSentenceSpeech } from './use-sentence-speech'
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// Copyright (C) 2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
'use client'
5+
6+
import { useCallback, useEffect, useRef } from 'react'
7+
import { SentenceProcessor } from '@/lib/sentence-processor'
8+
import type {
9+
ChatMsg,
10+
ChatStatus,
11+
} from '@/services/text-generation/components/chat-helpers'
12+
import { extractTextContent } from '@/services/text-generation/components/chat-helpers'
13+
14+
/** Inputs accepted by `useSentenceSpeech`. */
interface UseSentenceSpeechOptions {
  /** Chat transcript; only the last entry is read, and only if its role is `assistant`. */
  messages: ChatMsg[]
  /** Current chat status; reaching `'ready'` triggers the final flush. */
  status: ChatStatus
  /** Invoked once for every sentence completed while the response streams in. */
  onSentence: (sentence: string) => void
  /** Invoked once every sentence, including those from the final flush, has been dispatched. */
  onComplete?: () => void
  /** Pass `false` to switch processing off (e.g. when TTS is offline). */
  enabled?: boolean
}
24+
25+
/**
 * Watches streaming chat messages and splits the assistant's response into
 * sentences in real-time using `SentenceProcessor`. Calls `onSentence` for
 * each completed sentence, enabling progressive TTS playback.
 *
 * When `status` becomes `'ready'` the remaining buffered text is flushed and
 * `onComplete` is invoked. Completion is only reported once an assistant
 * message has been tracked, so an idle chat on first render does not trigger
 * a spurious `onComplete`.
 *
 * @returns `reset`, which clears the tracked message and sentence state.
 */
export function useSentenceSpeech({
  messages,
  status,
  onSentence,
  onComplete,
  enabled = true,
}: UseSentenceSpeechOptions) {
  // Number of characters of the tracked message already fed to the processor.
  const processedLengthRef = useRef(0)
  const processorRef = useRef<SentenceProcessor>(new SentenceProcessor())
  const prevMessageIdRef = useRef<string | null>(null)
  const flushedRef = useRef(false)
  // True once any assistant message has been tracked since mount. Deliberately
  // not cleared by `reset()` so only the "nothing ever happened" case is gated.
  const hasTrackedMessageRef = useRef(false)

  // Store callbacks in refs so they never cause effect re-runs
  const onSentenceRef = useRef(onSentence)
  const onCompleteRef = useRef(onComplete)
  useEffect(() => {
    onSentenceRef.current = onSentence
    onCompleteRef.current = onComplete
  }, [onSentence, onComplete])

  // Mirror latest messages/status into refs for use in transition effects
  const messagesRef = useRef(messages)
  const statusRef = useRef(status)
  useEffect(() => {
    messagesRef.current = messages
  }, [messages])
  useEffect(() => {
    statusRef.current = status
  }, [status])

  // When enabled transitions false → true, fast-forward the tracked state to
  // the current assistant message so stale content is not re-spoken.
  const prevEnabledRef = useRef(enabled)
  useEffect(() => {
    if (prevEnabledRef.current === enabled) return
    prevEnabledRef.current = enabled

    if (enabled) {
      const msgs = messagesRef.current
      const lastMsg = msgs[msgs.length - 1]
      if (lastMsg?.role === 'assistant') {
        prevMessageIdRef.current = lastMsg.id
        processedLengthRef.current = extractTextContent(lastMsg).length
        processorRef.current = new SentenceProcessor()
        // If the message is still streaming, a later flush must still run.
        flushedRef.current = statusRef.current !== 'streaming'
        hasTrackedMessageRef.current = true
      }
    }
  }, [enabled])

  // Process new text chunks as the streaming message grows
  useEffect(() => {
    if (!enabled) return

    const lastMsg = messages[messages.length - 1]
    if (!lastMsg || lastMsg.role !== 'assistant') return

    // Reset processor when a new assistant message starts
    if (lastMsg.id !== prevMessageIdRef.current) {
      prevMessageIdRef.current = lastMsg.id
      processedLengthRef.current = 0
      processorRef.current = new SentenceProcessor()
      flushedRef.current = false
      hasTrackedMessageRef.current = true
    }

    // Don't re-process after flush
    if (flushedRef.current) return

    // Only the text appended since the previous run is fed to the processor.
    const text = extractTextContent(lastMsg)
    const newText = text.slice(processedLengthRef.current)
    if (!newText) return

    processedLengthRef.current = text.length

    const sentences = processorRef.current.addTextChunk(newText)
    for (const sentence of sentences) {
      onSentenceRef.current(sentence)
    }
  }, [messages, enabled])

  // Flush remaining text when streaming completes
  useEffect(() => {
    if (!enabled) return
    if (status !== 'ready') return
    if (flushedRef.current) return
    // No assistant message has ever been tracked (e.g. first render with an
    // idle chat): nothing to flush and no response to report as complete.
    if (!hasTrackedMessageRef.current) return

    flushedRef.current = true

    const sentences = processorRef.current.flush()
    for (const sentence of sentences) {
      onSentenceRef.current(sentence)
    }

    onCompleteRef.current?.()
  }, [status, enabled])

  const reset = useCallback(() => {
    processedLengthRef.current = 0
    processorRef.current = new SentenceProcessor()
    prevMessageIdRef.current = null
    flushedRef.current = false
  }, [])

  return { reset }
}

usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar-lite/components/chat-panel.tsx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ interface ChatPanelProps {
2020
sttOnline?: boolean
2121
disabled?: boolean
2222
isVlm?: boolean
23+
isSpeaking?: boolean
2324
imagePreview?: string | null
2425
onImageSelect?: (e: React.ChangeEvent<HTMLInputElement>) => void
2526
onImageRemove?: () => void
@@ -36,14 +37,19 @@ export function ChatPanel({
3637
sttOnline,
3738
disabled,
3839
isVlm,
40+
isSpeaking,
3941
imagePreview,
4042
onImageSelect,
4143
onImageRemove,
4244
}: ChatPanelProps) {
45+
// Show the stop button while the avatar is speaking, even after LLM finishes
46+
const effectiveStatus =
47+
isSpeaking && status === 'ready' ? 'streaming' : status
48+
4349
return (
4450
<VlmChatPanel
4551
messages={messages}
46-
status={status}
52+
status={effectiveStatus}
4753
input={input}
4854
onInputChange={onInputChange}
4955
onSend={onSend}

0 commit comments

Comments
 (0)