Merge pull request #480 from open-edge-platform/update-branch

gooishin · web-flow · commit fc3c6af91455 · 2026-04-15T10:21:47.000+08:00
fix: digital avatar not splitting into sentences (#995)
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/app/api/services/text-generation/chat/route.ts b/usecases/ai/edge-ai-demo-studio/frontend/src/app/api/services/text-generation/chat/route.ts
@@ -17,6 +17,7 @@ import { getPayload } from 'payload'
 import { engines } from '@/engines/registry'
 import { logger } from '@/lib/logger'
 import { buildMcpTools } from '@/lib/mcp-tools'
+import { SentenceProcessor } from '@/lib/sentence-processor'
 import type { Service } from '@/payload-types'
 import { metaMap } from '@/services/_generated/meta'
 import { hermesToolMiddleware } from '@ai-sdk-tool/parser'
@@ -129,6 +130,39 @@ async function getWorkloadModel(
   )
 }
 
+interface LipsyncConfig { // optional per-request lipsync settings read from the POST body
+  sessionId: string // sent to the lipsync service as `session_id`
+  voice: string // forwarded as `voice`
+  speed: string // forwarded as `speed` — NOTE(review): typed as string here; confirm the service expects that
+}
+
+function dispatchSentenceToLipsync( // POSTs one sentence to the local lipsync service without awaiting the response
+  sentence: string,
+  lipsync: LipsyncConfig,
+): void {
+  const lipsyncMeta = metaMap['lipsync']
+  if (!lipsyncMeta) return // no 'lipsync' entry in metaMap: silently do nothing
+
+  // Derive the TTS URL server-side from trusted config (never from the request) to prevent SSRF.
+  const ttsMeta = metaMap['text-to-speech']
+  const ttsUrl = ttsMeta ? `http://localhost:${ttsMeta.port}/v1` : undefined
+
+  fetch(`http://localhost:${lipsyncMeta.port}/v1/lipsync/chat`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      text: sentence,
+      session_id: lipsync.sessionId,
+      chat_type: 'echo', // NOTE(review): presumably makes the service speak the text as given — confirm
+      voice: lipsync.voice,
+      speed: lipsync.speed,
+      ...(ttsUrl ? { tts_url: ttsUrl } : {}), // tts_url is omitted entirely when metaMap has no 'text-to-speech' entry
+    }),
+  }).catch((err) => { // the promise is not awaited: failures are only logged; NOTE(review): concurrent calls may finish out of order
+    logger.error('Failed to dispatch sentence to lipsync:', err)
+  })
+}
+
 export async function POST(req: Request) {
   let body: {
     messages: UIMessage[]
@@ -141,6 +175,7 @@ export async function POST(req: Request) {
     knowledgeBaseId?: number
     disableReasoning?: boolean
     mcpServerIds?: number[]
+    lipsync?: LipsyncConfig
   }
 
   try {
@@ -160,8 +195,10 @@ export async function POST(req: Request) {
     knowledgeBaseId,
     disableReasoning,
     mcpServerIds,
+    lipsync,
   } = body
 
+  // Get available model
   let model: string
   const textGenerationMeta = metaMap['text-generation']
   try {
@@ -224,6 +261,9 @@ export async function POST(req: Request) {
         cleanupImageMessage(messages),
       )
 
+      // Initialize sentence processor for lipsync sentence-by-sentence streaming
+      const sentenceProcessor = lipsync ? new SentenceProcessor() : null
+
       const result = streamText({
         model: wrappedModel,
         system: systemPrompt,
@@ -235,7 +275,21 @@ export async function POST(req: Request) {
         ...(mcpTools && Object.keys(mcpTools.tools).length > 0
           ? { tools: mcpTools.tools, stopWhen: stepCountIs(5) }
           : {}),
+        onChunk({ chunk }) {
+          if (chunk.type === 'text-delta' && sentenceProcessor && lipsync) {
+            const sentences = sentenceProcessor.addTextChunk(chunk.text)
+            for (const sentence of sentences) {
+              dispatchSentenceToLipsync(sentence, lipsync)
+            }
+          }
+        },
         onFinish: async () => {
+          if (sentenceProcessor && lipsync) {
+            const finalSentences = sentenceProcessor.flush()
+            for (const sentence of finalSentences) {
+              dispatchSentenceToLipsync(sentence, lipsync)
+            }
+          }
           await mcpTools?.cleanup()
         },
       })
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/lib/sentence-processor.ts b/usecases/ai/edge-ai-demo-studio/frontend/src/lib/sentence-processor.ts
@@ -0,0 +1,100 @@
+// Copyright (C) 2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+const MIN_WORDS_PER_SENTENCE = 5 // buffered sentences are merged until they total at least this many words
+
+const EMOJI_REGEX = // six fixed ranges: U+1F600–1F64F, 1F300–1F5FF, 1F680–1F6FF, 1F1E0–1F1FF, 2600–26FF, 2700–27BF; emoji outside them (e.g. U+1F900–1F9FF) do not match
+  /[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]/gu
+
+function removeEmojis(text: string): string { // deletes every EMOJI_REGEX match (global flag); all other characters are kept
+  return text.replace(EMOJI_REGEX, '')
+}
+
+function countWords(text: string): number { // counts whitespace-separated tokens in `text`
+  return text
+    .trim()
+    .split(/\s+/)
+    .filter((word) => word.length > 0).length // ''.split(/\s+/) yields [''], so empties are dropped and blank input counts as 0
+}
+
+/**
+ * Splits streamed text into sentences via Intl.Segmenter (granularity 'sentence'), stripping emojis.
+ * Sentences are merged until the buffer holds MIN_WORDS_PER_SENTENCE words; flush() emits the remainder.
+ */
+export class SentenceProcessor {
+  private segmenter: Intl.Segmenter
+  private accumulatedText = '' // trailing segment carried between calls because it may still grow
+  private sentenceBuffer = '' // cleaned sentences joined by single spaces, waiting to reach the word minimum
+
+  constructor(locale: string = 'en') { // locale is passed straight to Intl.Segmenter
+    this.segmenter = new Intl.Segmenter(locale, { granularity: 'sentence' })
+  }
+
+  /** Append a streamed chunk and return the sentences completed by it (possibly an empty array). */
+  addTextChunk(chunk: string): string[] {
+    this.accumulatedText += chunk
+    const segments = Array.from(this.segmenter.segment(this.accumulatedText)) // re-segment the carried-over tail plus the new chunk
+    const completedSentences: string[] = []
+
+    // Every segment except the last is treated as complete; the last may be extended by later chunks
+    for (let i = 0; i < segments.length - 1; i++) {
+      const sentence = this.processSentence(segments[i].segment)
+      if (sentence) { // null means the segment was empty after cleaning or is still being buffered
+        completedSentences.push(sentence)
+      }
+    }
+
+    // Carry the trailing segment over to the next call (or to flush()); empty input leaves ''
+    const lastSegment = segments[segments.length - 1]
+    this.accumulatedText = lastSegment ? lastSegment.segment : ''
+
+    return completedSentences
+  }
+
+  /** Emit everything still pending: first the carried-over text, then whatever remains in the buffer. */
+  flush(): string[] {
+    const completedSentences: string[] = []
+
+    if (this.accumulatedText.trim()) { // whitespace-only leftovers are skipped (and not cleared)
+      const segments = Array.from(this.segmenter.segment(this.accumulatedText))
+      for (const segment of segments) {
+        const sentence = this.processSentence(segment.segment)
+        if (sentence) {
+          completedSentences.push(sentence)
+        }
+      }
+      this.accumulatedText = ''
+    }
+
+    const finalSentence = this.flushBuffer() // may hold fewer than MIN_WORDS_PER_SENTENCE words
+    if (finalSentence) {
+      completedSentences.push(finalSentence)
+    }
+
+    return completedSentences
+  }
+
+  private processSentence(sentence: string): string | null { // returns the merged buffer once it reaches the word minimum, else null
+    const cleanSentence = removeEmojis(sentence).trim()
+    if (!cleanSentence) return null // nothing left after removing emojis and surrounding whitespace
+
+    this.sentenceBuffer += (this.sentenceBuffer ? ' ' : '') + cleanSentence // single space between merged sentences
+
+    if (countWords(this.sentenceBuffer) >= MIN_WORDS_PER_SENTENCE) {
+      const result = this.sentenceBuffer
+      this.sentenceBuffer = ''
+      return result
+    }
+
+    return null
+  }
+
+  private flushBuffer(): string | null { // returns the trimmed buffer and clears it, or null when the buffer is blank
+    if (this.sentenceBuffer.trim()) {
+      const result = this.sentenceBuffer.trim()
+      this.sentenceBuffer = ''
+      return result
+    }
+    return null
+  }
+}
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/samples/common/hooks/index.ts b/usecases/ai/edge-ai-demo-studio/frontend/src/samples/common/hooks/index.ts
@@ -15,3 +15,4 @@ export { useWakeWordTrigger } from './use-wake-word-trigger'
 export { useRagParams } from './use-rag-params'
 export { useRagChatSetup } from './use-rag-chat-setup'
 export { useWakeWordStt } from './use-wake-word-stt'
+export { useSentenceSpeech } from './use-sentence-speech'
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/samples/common/hooks/use-sentence-speech.ts b/usecases/ai/edge-ai-demo-studio/frontend/src/samples/common/hooks/use-sentence-speech.ts
@@ -0,0 +1,133 @@
+// Copyright (C) 2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+'use client'
+
+import { useCallback, useEffect, useRef } from 'react'
+import { SentenceProcessor } from '@/lib/sentence-processor'
+import type {
+  ChatMsg,
+  ChatStatus,
+} from '@/services/text-generation/components/chat-helpers'
+import { extractTextContent } from '@/services/text-generation/components/chat-helpers'
+
+interface UseSentenceSpeechOptions {
+  messages: ChatMsg[] // only the last entry is inspected, and only when its role is 'assistant'
+  status: ChatStatus // remaining text is flushed when this is 'ready'
+  /** Called once per completed sentence, both while streaming and during the final flush */
+  onSentence: (sentence: string) => void
+  /** Called right after the final flush has passed its sentences to `onSentence` */
+  onComplete?: () => void
+  /** Set to false to disable processing (e.g. when TTS is offline); the hook defaults it to true */
+  enabled?: boolean
+}
+
+/**
+ * Watches the last chat message and, while it is an assistant message, feeds its newly
+ * appended text to a `SentenceProcessor`, calling `onSentence` for each completed sentence.
+ * Flushes and calls `onComplete` when `status` is 'ready'; returns `{ reset }` to clear state.
+ */
+export function useSentenceSpeech({
+  messages,
+  status,
+  onSentence,
+  onComplete,
+  enabled = true,
+}: UseSentenceSpeechOptions) {
+  const processedLengthRef = useRef(0) // characters of the tracked message already fed to the processor
+  const processorRef = useRef<SentenceProcessor>(new SentenceProcessor())
+  const prevMessageIdRef = useRef<string | null>(null) // id of the assistant message currently tracked
+  const flushedRef = useRef(false) // true once the tracked message was flushed; blocks further processing
+
+  // Store callbacks in refs so a new callback identity never re-runs the processing effects
+  const onSentenceRef = useRef(onSentence)
+  const onCompleteRef = useRef(onComplete)
+  useEffect(() => {
+    onSentenceRef.current = onSentence
+    onCompleteRef.current = onComplete
+  }, [onSentence, onComplete])
+
+  // Mirror latest messages/status into refs so the enabled-transition effect can read them without depending on them
+  const messagesRef = useRef(messages)
+  const statusRef = useRef(status)
+  useEffect(() => {
+    messagesRef.current = messages
+  }, [messages])
+  useEffect(() => {
+    statusRef.current = status
+  }, [status])
+
+  // When enabled transitions false → true, fast-forward the tracked state to
+  // the current assistant message so stale content is not re-spoken.
+  const prevEnabledRef = useRef(enabled)
+  useEffect(() => {
+    if (prevEnabledRef.current === enabled) return // act only on an actual change of `enabled`
+    prevEnabledRef.current = enabled
+
+    if (enabled) {
+      const msgs = messagesRef.current
+      const lastMsg = msgs[msgs.length - 1]
+      if (lastMsg?.role === 'assistant') {
+        prevMessageIdRef.current = lastMsg.id
+        processedLengthRef.current = extractTextContent(lastMsg).length // skip all text that already exists
+        processorRef.current = new SentenceProcessor()
+        flushedRef.current = statusRef.current !== 'streaming' // treat the message as finished unless it is still streaming
+      }
+    }
+  }, [enabled])
+
+  // Process new text chunks as the streaming message grows
+  useEffect(() => {
+    if (!enabled) return
+
+    const lastMsg = messages[messages.length - 1]
+    if (!lastMsg || lastMsg.role !== 'assistant') return
+
+    // Start from scratch when the last message is a different assistant message than the tracked one
+    if (lastMsg.id !== prevMessageIdRef.current) {
+      prevMessageIdRef.current = lastMsg.id
+      processedLengthRef.current = 0
+      processorRef.current = new SentenceProcessor()
+      flushedRef.current = false
+    }
+
+    // Don't re-process after flush
+    if (flushedRef.current) return
+
+    const text = extractTextContent(lastMsg)
+    const newText = text.slice(processedLengthRef.current) // only the part appended since the previous run
+    if (!newText) return
+
+    processedLengthRef.current = text.length
+
+    const sentences = processorRef.current.addTextChunk(newText)
+    for (const sentence of sentences) {
+      onSentenceRef.current(sentence)
+    }
+  }, [messages, enabled])
+
+  // Flush remaining text when streaming completes
+  useEffect(() => {
+    if (!enabled) return
+    if (status !== 'ready') return // NOTE(review): if status is already 'ready' on the first enabled render, onComplete fires with no sentences — confirm intended
+    if (flushedRef.current) return // flush at most once per tracked message
+
+    flushedRef.current = true
+
+    const sentences = processorRef.current.flush()
+    for (const sentence of sentences) {
+      onSentenceRef.current(sentence)
+    }
+
+    onCompleteRef.current?.()
+  }, [status, enabled])
+
+  const reset = useCallback(() => { // restores every tracking ref to its initial value; stable identity (empty deps)
+    processedLengthRef.current = 0
+    processorRef.current = new SentenceProcessor()
+    prevMessageIdRef.current = null
+    flushedRef.current = false
+  }, [])
+
+  return { reset }
+}
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar-lite/components/chat-panel.tsx b/usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar-lite/components/chat-panel.tsx
@@ -20,6 +20,7 @@ interface ChatPanelProps {
   sttOnline?: boolean
   disabled?: boolean
   isVlm?: boolean
+  isSpeaking?: boolean
   imagePreview?: string | null
   onImageSelect?: (e: React.ChangeEvent<HTMLInputElement>) => void
   onImageRemove?: () => void
@@ -36,14 +37,19 @@ export function ChatPanel({
   sttOnline,
   disabled,
   isVlm,
+  isSpeaking,
   imagePreview,
   onImageSelect,
   onImageRemove,
 }: ChatPanelProps) {
+  // Show the stop button while the avatar is speaking, even after LLM finishes
+  const effectiveStatus =
+    isSpeaking && status === 'ready' ? 'streaming' : status
+
   return (
     <VlmChatPanel
       messages={messages}
-      status={status}
+      status={effectiveStatus}
       input={input}
       onInputChange={onInputChange}
       onSend={onSend}
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar-lite/demo.tsx b/usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar-lite/demo.tsx
diff --git a/usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar/demo.tsx b/usecases/ai/edge-ai-demo-studio/frontend/src/samples/digital-avatar/demo.tsx