Skip to content

Commit b1f6ead

Browse files
committed
feat(channels): TTS auto-apply for Discord/Telegram/WhatsApp + UI enhancements
Backend:
- Add TTS auto-apply support to Discord, Telegram, and WhatsApp channels
- Improve text processing for TTS (strip markdown)
- Wire TTS config handler to gateway server

UI:
- Add voice recorder hook for chat input
- Add agent hooks tab with summary card
- Update i18n for agents (en/vi/zh)
- Minor CSP and credential section fixes
1 parent 284c9cb commit b1f6ead

27 files changed

Lines changed: 932 additions & 47 deletions

File tree

cmd/gateway.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ func runGateway() {
378378

379379
// Register all RPC methods
380380
server.SetLogTee(logTee)
381-
pairingMethods, heartbeatMethods, chatMethods := registerAllMethods(server, agentRouter, pgStores.Sessions, pgStores.Cron, pgStores.Pairing, cfg, cfgPath, workspace, dataDir, msgBus, execApprovalMgr, pgStores.Agents, pgStores.Skills, pgStores.ConfigSecrets, pgStores.Teams, contextFileInterceptor, logTee, pgStores.Heartbeats, pgStores.ConfigPermissions, pgStores.SystemConfigs, pgStores.Tenants, pgStores.SkillTenantCfgs)
381+
pairingMethods, heartbeatMethods, chatMethods := registerAllMethods(server, agentRouter, pgStores.Sessions, pgStores.Cron, pgStores.Pairing, cfg, cfgPath, workspace, dataDir, msgBus, execApprovalMgr, pgStores.Agents, pgStores.Skills, pgStores.ConfigSecrets, pgStores.Teams, contextFileInterceptor, logTee, pgStores.Heartbeats, pgStores.ConfigPermissions, pgStores.SystemConfigs, pgStores.Tenants, pgStores.SkillTenantCfgs, audioMgr)
382382

383383
// Phase 3: Agent hooks RPC methods (hooks.list/create/update/delete/toggle/test/history).
384384
if hs, ok := pgStores.Hooks.(hooks.HookStore); ok && hs != nil {

cmd/gateway_lifecycle.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,9 @@ func (d *gatewayDeps) runLifecycle(
9494
return
9595
}
9696
if d.pgStores.ConfigSecrets != nil {
97-
if secrets, err := d.pgStores.ConfigSecrets.GetAll(context.Background()); err == nil && len(secrets) > 0 {
97+
// Use master tenant context to load global TTS secrets
98+
masterCtx := store.WithTenantID(context.Background(), store.MasterTenantID)
99+
if secrets, err := d.pgStores.ConfigSecrets.GetAll(masterCtx); err == nil && len(secrets) > 0 {
98100
updatedCfg.ApplyDBSecrets(secrets)
99101
}
100102
}

cmd/gateway_methods.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"log/slog"
66

77
"github.com/nextlevelbuilder/goclaw/internal/agent"
8+
"github.com/nextlevelbuilder/goclaw/internal/audio"
89
"github.com/nextlevelbuilder/goclaw/internal/bus"
910
"github.com/nextlevelbuilder/goclaw/internal/config"
1011
"github.com/nextlevelbuilder/goclaw/internal/gateway"
@@ -13,11 +14,12 @@ import (
1314
"github.com/nextlevelbuilder/goclaw/internal/tools"
1415
)
1516

16-
func registerAllMethods(server *gateway.Server, agents *agent.Router, sessStore store.SessionStore, cronStore store.CronStore, pairingStore store.PairingStore, cfg *config.Config, cfgPath, workspace, dataDir string, msgBus *bus.MessageBus, execApprovalMgr *tools.ExecApprovalManager, agentStore store.AgentStore, skillStore store.SkillStore, configSecretsStore store.ConfigSecretsStore, teamStore store.TeamStore, contextFileInterceptor *tools.ContextFileInterceptor, logTee *gateway.LogTee, heartbeatStore store.HeartbeatStore, configPermStore store.ConfigPermissionStore, sysConfigStore store.SystemConfigStore, tenantStore store.TenantStore, skillTenantCfgStore store.SkillTenantConfigStore) (*methods.PairingMethods, *methods.HeartbeatMethods, *methods.ChatMethods) {
17+
func registerAllMethods(server *gateway.Server, agents *agent.Router, sessStore store.SessionStore, cronStore store.CronStore, pairingStore store.PairingStore, cfg *config.Config, cfgPath, workspace, dataDir string, msgBus *bus.MessageBus, execApprovalMgr *tools.ExecApprovalManager, agentStore store.AgentStore, skillStore store.SkillStore, configSecretsStore store.ConfigSecretsStore, teamStore store.TeamStore, contextFileInterceptor *tools.ContextFileInterceptor, logTee *gateway.LogTee, heartbeatStore store.HeartbeatStore, configPermStore store.ConfigPermissionStore, sysConfigStore store.SystemConfigStore, tenantStore store.TenantStore, skillTenantCfgStore store.SkillTenantConfigStore, audioMgr *audio.Manager) (*methods.PairingMethods, *methods.HeartbeatMethods, *methods.ChatMethods) {
1718
router := server.Router()
1819

1920
// Phase 1: Core methods
2021
chatMethods := methods.NewChatMethods(agents, sessStore, cfg, server.RateLimiter(), msgBus)
22+
chatMethods.SetAudioManager(audioMgr) // Wire TTS auto-apply for WS responses
2123
chatMethods.Register(router)
2224
methods.NewAgentsMethods(agents, cfg, cfgPath, workspace, agentStore, contextFileInterceptor, msgBus).Register(router)
2325
methods.NewSessionsMethods(sessStore, msgBus, cfg).Register(router)

internal/audio/text_processing.go

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,47 @@ package audio
22

33
import "regexp"
44

5+
// Pre-compiled regexes for performance (called per stream chunk).
6+
var (
7+
mdFencedCodeRe = regexp.MustCompile("(?s)```[^`]*```")
8+
mdInlineCodeRe = regexp.MustCompile("`([^`]+)`")
9+
mdBoldStarRe = regexp.MustCompile(`\*\*([^*]+)\*\*`)
10+
mdItalicStarRe = regexp.MustCompile(`\*([^*]+)\*`)
11+
mdBoldUnderRe = regexp.MustCompile(`__([^_]+)__`)
12+
mdItalicUnderRe = regexp.MustCompile(`_([^_]+)_`)
13+
mdLinkRe = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`)
14+
mdHeadingRe = regexp.MustCompile(`(?m)^#+\s+`)
15+
16+
ttsTextBlockRe = regexp.MustCompile(`(?s)\[\[tts:text\]\](.*?)\[\[/tts:text\]\]`)
17+
ttsVoiceBlockRe = regexp.MustCompile(`(?s)\[\[tts\]\].*?\[\[/tts\]\]`)
18+
ttsBareTagRe = regexp.MustCompile(`\[\[/?tts(?::[^\]]*)?\]\]`)
19+
)
20+
521
// stripMarkdown removes common markdown formatting so TTS reads prose, not
622
// syntax characters. Preserves inner text of bold/italic/inline code/links.
723
func stripMarkdown(text string) string {
8-
text = regexp.MustCompile("(?s)```[^`]*```").ReplaceAllString(text, "")
9-
text = regexp.MustCompile("`([^`]+)`").ReplaceAllString(text, "$1")
10-
text = regexp.MustCompile(`\*\*([^*]+)\*\*`).ReplaceAllString(text, "$1")
11-
text = regexp.MustCompile(`\*([^*]+)\*`).ReplaceAllString(text, "$1")
12-
text = regexp.MustCompile(`__([^_]+)__`).ReplaceAllString(text, "$1")
13-
text = regexp.MustCompile(`_([^_]+)_`).ReplaceAllString(text, "$1")
14-
text = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`).ReplaceAllString(text, "$1")
15-
text = regexp.MustCompile(`(?m)^#+\s+`).ReplaceAllString(text, "")
24+
text = mdFencedCodeRe.ReplaceAllString(text, "")
25+
text = mdInlineCodeRe.ReplaceAllString(text, "$1")
26+
text = mdBoldStarRe.ReplaceAllString(text, "$1")
27+
text = mdItalicStarRe.ReplaceAllString(text, "$1")
28+
text = mdBoldUnderRe.ReplaceAllString(text, "$1")
29+
text = mdItalicUnderRe.ReplaceAllString(text, "$1")
30+
text = mdLinkRe.ReplaceAllString(text, "$1")
31+
text = mdHeadingRe.ReplaceAllString(text, "")
1632
return text
1733
}
1834

19-
// stripTtsDirectives removes [[tts...]] markup from text.
20-
// `[[tts:text]]...[[/tts:text]]` blocks keep their inner content.
21-
// Bare `[[tts]]` and `[[tts:something]]` tags are removed entirely.
22-
func stripTtsDirectives(text string) string {
23-
text = regexp.MustCompile(`(?s)\[\[tts:text\]\](.*?)\[\[/tts:text\]\]`).ReplaceAllString(text, "$1")
24-
text = regexp.MustCompile(`\[\[tts(?::[^\]]*)?\]\]`).ReplaceAllString(text, "")
35+
// StripTTSDirectives removes [[tts...]] markup from text.
36+
// `[[tts:text]]...[[/tts:text]]` blocks keep their inner content (voice + text display).
37+
// `[[tts]]...[[/tts]]` blocks are removed entirely including content (voice only, no text).
38+
// Any remaining bare tags, opening or closing (e.g. `[[tts]]`, `[[tts:something]]`, `[[/tts]]`), are removed.
39+
// Exported for use by channels TTS auto-apply.
40+
func StripTTSDirectives(text string) string {
41+
// 1. [[tts:text]]...[[/tts:text]] → keep inner content (transcript mode)
42+
text = ttsTextBlockRe.ReplaceAllString(text, "$1")
43+
// 2. [[tts]]...[[/tts]] → remove entirely including content (voice-only mode)
44+
text = ttsVoiceBlockRe.ReplaceAllString(text, "")
45+
// 3. Remove any remaining bare/unclosed tags
46+
text = ttsBareTagRe.ReplaceAllString(text, "")
2547
return text
2648
}

internal/channels/discord/discord.go

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"log/slog"
7+
"strings"
78
"sync"
89
"time"
910

@@ -130,7 +131,7 @@ func (c *Channel) Stop(_ context.Context) error {
130131
}
131132

132133
// Send delivers an outbound message to a Discord channel.
133-
func (c *Channel) Send(_ context.Context, msg bus.OutboundMessage) (err error) {
134+
func (c *Channel) Send(ctx context.Context, msg bus.OutboundMessage) (err error) {
134135
if !c.IsRunning() {
135136
return fmt.Errorf("discord bot not running")
136137
}
@@ -166,6 +167,42 @@ func (c *Channel) Send(_ context.Context, msg bus.OutboundMessage) (err error) {
166167

167168
content := msg.Content
168169

170+
// TTS auto-apply: convert [[tts]] tagged responses to voice
171+
if c.audioMgr != nil && content != "" {
172+
isVoiceInbound := msg.Metadata["is_voice_inbound"] == "true"
173+
ttsResult, ttsErr := c.audioMgr.AutoApplyToText(ctx, content, "discord", isVoiceInbound, "")
174+
if ttsErr != nil {
175+
slog.Debug("discord: tts auto-apply error", "error", ttsErr)
176+
}
177+
if ttsResult != nil && ttsResult.AudioPath != "" {
178+
// Send voice file via media API
179+
if err := c.sendMediaMessage(channelID, "", []bus.MediaAttachment{{
180+
URL: ttsResult.AudioPath,
181+
ContentType: ttsResult.AudioMime,
182+
}}); err != nil {
183+
slog.Warn("discord: tts auto-apply voice send failed, falling back to text", "error", err)
184+
} else {
185+
// Voice sent successfully
186+
strippedText := strings.TrimSpace(ttsResult.Text)
187+
if strippedText == "" {
188+
// Voice-only: delete placeholder (no text to show)
189+
if pID, ok := c.placeholders.LoadAndDelete(placeholderKey); ok {
190+
if msgID, ok := pID.(string); ok {
191+
_ = c.session.ChannelMessageDelete(channelID, msgID)
192+
}
193+
}
194+
return nil
195+
}
196+
// Has remaining text: let normal flow handle placeholder edit
197+
content = strippedText
198+
}
199+
}
200+
// Update content with directives stripped (even if TTS not applied)
201+
if ttsResult != nil {
202+
content = ttsResult.Text
203+
}
204+
}
205+
169206
// Handle outbound media attachments: send files via Discord's file upload API.
170207
if len(msg.Media) > 0 {
171208
// Delete placeholder if present

internal/channels/manager.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type RunContext struct {
3232
ChatID string
3333
MessageID string // platform message ID (string to support Feishu "om_xxx", Telegram "12345", etc.)
3434
Metadata map[string]string // outbound routing metadata (thread_id, local_key, group_id)
35+
TenantID uuid.UUID // tenant scope for per-tenant TTS
3536
Streaming bool // whether run uses streaming (to avoid double-delivery of block replies)
3637
BlockReplyEnabled bool // whether block.reply delivery is enabled for this run (resolved at RegisterRun time)
3738
ToolStatusEnabled bool // whether tool name shows in streaming preview during tool execution

internal/channels/telegram/send.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,39 @@ func (c *Channel) Send(ctx context.Context, msg bus.OutboundMessage) error {
252252
return err
253253
}
254254

255+
// TTS auto-apply: convert [[tts]] tagged responses to voice
256+
if c.audioMgr != nil && msg.Content != "" {
257+
isVoiceInbound := msg.Metadata["is_voice_inbound"] == "true"
258+
ttsResult, ttsErr := c.audioMgr.AutoApplyToText(ctx, msg.Content, "telegram", isVoiceInbound, "")
259+
if ttsErr != nil {
260+
slog.Debug("telegram: tts auto-apply error", "error", ttsErr)
261+
}
262+
if ttsResult != nil && ttsResult.AudioPath != "" {
263+
// Send voice message instead of text
264+
if err := c.sendVoice(ctx, tu.ID(chatID), ttsResult.AudioPath, "", replyToMsgID, threadID); err != nil {
265+
slog.Warn("telegram: tts auto-apply voice send failed, falling back to text", "error", err)
266+
} else {
267+
// Voice sent successfully
268+
strippedText := strings.TrimSpace(ttsResult.Text)
269+
if strippedText == "" {
270+
// Voice-only: delete placeholder (no text to show)
271+
if pID, ok := c.placeholders.LoadAndDelete(localKey); ok {
272+
if msgID, ok := pID.(int); ok && msgID > 0 {
273+
_ = c.deleteMessage(ctx, chatID, msgID)
274+
}
275+
}
276+
return nil
277+
}
278+
// Has remaining text: let normal flow handle placeholder edit
279+
msg.Content = strippedText
280+
}
281+
}
282+
// Update content with directives stripped (even if TTS not applied)
283+
if ttsResult != nil {
284+
msg.Content = ttsResult.Text
285+
}
286+
}
287+
255288
// Text-only message
256289
htmlContent := markdownToTelegramHTML(msg.Content)
257290
chunks := chunkHTML(htmlContent, telegramMaxMessageLen)

internal/channels/telegram/stream.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/mymmrac/telego"
1414
tu "github.com/mymmrac/telego/telegoutil"
1515

16+
"github.com/nextlevelbuilder/goclaw/internal/audio"
1617
"github.com/nextlevelbuilder/goclaw/internal/channels"
1718
)
1819

@@ -158,6 +159,8 @@ func (ds *DraftStream) flush(ctx context.Context) error {
158159
}
159160

160161
text := ds.pending
162+
// Strip TTS directives before displaying (they'll be processed by Send() later)
163+
text = audio.StripTTSDirectives(text)
161164
htmlText := markdownToTelegramHTML(text)
162165

163166
// --- Draft transport (sendMessageDraft) ---

internal/channels/whatsapp/outbound.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ import (
1616
)
1717

1818
// Send delivers an outbound message to WhatsApp via whatsmeow.
19-
func (c *Channel) Send(_ context.Context, msg bus.OutboundMessage) error {
19+
func (c *Channel) Send(ctx context.Context, msg bus.OutboundMessage) error {
2020
if c.client == nil || !c.client.IsConnected() {
2121
return fmt.Errorf("whatsapp not connected")
2222
}
@@ -26,6 +26,44 @@ func (c *Channel) Send(_ context.Context, msg bus.OutboundMessage) error {
2626
return fmt.Errorf("invalid whatsapp JID %q: %w", msg.ChatID, err)
2727
}
2828

29+
// TTS auto-apply: convert [[tts]] tagged responses to voice
30+
if c.audioMgr != nil && msg.Content != "" {
31+
isVoiceInbound := msg.Metadata["is_voice_inbound"] == "true"
32+
ttsResult, ttsErr := c.audioMgr.AutoApplyToText(ctx, msg.Content, "whatsapp", isVoiceInbound, "")
33+
if ttsErr != nil {
34+
slog.Debug("whatsapp: tts auto-apply error", "error", ttsErr)
35+
}
36+
if ttsResult != nil && ttsResult.AudioPath != "" {
37+
// Send audio as voice message
38+
audioData, readErr := os.ReadFile(ttsResult.AudioPath)
39+
if readErr == nil {
40+
waMsg, buildErr := c.buildMediaMessage(audioData, ttsResult.AudioMime, "")
41+
if buildErr == nil {
42+
// Mark as voice message (PTT) for WhatsApp
43+
if waMsg.AudioMessage != nil {
44+
waMsg.AudioMessage.PTT = new(true)
45+
}
46+
if _, sendErr := c.client.SendMessage(c.ctx, chatJID, waMsg); sendErr != nil {
47+
slog.Warn("whatsapp: tts auto-apply voice send failed, falling back to text", "error", sendErr)
48+
} else {
49+
// Voice sent successfully, stop typing and return
50+
if cancel, ok := c.typingCancel.LoadAndDelete(msg.ChatID); ok {
51+
if fn, ok := cancel.(context.CancelFunc); ok {
52+
fn()
53+
}
54+
}
55+
go c.sendPresence(chatJID, types.ChatPresencePaused)
56+
return nil
57+
}
58+
}
59+
}
60+
}
61+
// Update content with directives stripped (even if TTS not applied)
62+
if ttsResult != nil {
63+
msg.Content = ttsResult.Text
64+
}
65+
}
66+
2967
// Send media attachments first.
3068
if len(msg.Media) > 0 {
3169
for i, m := range msg.Media {

internal/gateway/methods/chat.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"log/slog"
1010

1111
"github.com/nextlevelbuilder/goclaw/internal/agent"
12+
"github.com/nextlevelbuilder/goclaw/internal/audio"
1213
"github.com/nextlevelbuilder/goclaw/internal/bus"
1314
"github.com/nextlevelbuilder/goclaw/internal/config"
1415
httpapi "github.com/nextlevelbuilder/goclaw/internal/http"
@@ -30,12 +31,18 @@ type ChatMethods struct {
3031
rateLimiter *gateway.RateLimiter
3132
eventBus bus.EventPublisher
3233
postTurn tools.PostTurnProcessor
34+
audioMgr *audio.Manager // for TTS auto-apply on WS responses (nil = disabled)
3335
}
3436

3537
func NewChatMethods(agents *agent.Router, sess store.SessionStore, cfg *config.Config, rl *gateway.RateLimiter, eventBus bus.EventPublisher) *ChatMethods {
3638
return &ChatMethods{agents: agents, sessions: sess, cfg: cfg, rateLimiter: rl, eventBus: eventBus}
3739
}
3840

41+
// SetAudioManager sets the audio manager for TTS auto-apply on WS responses.
42+
func (m *ChatMethods) SetAudioManager(mgr *audio.Manager) {
43+
m.audioMgr = mgr
44+
}
45+
3946
// SetPostTurnProcessor sets the post-turn processor for team task dispatch.
4047
func (m *ChatMethods) SetPostTurnProcessor(pt tools.PostTurnProcessor) {
4148
m.postTurn = pt
@@ -321,16 +328,40 @@ func (m *ChatMethods) handleSend(ctx context.Context, client *gateway.Client, re
321328
}()
322329
}
323330

331+
// TTS auto-apply: convert [[tts]] tagged responses to voice audio
332+
content := result.Content
333+
var ttsAudio *agent.MediaResult
334+
if m.audioMgr != nil && content != "" {
335+
// For WS, we don't have voice inbound info - use "tagged" mode only
336+
ttsResult, _ := m.audioMgr.AutoApplyToText(runCtx, content, "ws", false, "")
337+
if ttsResult != nil && ttsResult.AudioPath != "" {
338+
// Include audio in media results
339+
ttsAudio = &agent.MediaResult{
340+
Path: httpapi.SignMediaPath(ttsResult.AudioPath, httpapi.FileSigningKey()),
341+
ContentType: ttsResult.AudioMime,
342+
AsVoice: true,
343+
}
344+
content = ttsResult.Text // Use stripped text
345+
} else if ttsResult != nil {
346+
content = ttsResult.Text // Strip directives even if TTS not applied
347+
}
348+
}
349+
324350
resp := map[string]any{
325351
"runId": result.RunID,
326-
"content": result.Content,
352+
"content": content,
327353
"usage": result.Usage,
328354
}
329355
if result.Thinking != "" {
330356
resp["thinking"] = result.Thinking
331357
}
332-
if len(result.Media) > 0 {
333-
resp["media"] = result.Media
358+
// Combine existing media with TTS audio
359+
mediaResults := result.Media
360+
if ttsAudio != nil {
361+
mediaResults = append([]agent.MediaResult{*ttsAudio}, mediaResults...)
362+
}
363+
if len(mediaResults) > 0 {
364+
resp["media"] = mediaResults
334365
}
335366
client.SendResponse(protocol.NewOKResponse(req.ID, resp))
336367
}()

0 commit comments

Comments
 (0)