diff --git a/.nvmrc b/.nvmrc new file mode 100644 index 000000000..2bd5a0a98 --- /dev/null +++ b/.nvmrc @@ -0,0 +1 @@ +22 diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e177b835..5c1f9698d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai ### Added +- Web UI: add speech-to-text dictation to chat compose via Deepgram Flux — mic button, keyboard shortcut (Cmd/Ctrl+Shift+D), recording indicators, and end-of-thought detection. - Gateway: add `agents.create`, `agents.update`, `agents.delete` RPC methods for web UI agent management. (#11045) Thanks @advaitpaliwal. - Gateway: add node command allowlists (default-deny unknown node commands; configurable via `gateway.nodes.allowCommands` / `gateway.nodes.denyCommands`). (#11755) Thanks @mbelinky. - Plugins: add `device-pair` (Telegram `/pair` flow) and `phone-control` (iOS/Android node controls). (#11755) Thanks @mbelinky. diff --git a/docker-compose.yml b/docker-compose.yml index 614a1f8d5..9d066547a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,7 @@ services: openclaw-gateway: image: ${OPENCLAW_IMAGE:-openclaw:local} + user: root environment: HOME: /home/node TERM: xterm-256color @@ -8,12 +9,22 @@ services: CLAUDE_AI_SESSION_KEY: ${CLAUDE_AI_SESSION_KEY} CLAUDE_WEB_SESSION_KEY: ${CLAUDE_WEB_SESSION_KEY} CLAUDE_WEB_COOKIE: ${CLAUDE_WEB_COOKIE} + DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY} + OPENAI_API_KEY: ${OPENAI_API_KEY:-} + TWILIO_ACCOUNT_SID: ${TWILIO_ACCOUNT_SID} + TWILIO_AUTH_TOKEN: ${TWILIO_AUTH_TOKEN} + PUBLIC_URL: ${PUBLIC_URL:-} volumes: - ${OPENCLAW_CONFIG_DIR}:/home/node/.openclaw - ${OPENCLAW_WORKSPACE_DIR}:/home/node/.openclaw/workspace + - /Users/billgetman/openclaw/sandboxes:/Users/billgetman/openclaw/sandboxes + - /Users/billgetman/.docker/run/docker.sock:/var/run/docker.sock + - ${OPENCLAW_CONFIG_DIR}/docker-cli:/usr/local/bin/docker:ro + - ./dist:/app/dist:ro ports: - "${OPENCLAW_GATEWAY_PORT:-18789}:18789" - 
"${OPENCLAW_BRIDGE_PORT:-18790}:18790" + - "${OPENCLAW_VOICE_PORT:-8000}:3334" init: true restart: unless-stopped command: diff --git a/docs/plans/2026-02-07-dictation-design.md b/docs/plans/2026-02-07-dictation-design.md new file mode 100644 index 000000000..4ac383cda --- /dev/null +++ b/docs/plans/2026-02-07-dictation-design.md @@ -0,0 +1,214 @@ +# Voice Dictation in Web Chat + +**Date:** 2026-02-07 +**Status:** Design complete, ready for implementation + +## Overview + +Add voice dictation to the web chat compose box using Deepgram's Flux model for real-time speech-to-text with intelligent end-of-thought detection. + +## User Flow + +1. User clicks mic button (or presses `Cmd/Ctrl+Shift+D`) +2. Browser requests mic permission if not already granted +3. Mic button turns red/pulsing to indicate active recording +4. Audio streams to gateway, which proxies to Deepgram Flux +5. Live transcript appears in textarea as user speaks (interim results updating in real-time) +6. Flux detects end-of-thought → recording auto-stops (or user manually stops) +7. 
Final transcript remains in textarea; user can edit and press Enter to send + +## Technical Architecture + +### Browser Side + +**New module:** `ui/src/ui/dictation.ts` + +- Mic access via `navigator.mediaDevices.getUserMedia({ audio: true })` +- Audio capture using `AudioWorklet` (required; no ScriptProcessorNode fallback) +- Output format: linear16 PCM, 16kHz sample rate +- WebSocket connection to gateway dictation endpoint +- Receives transcript events, updates textarea draft state + +### Gateway Side + +**New WebSocket endpoint:** `/dictation/stream` + +- Authenticates using existing gateway auth mechanism +- Opens upstream WebSocket to Deepgram: + ``` + wss://api.deepgram.com/v2/listen?model=flux-general-en&encoding=linear16&sample_rate=16000&interim_results=true&punctuate=true&smart_format=true + ``` +- Proxies audio chunks: browser → Deepgram +- Proxies transcript events: Deepgram → browser +- Uses existing `DEEPGRAM_API_KEY` from provider config + +### Message Flow + +``` +Browser mic + ↓ +AudioWorklet (PCM chunks, ~80ms) + ↓ +Gateway WebSocket (/dictation/stream) + ↓ +Deepgram WebSocket (/v2/listen, Flux) + ↓ +Transcript events (Results, UtteranceEnd) + ↓ +Gateway → Browser + ↓ +Textarea updates +``` + +### Deepgram Flux Configuration + +| Parameter | Value | Purpose | +| ----------------- | ----------------- | ----------------------------------------------- | +| `model` | `flux-general-en` | Conversational model with end-of-turn detection | +| `encoding` | `linear16` | PCM audio format | +| `sample_rate` | `16000` | 16kHz sample rate | +| `interim_results` | `true` | Stream partial transcripts | +| `punctuate` | `true` | Auto-punctuation | +| `smart_format` | `true` | Formatting for numbers, dates, etc. | + +Flux provides ~260ms end-of-turn detection latency. 
+ +## UI Components + +### Mic Button + +**Location:** `chat-compose__actions` div, before "New session" button + +**States:** + +- `idle` - Gray mic icon, clickable +- `recording` - Red pulsing mic icon, clickable to stop +- `disabled` - Grayed out (no Deepgram API key configured) + +**Tooltip:** + +- When enabled: "Dictate (⌘⇧D)" / "Dictate (Ctrl+Shift+D)" +- When disabled: "Configure Deepgram API key to enable dictation" + +### Recording Indicator + +- Mic icon pulses with CSS animation (`@keyframes pulse`) +- Visual state clearly indicates active recording + +### Textarea Behavior + +- Interim text may appear in lighter color or italic (distinguishes unconfirmed words) +- Final text renders in normal style as Deepgram confirms +- Existing draft text preserved; dictation appends at cursor position +- User can type while recording (both inputs work simultaneously) + +### Permission Modal + +Triggered when `getUserMedia()` fails with `NotAllowedError`. + +**Content:** + +- Header: "Microphone Access Required" +- Browser-specific instructions for Chrome, Safari, Firefox, Edge +- Buttons: "Try Again", "Cancel" + +## Keyboard Shortcut + +- **Shortcut:** `Cmd+Shift+D` (macOS) / `Ctrl+Shift+D` (Windows/Linux) +- **Behavior:** Toggles dictation on/off +- **Discoverability:** Shown in mic button tooltip + +## Feature Detection + +### Gateway Hello Response + +Add to gateway hello payload: + +```typescript +features: { + dictation: boolean; // true if DEEPGRAM_API_KEY is configured +} +``` + +### Browser Requirements + +- `navigator.mediaDevices.getUserMedia` support +- `AudioWorklet` support (Chrome 66+, Firefox 76+, Safari 14.1+) + +If AudioWorklet unavailable, mic button is hidden (no fallback for v1). + +## Error Handling + +### Connection Failures + +| Scenario | Behavior | +| ----------------------- | ----------------------------------------------------------- | +| Gateway WebSocket fails | Inline error: "Dictation unavailable. Check connection." 
| +| Deepgram upstream fails | Error event to browser: "Transcription service unavailable" | +| Transient failure | Auto-retry once, then show error | + +### During Recording + +| Scenario | Behavior | +| ------------------------ | -------------------------------------------------------- | +| WebSocket disconnects | Stop recording, keep transcript, show brief error | +| User navigates away | Stop recording gracefully (send CloseStream) | +| No audio for 10+ seconds | Subtle hint: "No audio detected. Check your microphone." | + +### Concurrent Usage + +- Only one dictation session at a time +- Click mic while recording = stop +- Typing while recording = both work (no conflict) + +## Configuration + +### Required + +- `DEEPGRAM_API_KEY` environment variable (existing) + +### No New Config + +- Dictation enabled automatically if Deepgram key is present +- No separate toggle to enable/disable dictation feature +- Uses system default microphone (no mic picker) + +## Scope + +**In scope (v1):** + +- Web UI chat only +- Single language (English via `flux-general-en`) +- System default microphone + +**Out of scope (future):** + +- Native apps (iOS, macOS, Android) +- TUI +- Language selection +- Microphone picker +- Waveform visualization + +## Files to Create/Modify + +### New Files + +- `ui/src/ui/dictation.ts` - Dictation state machine, mic handling, WebSocket client +- `ui/src/ui/audio-worklet.ts` - AudioWorklet processor for PCM capture +- `ui/src/ui/components/mic-permission-modal.ts` - Permission help modal +- `ui/src/styles/dictation.css` - Mic button states, pulse animation +- `src/gateway/server-dictation.ts` - Gateway WebSocket proxy to Deepgram + +### Modified Files + +- `ui/src/ui/views/chat.ts` - Add mic button to compose area +- `ui/src/ui/app-chat.ts` - Integrate dictation state +- `src/gateway/server.ts` - Register dictation WebSocket endpoint +- `src/gateway/protocol/schema/hello.ts` - Add `features.dictation` field + +## Testing + +- Unit tests for 
dictation state machine +- Integration test for gateway proxy (mock Deepgram) +- Manual browser testing for mic permission flows +- E2E test with real Deepgram (live test, requires key) diff --git a/docs/plans/2026-02-07-dictation-impl.md b/docs/plans/2026-02-07-dictation-impl.md new file mode 100644 index 000000000..3ff2de8d2 --- /dev/null +++ b/docs/plans/2026-02-07-dictation-impl.md @@ -0,0 +1,1262 @@ +# Voice Dictation Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add voice dictation to the web chat compose box using Deepgram Flux for real-time speech-to-text with end-of-thought detection. + +**Architecture:** Browser captures mic audio via AudioWorklet, streams PCM to gateway WebSocket, gateway proxies to Deepgram `/v2/listen` with Flux model, transcripts stream back to update textarea in real-time. + +**Tech Stack:** TypeScript, Lit (UI), WebSocket, AudioWorklet, Deepgram Flux API + +--- + +## Task 1: Add dictation feature flag to gateway hello + +**Files:** + +- Modify: `src/gateway/protocol/schema/frames.ts` +- Modify: `src/gateway/server/ws-connection/message-handler.ts` + +**Step 1: Add dictation to HelloOk features schema** + +In `src/gateway/protocol/schema/frames.ts`, update the `features` object in `HelloOkSchema`: + +```typescript +features: Type.Object( + { + methods: Type.Array(NonEmptyString), + events: Type.Array(NonEmptyString), + dictation: Type.Optional(Type.Boolean()), + }, + { additionalProperties: false }, +), +``` + +**Step 2: Populate dictation flag in hello response** + +Find where the hello-ok response is built (in `src/gateway/server/ws-connection/message-handler.ts` or related file) and add logic to check for `DEEPGRAM_API_KEY`: + +```typescript +// Add import at top +import { resolveProviderApiKey } from "../../../agents/model-auth.js"; + +// In the hello-ok response building: +const deepgramKey = resolveProviderApiKey("deepgram", 
config); +const dictationEnabled = Boolean(deepgramKey?.trim()); + +// Add to features: +features: { + methods: gatewayMethods, + events, + dictation: dictationEnabled, +}, +``` + +**Step 3: Commit** + +```bash +git add src/gateway/protocol/schema/frames.ts src/gateway/server/ws-connection/message-handler.ts +git commit -m "feat(gateway): add dictation feature flag to hello response" +``` + +--- + +## Task 2: Create gateway dictation WebSocket proxy + +**Files:** + +- Create: `src/gateway/server-dictation.ts` +- Modify: `src/gateway/server-http.ts` + +**Step 1: Create dictation WebSocket handler** + +Create `src/gateway/server-dictation.ts`: + +```typescript +import type { IncomingMessage } from "node:http"; +import type { Duplex } from "node:stream"; +import type { WebSocketServer } from "ws"; +import { WebSocket } from "ws"; +import type { createSubsystemLogger } from "../logging/subsystem.js"; +import { resolveProviderApiKey } from "../agents/model-auth.js"; +import { loadConfig } from "../config/config.js"; + +type SubsystemLogger = ReturnType; + +const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v2/listen"; + +type DictationParams = { + model?: string; + language?: string; + sampleRate?: string; +}; + +function buildDeepgramUrl(params: DictationParams): string { + const url = new URL(DEEPGRAM_WS_URL); + url.searchParams.set("model", params.model || "flux-general-en"); + url.searchParams.set("encoding", "linear16"); + url.searchParams.set("sample_rate", params.sampleRate || "16000"); + url.searchParams.set("interim_results", "true"); + url.searchParams.set("punctuate", "true"); + url.searchParams.set("smart_format", "true"); + if (params.language) { + url.searchParams.set("language", params.language); + } + return url.toString(); +} + +export function createDictationUpgradeHandler(opts: { log: SubsystemLogger }) { + const { log } = opts; + + return (req: IncomingMessage, socket: Duplex, head: Buffer, wss: WebSocketServer) => { + const config = loadConfig(); + 
const apiKey = resolveProviderApiKey("deepgram", config); + + if (!apiKey) { + log.warn("dictation: DEEPGRAM_API_KEY not configured"); + socket.write("HTTP/1.1 503 Service Unavailable\r\n\r\n"); + socket.destroy(); + return; + } + + wss.handleUpgrade(req, socket, head, (clientWs) => { + log.info("dictation: client connected"); + + const url = new URL(req.url ?? "/", "http://localhost"); + const params: DictationParams = { + model: url.searchParams.get("model") ?? undefined, + language: url.searchParams.get("language") ?? undefined, + sampleRate: url.searchParams.get("sample_rate") ?? undefined, + }; + + const deepgramUrl = buildDeepgramUrl(params); + const deepgramWs = new WebSocket(deepgramUrl, { + headers: { + Authorization: `Token ${apiKey}`, + }, + }); + + let deepgramReady = false; + const pendingAudio: Buffer[] = []; + + deepgramWs.on("open", () => { + log.info("dictation: connected to Deepgram"); + deepgramReady = true; + // Flush any pending audio + for (const chunk of pendingAudio) { + deepgramWs.send(chunk); + } + pendingAudio.length = 0; + }); + + deepgramWs.on("message", (data) => { + // Forward Deepgram responses to client + if (clientWs.readyState === WebSocket.OPEN) { + clientWs.send(data.toString()); + } + }); + + deepgramWs.on("error", (err) => { + log.error(`dictation: Deepgram error: ${err.message}`); + if (clientWs.readyState === WebSocket.OPEN) { + clientWs.send(JSON.stringify({ type: "Error", message: "Transcription service error" })); + clientWs.close(1011, "Deepgram error"); + } + }); + + deepgramWs.on("close", (code, reason) => { + log.info(`dictation: Deepgram closed (${code}): ${reason.toString()}`); + if (clientWs.readyState === WebSocket.OPEN) { + clientWs.close(1000, "Deepgram closed"); + } + }); + + clientWs.on("message", (data) => { + // Handle control messages + if (typeof data === "string" || (data instanceof Buffer && data[0] === 0x7b)) { + try { + const msg = JSON.parse(data.toString()); + if (msg.type === "CloseStream" || 
msg.type === "Finalize") { + if (deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(JSON.stringify(msg)); + } + return; + } + if (msg.type === "KeepAlive") { + if (deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(JSON.stringify(msg)); + } + return; + } + } catch { + // Not JSON, treat as audio + } + } + + // Forward audio data to Deepgram + if (deepgramReady && deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(data as Buffer); + } else { + // Buffer audio until Deepgram is ready + pendingAudio.push(data as Buffer); + } + }); + + clientWs.on("close", () => { + log.info("dictation: client disconnected"); + if (deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(JSON.stringify({ type: "CloseStream" })); + deepgramWs.close(); + } + }); + + clientWs.on("error", (err) => { + log.error(`dictation: client error: ${err.message}`); + deepgramWs.close(); + }); + }); + }; +} +``` + +**Step 2: Register dictation endpoint in HTTP server** + +In `src/gateway/server-http.ts`, add the upgrade handler. Find where WebSocket upgrades are handled and add: + +```typescript +// Import at top +import { createDictationUpgradeHandler } from "./server-dictation.js"; + +// In the server setup, add upgrade handling for /dictation/stream +// This requires finding where httpServer.on("upgrade") is handled +``` + +Note: The exact integration point depends on how upgrades are currently handled. Look for `server.on("upgrade"` patterns. 
+ +**Step 3: Commit** + +```bash +git add src/gateway/server-dictation.ts src/gateway/server-http.ts +git commit -m "feat(gateway): add dictation WebSocket proxy endpoint" +``` + +--- + +## Task 3: Create browser AudioWorklet for PCM capture + +**Files:** + +- Create: `ui/src/ui/audio-worklet-processor.ts` + +**Step 1: Create the AudioWorklet processor** + +Create `ui/src/ui/audio-worklet-processor.ts`: + +```typescript +// This file runs in AudioWorklet context +// It must be a separate file loaded via addModule() + +const BUFFER_SIZE = 1280; // 80ms at 16kHz + +class PcmCaptureProcessor extends AudioWorkletProcessor { + private buffer: Float32Array; + private bufferIndex: number; + + constructor() { + super(); + this.buffer = new Float32Array(BUFFER_SIZE); + this.bufferIndex = 0; + } + + process( + inputs: Float32Array[][], + _outputs: Float32Array[][], + _parameters: Record, + ): boolean { + const input = inputs[0]?.[0]; + if (!input) { + return true; + } + + for (let i = 0; i < input.length; i++) { + this.buffer[this.bufferIndex++] = input[i]; + + if (this.bufferIndex >= BUFFER_SIZE) { + // Convert float32 to int16 PCM + const pcm = new Int16Array(BUFFER_SIZE); + for (let j = 0; j < BUFFER_SIZE; j++) { + const s = Math.max(-1, Math.min(1, this.buffer[j])); + pcm[j] = s < 0 ? 
s * 0x8000 : s * 0x7fff; + } + + this.port.postMessage(pcm.buffer, [pcm.buffer]); + this.buffer = new Float32Array(BUFFER_SIZE); + this.bufferIndex = 0; + } + } + + return true; + } +} + +registerProcessor("pcm-capture-processor", PcmCaptureProcessor); +``` + +**Step 2: Commit** + +```bash +git add ui/src/ui/audio-worklet-processor.ts +git commit -m "feat(ui): add AudioWorklet processor for PCM capture" +``` + +--- + +## Task 4: Create dictation state machine and WebSocket client + +**Files:** + +- Create: `ui/src/ui/dictation.ts` + +**Step 1: Create the dictation module** + +Create `ui/src/ui/dictation.ts`: + +```typescript +export type DictationState = + | "idle" + | "requesting-permission" + | "connecting" + | "recording" + | "error"; + +export type DictationTranscript = { + text: string; + isFinal: boolean; +}; + +export type DictationCallbacks = { + onStateChange: (state: DictationState) => void; + onTranscript: (transcript: DictationTranscript) => void; + onError: (error: string) => void; +}; + +type DeepgramResult = { + type: "Results"; + is_final?: boolean; + speech_final?: boolean; + channel?: { + alternatives?: Array<{ + transcript?: string; + }>; + }; +}; + +export class DictationClient { + private state: DictationState = "idle"; + private audioContext: AudioContext | null = null; + private mediaStream: MediaStream | null = null; + private workletNode: AudioWorkletNode | null = null; + private ws: WebSocket | null = null; + private callbacks: DictationCallbacks; + private gatewayUrl: string; + private interimText = ""; + + constructor(gatewayUrl: string, callbacks: DictationCallbacks) { + this.gatewayUrl = gatewayUrl; + this.callbacks = callbacks; + } + + get currentState(): DictationState { + return this.state; + } + + async start(): Promise { + if (this.state !== "idle" && this.state !== "error") { + return; + } + + this.setState("requesting-permission"); + + try { + this.mediaStream = await navigator.mediaDevices.getUserMedia({ + audio: { + 
channelCount: 1, + sampleRate: 16000, + echoCancellation: true, + noiseSuppression: true, + }, + }); + } catch (err) { + const error = err as Error; + if (error.name === "NotAllowedError" || error.name === "PermissionDeniedError") { + this.callbacks.onError("permission_denied"); + } else { + this.callbacks.onError(`Microphone error: ${error.message}`); + } + this.setState("error"); + return; + } + + this.setState("connecting"); + + try { + await this.connectWebSocket(); + await this.startAudioCapture(); + this.setState("recording"); + } catch (err) { + this.callbacks.onError(`Connection error: ${(err as Error).message}`); + this.cleanup(); + this.setState("error"); + } + } + + stop(): void { + if (this.state !== "recording") { + return; + } + + // Send finalize to get any remaining transcript + if (this.ws?.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: "Finalize" })); + } + + this.cleanup(); + this.setState("idle"); + } + + private setState(state: DictationState): void { + this.state = state; + this.callbacks.onStateChange(state); + } + + private async connectWebSocket(): Promise { + return new Promise((resolve, reject) => { + const wsUrl = this.gatewayUrl.replace(/^http/, "ws") + "/dictation/stream"; + this.ws = new WebSocket(wsUrl); + + const timeout = window.setTimeout(() => { + reject(new Error("Connection timeout")); + this.ws?.close(); + }, 10000); + + this.ws.onopen = () => { + window.clearTimeout(timeout); + resolve(); + }; + + this.ws.onerror = () => { + window.clearTimeout(timeout); + reject(new Error("WebSocket error")); + }; + + this.ws.onmessage = (event) => { + this.handleMessage(event.data); + }; + + this.ws.onclose = () => { + if (this.state === "recording") { + this.callbacks.onError("Connection closed unexpectedly"); + this.cleanup(); + this.setState("error"); + } + }; + }); + } + + private async startAudioCapture(): Promise { + this.audioContext = new AudioContext({ sampleRate: 16000 }); + + // Load the worklet processor 
+ const workletUrl = new URL("./audio-worklet-processor.ts", import.meta.url); + await this.audioContext.audioWorklet.addModule(workletUrl.href); + + const source = this.audioContext.createMediaStreamSource(this.mediaStream!); + this.workletNode = new AudioWorkletNode(this.audioContext, "pcm-capture-processor"); + + this.workletNode.port.onmessage = (event) => { + if (this.ws?.readyState === WebSocket.OPEN) { + this.ws.send(event.data); + } + }; + + source.connect(this.workletNode); + // Don't connect to destination - we don't want to play back the audio + } + + private handleMessage(data: string): void { + try { + const msg = JSON.parse(data) as DeepgramResult; + + if (msg.type === "Results") { + const transcript = msg.channel?.alternatives?.[0]?.transcript ?? ""; + + if (msg.is_final) { + // Final transcript for this utterance + this.interimText = ""; + this.callbacks.onTranscript({ text: transcript, isFinal: true }); + + // Check for end-of-thought (speech_final from Flux) + if (msg.speech_final) { + // Auto-stop on end of thought + this.stop(); + } + } else { + // Interim result + this.interimText = transcript; + this.callbacks.onTranscript({ text: transcript, isFinal: false }); + } + } + } catch { + // Ignore parse errors + } + } + + private cleanup(): void { + if (this.workletNode) { + this.workletNode.disconnect(); + this.workletNode = null; + } + + if (this.audioContext) { + this.audioContext.close().catch(() => {}); + this.audioContext = null; + } + + if (this.mediaStream) { + for (const track of this.mediaStream.getTracks()) { + track.stop(); + } + this.mediaStream = null; + } + + if (this.ws) { + if (this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: "CloseStream" })); + } + this.ws.close(); + this.ws = null; + } + + this.interimText = ""; + } +} + +export function isDictationSupported(): boolean { + return ( + typeof navigator !== "undefined" && + typeof navigator.mediaDevices !== "undefined" && + typeof 
navigator.mediaDevices.getUserMedia === "function" && + typeof AudioWorkletNode !== "undefined" + ); +} +``` + +**Step 2: Commit** + +```bash +git add ui/src/ui/dictation.ts +git commit -m "feat(ui): add dictation client with WebSocket and AudioWorklet" +``` + +--- + +## Task 5: Add mic icon to icons.ts + +**Files:** + +- Modify: `ui/src/ui/icons.ts` + +**Step 1: Add microphone icons** + +Add to `ui/src/ui/icons.ts` in the icons object: + +```typescript +mic: html` + + + + + +`, +micOff: html` + + + + + + + + +`, +``` + +**Step 2: Commit** + +```bash +git add ui/src/ui/icons.ts +git commit -m "feat(ui): add microphone icons for dictation" +``` + +--- + +## Task 6: Add dictation CSS styles + +**Files:** + +- Create: `ui/src/styles/dictation.css` +- Modify: `ui/src/styles/chat.css` + +**Step 1: Create dictation styles** + +Create `ui/src/styles/dictation.css`: + +```css +/* Dictation button states */ +.chat-dictation-btn { + display: flex; + align-items: center; + justify-content: center; + padding: 0.5rem; + border-radius: var(--radius-md); + transition: + background-color 0.15s, + color 0.15s; +} + +.chat-dictation-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.chat-dictation-btn--recording { + background-color: var(--color-danger); + color: white; + animation: pulse-recording 1.5s ease-in-out infinite; +} + +.chat-dictation-btn--recording:hover { + background-color: var(--color-danger-hover, #dc2626); +} + +@keyframes pulse-recording { + 0%, + 100% { + opacity: 1; + } + 50% { + opacity: 0.7; + } +} + +/* Interim text styling in textarea */ +.chat-compose__field textarea.has-interim::placeholder { + color: transparent; +} + +/* Permission modal */ +.dictation-permission-modal { + max-width: 28rem; +} + +.dictation-permission-modal__content { + display: flex; + flex-direction: column; + gap: 1rem; +} + +.dictation-permission-modal__browser-instructions { + background: var(--color-bg-subtle); + border-radius: var(--radius-md); + padding: 1rem; + 
font-size: 0.875rem; +} + +.dictation-permission-modal__browser-instructions h4 { + margin: 0 0 0.5rem; + font-weight: 600; +} + +.dictation-permission-modal__browser-instructions ol { + margin: 0; + padding-left: 1.25rem; +} + +.dictation-permission-modal__browser-instructions li { + margin-bottom: 0.25rem; +} + +.dictation-permission-modal__actions { + display: flex; + gap: 0.5rem; + justify-content: flex-end; +} +``` + +**Step 2: Import in chat.css** + +Add to `ui/src/styles/chat.css`: + +```css +@import "./dictation.css"; +``` + +**Step 3: Commit** + +```bash +git add ui/src/styles/dictation.css ui/src/styles/chat.css +git commit -m "feat(ui): add dictation button and modal styles" +``` + +--- + +## Task 7: Create mic permission modal component + +**Files:** + +- Create: `ui/src/ui/components/mic-permission-modal.ts` + +**Step 1: Create the modal component** + +Create `ui/src/ui/components/mic-permission-modal.ts`: + +```typescript +import { html, nothing } from "lit"; + +export type MicPermissionModalProps = { + open: boolean; + onClose: () => void; + onRetry: () => void; +}; + +function detectBrowser(): "chrome" | "safari" | "firefox" | "edge" | "other" { + const ua = navigator.userAgent.toLowerCase(); + if (ua.includes("edg/")) return "edge"; + if (ua.includes("chrome")) return "chrome"; + if (ua.includes("safari") && !ua.includes("chrome")) return "safari"; + if (ua.includes("firefox")) return "firefox"; + return "other"; +} + +function getBrowserInstructions(browser: ReturnType) { + switch (browser) { + case "chrome": + return html` +

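<div class="dictation-permission-modal__browser-instructions">
+ <h4>Chrome</h4>
+ <ol>
+ <li>Click the lock/tune icon in the address bar</li>
+ <li>Find "Microphone" in the permissions list</li>
+ <li>Change it to "Allow"</li>
+ <li>Refresh the page</li>
+ </ol>
+ </div>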
+ `; + case "safari": + return html` +

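<div class="dictation-permission-modal__browser-instructions">
+ <h4>Safari</h4>
+ <ol>
+ <li>Go to Safari → Settings → Websites</li>
+ <li>Click "Microphone" in the sidebar</li>
+ <li>Find this site and set to "Allow"</li>
+ <li>Refresh the page</li>
+ </ol>
+ </div>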
+ `; + case "firefox": + return html` +

<div class="dictation-permission-modal__browser-instructions">
+ <h4>Firefox</h4>
+ <ol>
+ <li>Click the lock icon in the address bar</li>
+ <li>Click "Connection secure" → "More information"</li>
+ <li>Go to Permissions tab</li>
+ <li>Find Microphone and uncheck "Use default"</li>
+ <li>Select "Allow" and refresh</li>
+ </ol>
+ </div>
+ `; + case "edge": + return html` +

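<div class="dictation-permission-modal__browser-instructions">
+ <h4>Edge</h4>
+ <ol>
+ <li>Click the lock icon in the address bar</li>
+ <li>Click "Permissions for this site"</li>
+ <li>Find "Microphone" and set to "Allow"</li>
+ <li>Refresh the page</li>
+ </ol>
+ </div>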
+ `; + default: + return html` +

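<div class="dictation-permission-modal__browser-instructions">
+ <h4>Browser Settings</h4>
+ <ol>
+ <li>Open your browser's site settings</li>
+ <li>Find microphone permissions</li>
+ <li>Allow this site to use your microphone</li>
+ <li>Refresh the page</li>
+ </ol>
+ </div>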
+ `; + } +} + +export function renderMicPermissionModal(props: MicPermissionModalProps) { + if (!props.open) { + return nothing; + } + + const browser = detectBrowser(); + + return html` + + `; +} +``` + +**Step 2: Commit** + +```bash +git add ui/src/ui/components/mic-permission-modal.ts +git commit -m "feat(ui): add microphone permission help modal" +``` + +--- + +## Task 8: Integrate dictation into chat view + +**Files:** + +- Modify: `ui/src/ui/views/chat.ts` +- Modify: `ui/src/ui/app-chat.ts` +- Modify: `ui/src/ui/app.ts` + +**Step 1: Add dictation props to ChatProps** + +In `ui/src/ui/views/chat.ts`, add to `ChatProps`: + +```typescript +// Add to ChatProps type +dictationEnabled?: boolean; +dictationState?: "idle" | "requesting-permission" | "connecting" | "recording" | "error"; +dictationError?: string | null; +showMicPermissionModal?: boolean; +onDictationToggle?: () => void; +onMicPermissionModalClose?: () => void; +onMicPermissionRetry?: () => void; +``` + +**Step 2: Add mic button to compose area** + +In the `renderChat` function, add the mic button before the "New session" button: + +```typescript +// Import at top +import { icons } from "../icons.ts"; +import { renderMicPermissionModal } from "../components/mic-permission-modal.ts"; + +// In the chat-compose__actions div, before the New session button: +${props.dictationEnabled !== false ? 
html` + +` : nothing} +``` + +**Step 3: Add permission modal render** + +At the end of the `renderChat` function, before the closing ``: + +```typescript +${renderMicPermissionModal({ + open: Boolean(props.showMicPermissionModal), + onClose: () => props.onMicPermissionModalClose?.(), + onRetry: () => props.onMicPermissionRetry?.(), +})} +``` + +**Step 4: Commit** + +```bash +git add ui/src/ui/views/chat.ts +git commit -m "feat(ui): add dictation button and modal to chat view" +``` + +--- + +## Task 9: Add dictation state management to app + +**Files:** + +- Modify: `ui/src/ui/app.ts` +- Modify: `ui/src/ui/app-chat.ts` + +**Step 1: Add dictation state to app** + +In the main app class, add state properties and handlers: + +```typescript +// Import at top +import { DictationClient, isDictationSupported, type DictationState } from "./dictation.ts"; + +// Add state properties +private dictationClient: DictationClient | null = null; +private dictationState: DictationState = "idle"; +private dictationEnabled = false; +private showMicPermissionModal = false; +private pendingDictationText = ""; +private finalDictationText = ""; + +// In the hello handler, check for dictation feature +if (hello.features?.dictation) { + this.dictationEnabled = isDictationSupported(); +} + +// Add dictation handlers +private handleDictationToggle = () => { + if (!this.dictationEnabled) return; + + if (this.dictationState === "recording") { + this.dictationClient?.stop(); + } else if (this.dictationState === "idle" || this.dictationState === "error") { + this.startDictation(); + } +}; + +private startDictation = () => { + if (!this.dictationClient) { + this.dictationClient = new DictationClient( + this.gatewayUrl, + { + onStateChange: (state) => { + this.dictationState = state; + this.requestUpdate(); + }, + onTranscript: ({ text, isFinal }) => { + if (isFinal) { + // Append final text to message + const existing = this.chatMessage.trimEnd(); + const spacer = existing && !existing.endsWith(" 
") ? " " : ""; + this.chatMessage = existing + spacer + text; + this.pendingDictationText = ""; + } else { + // Show interim text + this.pendingDictationText = text; + } + this.requestUpdate(); + }, + onError: (error) => { + if (error === "permission_denied") { + this.showMicPermissionModal = true; + } + this.requestUpdate(); + }, + } + ); + } + this.dictationClient.start(); +}; + +private handleMicPermissionModalClose = () => { + this.showMicPermissionModal = false; + this.requestUpdate(); +}; + +private handleMicPermissionRetry = () => { + this.showMicPermissionModal = false; + this.startDictation(); +}; +``` + +**Step 2: Add keyboard shortcut handler** + +Add to the app's keyboard event handling: + +```typescript +// In connectedCallback or init +document.addEventListener("keydown", this.handleGlobalKeydown); + +// Handler +private handleGlobalKeydown = (e: KeyboardEvent) => { + // Cmd/Ctrl + Shift + D for dictation + if ((e.metaKey || e.ctrlKey) && e.shiftKey && e.key.toLowerCase() === "d") { + e.preventDefault(); + this.handleDictationToggle(); + } +}; + +// In disconnectedCallback +document.removeEventListener("keydown", this.handleGlobalKeydown); +``` + +**Step 3: Pass props to chat view** + +When calling `renderChat`, add the dictation props: + +```typescript +dictationEnabled: this.dictationEnabled, +dictationState: this.dictationState, +showMicPermissionModal: this.showMicPermissionModal, +onDictationToggle: this.handleDictationToggle, +onMicPermissionModalClose: this.handleMicPermissionModalClose, +onMicPermissionRetry: this.handleMicPermissionRetry, +``` + +**Step 4: Commit** + +```bash +git add ui/src/ui/app.ts ui/src/ui/app-chat.ts +git commit -m "feat(ui): integrate dictation state management and keyboard shortcut" +``` + +--- + +## Task 10: Add tests for dictation client + +**Files:** + +- Create: `ui/src/ui/dictation.test.ts` + +**Step 1: Write unit tests** + +Create `ui/src/ui/dictation.test.ts`: + +```typescript +import { describe, expect, it, vi 
} from "vitest"; +import { isDictationSupported } from "./dictation.ts"; + +describe("dictation", () => { + describe("isDictationSupported", () => { + it("returns true when getUserMedia and AudioWorkletNode are available", () => { + // Mock browser APIs + const originalNavigator = global.navigator; + const originalAudioWorkletNode = global.AudioWorkletNode; + + Object.defineProperty(global, "navigator", { + value: { + mediaDevices: { + getUserMedia: vi.fn(), + }, + }, + writable: true, + }); + Object.defineProperty(global, "AudioWorkletNode", { + value: class {}, + writable: true, + }); + + expect(isDictationSupported()).toBe(true); + + // Restore + Object.defineProperty(global, "navigator", { value: originalNavigator, writable: true }); + Object.defineProperty(global, "AudioWorkletNode", { + value: originalAudioWorkletNode, + writable: true, + }); + }); + + it("returns false when getUserMedia is not available", () => { + const originalNavigator = global.navigator; + + Object.defineProperty(global, "navigator", { + value: { + mediaDevices: undefined, + }, + writable: true, + }); + + expect(isDictationSupported()).toBe(false); + + Object.defineProperty(global, "navigator", { value: originalNavigator, writable: true }); + }); + }); +}); +``` + +**Step 2: Run tests** + +```bash +cd open_claw && pnpm test ui/src/ui/dictation.test.ts +``` + +**Step 3: Commit** + +```bash +git add ui/src/ui/dictation.test.ts +git commit -m "test(ui): add dictation client unit tests" +``` + +--- + +## Task 11: Add tests for gateway dictation handler + +**Files:** + +- Create: `src/gateway/server-dictation.test.ts` + +**Step 1: Write unit tests** + +Create `src/gateway/server-dictation.test.ts`: + +```typescript +import { describe, expect, it, vi } from "vitest"; + +// Mock the imports +vi.mock("./server-dictation.js", async () => { + const actual = await vi.importActual("./server-dictation.js"); + return actual; +}); + +describe("server-dictation", () => { + it("module exports 
createDictationUpgradeHandler", async () => { + const mod = await import("./server-dictation.js"); + expect(typeof mod.createDictationUpgradeHandler).toBe("function"); + }); +}); +``` + +**Step 2: Run tests** + +```bash +cd open_claw && pnpm test src/gateway/server-dictation.test.ts +``` + +**Step 3: Commit** + +```bash +git add src/gateway/server-dictation.test.ts +git commit -m "test(gateway): add dictation handler tests" +``` + +--- + +## Task 12: Update GatewayHelloOk type in UI + +**Files:** + +- Modify: `ui/src/ui/gateway.ts` + +**Step 1: Add dictation to features type** + +Update the `GatewayHelloOk` type: + +```typescript +export type GatewayHelloOk = { + type: "hello-ok"; + protocol: number; + features?: { methods?: string[]; events?: string[]; dictation?: boolean }; + snapshot?: unknown; + auth?: { + deviceToken?: string; + role?: string; + scopes?: string[]; + issuedAtMs?: number; + }; + policy?: { tickIntervalMs?: number }; +}; +``` + +**Step 2: Commit** + +```bash +git add ui/src/ui/gateway.ts +git commit -m "feat(ui): add dictation to GatewayHelloOk type" +``` + +--- + +## Task 13: Build and verify + +**Step 1: Run full build** + +```bash +cd open_claw && pnpm build +``` + +**Step 2: Run linter** + +```bash +cd open_claw && pnpm check +``` + +**Step 3: Run all tests** + +```bash +cd open_claw && pnpm test +``` + +**Step 4: Fix any issues** + +Address any build errors, lint issues, or test failures. + +**Step 5: Commit fixes if needed** + +```bash +git add -A +git commit -m "fix: address build and test issues for dictation feature" +``` + +--- + +## Task 14: Manual testing checklist + +1. Start gateway with `DEEPGRAM_API_KEY` configured +2. Open web UI +3. Verify mic button appears in compose area +4. Click mic button - should request permission +5. Grant permission - should start recording (button pulses red) +6. Speak - should see interim text appear +7. Stop speaking - Flux should detect end-of-thought and stop automatically +8. 
Verify transcript is in textarea +9. Test `Cmd/Ctrl+Shift+D` shortcut +10. Test without API key - button should be disabled with tooltip +11. Test permission denial - modal should appear with instructions + +--- + +## Summary of files created/modified + +**New files:** + +- `src/gateway/server-dictation.ts` +- `src/gateway/server-dictation.test.ts` +- `ui/src/ui/dictation.ts` +- `ui/src/ui/dictation.test.ts` +- `ui/src/ui/audio-worklet-processor.ts` +- `ui/src/ui/components/mic-permission-modal.ts` +- `ui/src/styles/dictation.css` + +**Modified files:** + +- `src/gateway/protocol/schema/frames.ts` +- `src/gateway/server/ws-connection/message-handler.ts` +- `src/gateway/server-http.ts` +- `ui/src/ui/icons.ts` +- `ui/src/styles/chat.css` +- `ui/src/ui/views/chat.ts` +- `ui/src/ui/gateway.ts` +- `ui/src/ui/app.ts` +- `ui/src/ui/app-chat.ts` diff --git a/extensions/telnyx-sms/index.ts b/extensions/telnyx-sms/index.ts new file mode 100644 index 000000000..cc4603e47 --- /dev/null +++ b/extensions/telnyx-sms/index.ts @@ -0,0 +1,19 @@ +import type { OpenClawPluginApi } from "openclaw/plugin-sdk"; +import { emptyPluginConfigSchema } from "openclaw/plugin-sdk"; +import { telnyxSmsDock, telnyxSmsPlugin } from "./src/channel.js"; +import { handleTelnyxSmsWebhookRequest } from "./src/monitor.js"; +import { setTelnyxSmsRuntime } from "./src/runtime.js"; + +const plugin = { + id: "telnyx-sms", + name: "Telnyx SMS", + description: "OpenClaw Telnyx SMS channel plugin", + configSchema: emptyPluginConfigSchema(), + register(api: OpenClawPluginApi) { + setTelnyxSmsRuntime(api.runtime); + api.registerChannel({ plugin: telnyxSmsPlugin, dock: telnyxSmsDock }); + api.registerHttpHandler(handleTelnyxSmsWebhookRequest); + }, +}; + +export default plugin; diff --git a/extensions/telnyx-sms/openclaw.plugin.json b/extensions/telnyx-sms/openclaw.plugin.json new file mode 100644 index 000000000..4953f915c --- /dev/null +++ b/extensions/telnyx-sms/openclaw.plugin.json @@ -0,0 +1,9 @@ +{ + "id": 
"telnyx-sms", + "channels": ["telnyx-sms"], + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/telnyx-sms/package.json b/extensions/telnyx-sms/package.json new file mode 100644 index 000000000..c27b637d1 --- /dev/null +++ b/extensions/telnyx-sms/package.json @@ -0,0 +1,35 @@ +{ + "name": "@openclaw/telnyx-sms", + "version": "2026.2.6-1", + "description": "OpenClaw Telnyx SMS channel plugin", + "type": "module", + "devDependencies": { + "openclaw": "workspace:*" + }, + "peerDependencies": { + "openclaw": ">=2026.1.26" + }, + "openclaw": { + "extensions": [ + "./index.ts" + ], + "channel": { + "id": "telnyx-sms", + "label": "Telnyx SMS", + "selectionLabel": "Telnyx SMS", + "detailLabel": "Telnyx SMS", + "docsPath": "/channels/telnyx-sms", + "docsLabel": "telnyx-sms", + "blurb": "SMS messaging via Telnyx Messaging API.", + "aliases": [ + "sms" + ], + "order": 76 + }, + "install": { + "npmSpec": "@openclaw/telnyx-sms", + "localPath": "extensions/telnyx-sms", + "defaultChoice": "npm" + } + } +} diff --git a/extensions/telnyx-sms/src/accounts.ts b/extensions/telnyx-sms/src/accounts.ts new file mode 100644 index 000000000..b46cc35a6 --- /dev/null +++ b/extensions/telnyx-sms/src/accounts.ts @@ -0,0 +1,155 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk"; +import { DEFAULT_ACCOUNT_ID, normalizeAccountId } from "openclaw/plugin-sdk"; + +export type TelnyxSmsCredentialSource = "config" | "env" | "none"; + +export type TelnyxSmsAccountConfig = { + enabled?: boolean; + name?: string; + apiKey?: string; + phoneNumber?: string; + publicKey?: string; + webhookPath?: string; + dm?: { + policy?: string; + allowFrom?: Array; + enabled?: boolean; + }; + textChunkLimit?: number; + mediaMaxMb?: number; +}; + +export type ResolvedTelnyxSmsAccount = { + accountId: string; + name?: string; + enabled: boolean; + config: TelnyxSmsAccountConfig; + credentialSource: TelnyxSmsCredentialSource; + apiKey: string 
/**
 * Picks the account id to use when the caller did not name one.
 *
 * Resolution order:
 *   1. `channels.telnyx-sms.defaultAccount`, when it is a non-blank string;
 *   2. `DEFAULT_ACCOUNT_ID`, when it is among the listed account ids;
 *   3. the first listed account id;
 *   4. `DEFAULT_ACCOUNT_ID` as a last resort.
 *
 * @param cfg - Loaded OpenClaw configuration.
 * @returns The resolved account id; never empty.
 */
export function resolveDefaultTelnyxSmsAccountId(cfg: OpenClawConfig): string {
  // Config is user-authored data: a non-string `defaultAccount` (number, object, ...)
  // must be ignored rather than crash on `.trim()`.
  const configured: unknown = getChannelSection(cfg)?.defaultAccount;
  const defaultAccount = typeof configured === "string" ? configured.trim() : "";
  if (defaultAccount) {
    return defaultAccount;
  }

  const ids = listTelnyxSmsAccountIds(cfg);
  if (ids.includes(DEFAULT_ACCOUNT_ID)) {
    return DEFAULT_ACCOUNT_ID;
  }
  return ids[0] ?? DEFAULT_ACCOUNT_ID;
}
/**
 * Resolves the Telnyx credentials for one account.
 *
 * The API key decides the credential `source`: a key in the account config wins
 * ("config"); otherwise, for the default account only, `TELNYX_API_KEY` is used
 * ("env"). Without an API key nothing is returned ("none").
 *
 * Phone number and public key are taken from the account config first. For the
 * default account they fall back to `TELNYX_PHONE_NUMBER` / `TELNYX_PUBLIC_KEY`
 * regardless of where the API key came from. Previously the fallback was skipped
 * when the API key lived in config, so setting the env var did not supply the
 * value. Named accounts never read environment variables.
 *
 * @param params.accountId - Normalized account id.
 * @param params.account - Merged (base + per-account) config for that account.
 * @returns The resolved credentials and where the API key was found.
 */
function resolveCredentials(params: { accountId: string; account: TelnyxSmsAccountConfig }): {
  apiKey: string | undefined;
  phoneNumber: string | undefined;
  publicKey: string | undefined;
  source: TelnyxSmsCredentialSource;
} {
  const { account, accountId } = params;

  // Environment variables are process-wide, so they can only describe one account.
  const envAllowed = accountId === DEFAULT_ACCOUNT_ID;
  const readEnv = (name: string): string | undefined =>
    envAllowed ? process.env[name]?.trim() || undefined : undefined;

  const configApiKey = account.apiKey?.trim();
  const apiKey = configApiKey || readEnv(ENV_API_KEY);
  if (!apiKey) {
    return { apiKey: undefined, phoneNumber: undefined, publicKey: undefined, source: "none" };
  }

  return {
    apiKey,
    phoneNumber: account.phoneNumber?.trim() || readEnv(ENV_PHONE_NUMBER),
    publicKey: account.publicKey?.trim() || readEnv(ENV_PUBLIC_KEY),
    source: configApiKey ? "config" : "env",
  };
}
.filter((account) => account.enabled); +} diff --git a/extensions/telnyx-sms/src/api.ts b/extensions/telnyx-sms/src/api.ts new file mode 100644 index 000000000..f2b10fb4c --- /dev/null +++ b/extensions/telnyx-sms/src/api.ts @@ -0,0 +1,77 @@ +import type { ResolvedTelnyxSmsAccount } from "./accounts.js"; +import type { TelnyxSmsSendResponse } from "./types.js"; + +const TELNYX_API_BASE = "https://api.telnyx.com/v2"; + +export async function sendTelnyxSms(params: { + account: ResolvedTelnyxSmsAccount; + to: string; + text?: string; + mediaUrls?: string[]; +}): Promise { + const { account, to, text, mediaUrls } = params; + if (!account.apiKey) { + throw new Error("Telnyx API key is not configured."); + } + if (!account.phoneNumber) { + throw new Error("Telnyx phone number is not configured."); + } + + const body: Record = { + from: account.phoneNumber, + to, + text: text ?? "", + }; + + if (mediaUrls && mediaUrls.length > 0) { + body.media_urls = mediaUrls; + body.type = "MMS"; + } + + const response = await fetch(`${TELNYX_API_BASE}/messages`, { + method: "POST", + headers: { + Authorization: `Bearer ${account.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Telnyx API error: ${response.status} ${errorText}`); + } + + return (await response.json()) as TelnyxSmsSendResponse; +} + +export async function probeTelnyxSms(account: ResolvedTelnyxSmsAccount): Promise<{ + ok: boolean; + status?: number; + error?: string; +}> { + if (!account.apiKey) { + return { ok: false, error: "API key not configured" }; + } + + try { + const response = await fetch(`${TELNYX_API_BASE}/messaging_profiles`, { + method: "GET", + headers: { + Authorization: `Bearer ${account.apiKey}`, + "Content-Type": "application/json", + }, + }); + + if (!response.ok) { + return { ok: false, status: response.status, error: `HTTP ${response.status}` }; + } + + return { ok: true }; + } 
catch (err) { + return { + ok: false, + error: err instanceof Error ? err.message : String(err), + }; + } +} diff --git a/extensions/telnyx-sms/src/channel.ts b/extensions/telnyx-sms/src/channel.ts new file mode 100644 index 000000000..f59e97f5a --- /dev/null +++ b/extensions/telnyx-sms/src/channel.ts @@ -0,0 +1,471 @@ +import { + applyAccountNameToChannelSection, + buildChannelConfigSchema, + DEFAULT_ACCOUNT_ID, + deleteAccountFromConfigSection, + formatPairingApproveHint, + migrateBaseNameToDefaultAccount, + missingTargetError, + normalizeAccountId, + normalizeE164, + PAIRING_APPROVED_MESSAGE, + setAccountEnabledInConfigSection, + type ChannelDock, + type ChannelPlugin, + type OpenClawConfig, +} from "openclaw/plugin-sdk"; +import { + listTelnyxSmsAccountIds, + resolveDefaultTelnyxSmsAccountId, + resolveTelnyxSmsAccount, + type ResolvedTelnyxSmsAccount, +} from "./accounts.js"; +import { probeTelnyxSms, sendTelnyxSms } from "./api.js"; +import { resolveTelnyxSmsWebhookPath, startTelnyxSmsMonitor } from "./monitor.js"; +import { getTelnyxSmsRuntime } from "./runtime.js"; + +const channel = "telnyx-sms"; + +const meta = { + id: channel, + label: "Telnyx SMS", + selectionLabel: "Telnyx SMS", + docsPath: "/channels/telnyx-sms", + docsLabel: "telnyx-sms", + blurb: "SMS messaging via Telnyx Messaging API.", + aliases: ["sms"], + order: 76, + quickstartAllowFrom: true, +}; + +function normalizePhone(raw: string): string { + return normalizeE164(raw.replace(/^(telnyx-sms|sms):/i, "")); +} + +function looksLikePhone(raw: string): boolean { + const cleaned = raw + .trim() + .replace(/^(telnyx-sms|sms):/i, "") + .trim(); + return /^\+?\d{10,15}$/.test(cleaned); +} + +export const telnyxSmsDock: ChannelDock = { + id: channel, + capabilities: { + chatTypes: ["direct"], + reactions: false, + media: true, + threads: false, + blockStreaming: true, + }, + outbound: { textChunkLimit: 1600 }, + config: { + resolveAllowFrom: ({ cfg, accountId }) => + (resolveTelnyxSmsAccount({ cfg, 
accountId }).config.dm?.allowFrom ?? []).map((entry) => + String(entry), + ), + formatAllowFrom: ({ allowFrom }) => + allowFrom + .map((entry) => String(entry)) + .filter(Boolean) + .map((entry) => normalizePhone(entry)), + }, +}; + +// Build a minimal config schema — Telnyx SMS has no Zod schema in core +const telnyxSmsConfigSchema = buildChannelConfigSchema({ + // Empty schema — no additional validation beyond what buildChannelConfigSchema provides +}); + +export const telnyxSmsPlugin: ChannelPlugin = { + id: channel, + meta: { ...meta }, + capabilities: { + chatTypes: ["direct"], + reactions: false, + media: true, + threads: false, + nativeCommands: false, + blockStreaming: true, + }, + streaming: { + blockStreamingCoalesceDefaults: { minChars: 1200, idleMs: 800 }, + }, + reload: { configPrefixes: ["channels.telnyx-sms"] }, + configSchema: telnyxSmsConfigSchema, + config: { + listAccountIds: (cfg) => listTelnyxSmsAccountIds(cfg), + resolveAccount: (cfg, accountId) => resolveTelnyxSmsAccount({ cfg, accountId }), + defaultAccountId: (cfg) => resolveDefaultTelnyxSmsAccountId(cfg), + setAccountEnabled: ({ cfg, accountId, enabled }) => + setAccountEnabledInConfigSection({ + cfg, + sectionKey: "telnyx-sms", + accountId, + enabled, + allowTopLevel: true, + }), + deleteAccount: ({ cfg, accountId }) => + deleteAccountFromConfigSection({ + cfg, + sectionKey: "telnyx-sms", + accountId, + clearBaseFields: ["apiKey", "phoneNumber", "publicKey", "webhookPath", "name"], + }), + isConfigured: (account) => account.credentialSource !== "none", + describeAccount: (account) => ({ + accountId: account.accountId, + name: account.name, + enabled: account.enabled, + configured: account.credentialSource !== "none", + credentialSource: account.credentialSource, + }), + resolveAllowFrom: ({ cfg, accountId }) => + (resolveTelnyxSmsAccount({ cfg, accountId }).config.dm?.allowFrom ?? 
[]).map((entry) => + String(entry), + ), + formatAllowFrom: ({ allowFrom }) => + allowFrom + .map((entry) => String(entry)) + .filter(Boolean) + .map((entry) => normalizePhone(entry)), + }, + pairing: { + idLabel: "phoneNumber", + normalizeAllowEntry: (entry) => normalizePhone(entry), + notifyApproval: async ({ cfg, id }) => { + const account = resolveTelnyxSmsAccount({ cfg }); + if (account.credentialSource === "none") { + return; + } + const to = normalizePhone(id); + await sendTelnyxSms({ account, to, text: PAIRING_APPROVED_MESSAGE }); + }, + }, + security: { + resolveDmPolicy: ({ cfg, accountId, account }) => { + const resolvedAccountId = accountId ?? account.accountId ?? DEFAULT_ACCOUNT_ID; + const section = cfg.channels?.["telnyx-sms"] as Record | undefined; + const useAccountPath = Boolean( + (section?.accounts as Record | undefined)?.[resolvedAccountId], + ); + const allowFromPath = useAccountPath + ? `channels.telnyx-sms.accounts.${resolvedAccountId}.dm.` + : "channels.telnyx-sms.dm."; + return { + policy: account.config.dm?.policy ?? "pairing", + allowFrom: account.config.dm?.allowFrom ?? [], + allowFromPath, + approveHint: formatPairingApproveHint("telnyx-sms"), + normalizeEntry: (raw: string) => normalizePhone(raw), + }; + }, + collectWarnings: ({ account }) => { + const warnings: string[] = []; + if (account.config.dm?.policy === "open") { + warnings.push( + `- Telnyx SMS DMs are open to anyone. Set channels.telnyx-sms.dm.policy="pairing" or "allowlist".`, + ); + } + return warnings; + }, + }, + messaging: { + normalizeTarget: (raw) => { + const cleaned = raw + .trim() + .replace(/^(telnyx-sms|sms):/i, "") + .trim(); + if (!cleaned) { + return null; + } + return normalizeE164(cleaned); + }, + targetResolver: { + looksLikeId: (raw, normalized) => { + const value = normalized ?? 
raw.trim(); + return looksLikePhone(value); + }, + hint: "<+1XXXXXXXXXX>", + }, + }, + directory: { + self: async () => null, + listPeers: async ({ cfg, accountId, query, limit }) => { + const account = resolveTelnyxSmsAccount({ cfg, accountId }); + const q = query?.trim().toLowerCase() || ""; + const allowFrom = account.config.dm?.allowFrom ?? []; + const peers = Array.from( + new Set( + allowFrom + .map((entry) => String(entry).trim()) + .filter((entry) => Boolean(entry) && entry !== "*") + .map((entry) => normalizePhone(entry)), + ), + ) + .filter((id) => (q ? id.toLowerCase().includes(q) : true)) + .slice(0, limit && limit > 0 ? limit : undefined) + .map((id) => ({ kind: "user", id }) as const); + return peers; + }, + }, + resolver: { + resolveTargets: async ({ inputs, kind }) => { + const resolved = inputs.map((input) => { + const cleaned = input + .trim() + .replace(/^(telnyx-sms|sms):/i, "") + .trim(); + if (!cleaned) { + return { input, resolved: false, note: "empty target" }; + } + const normalized = normalizeE164(cleaned); + if (kind === "user" && looksLikePhone(cleaned)) { + return { input, resolved: true, id: normalized }; + } + return { + input, + resolved: false, + note: "use E.164 phone format (+1XXXXXXXXXX)", + }; + }); + return resolved; + }, + }, + setup: { + resolveAccountId: ({ accountId }) => normalizeAccountId(accountId), + applyAccountName: ({ cfg, accountId, name }) => + applyAccountNameToChannelSection({ + cfg, + channelKey: "telnyx-sms", + accountId, + name, + }), + validateInput: ({ accountId, input }) => { + if (input.useEnv && accountId !== DEFAULT_ACCOUNT_ID) { + return "TELNYX_API_KEY env vars can only be used for the default account."; + } + if (!input.useEnv && !input.token) { + return "Telnyx SMS requires --token (API key) or TELNYX_API_KEY env var."; + } + return null; + }, + applyAccountConfig: ({ cfg, accountId, input }) => { + const namedConfig = applyAccountNameToChannelSection({ + cfg, + channelKey: "telnyx-sms", + accountId, 
+ name: input.name, + }); + const next = + accountId !== DEFAULT_ACCOUNT_ID + ? migrateBaseNameToDefaultAccount({ + cfg: namedConfig, + channelKey: "telnyx-sms", + }) + : namedConfig; + const patch = input.useEnv ? {} : input.token ? { apiKey: input.token } : {}; + const phoneNumber = input.phoneNumber?.trim(); + const webhookPath = input.webhookPath?.trim(); + const configPatch = { + ...patch, + ...(phoneNumber ? { phoneNumber } : {}), + ...(webhookPath ? { webhookPath } : {}), + }; + if (accountId === DEFAULT_ACCOUNT_ID) { + return { + ...next, + channels: { + ...next.channels, + "telnyx-sms": { + ...next.channels?.["telnyx-sms"], + enabled: true, + ...configPatch, + }, + }, + } as OpenClawConfig; + } + return { + ...next, + channels: { + ...next.channels, + "telnyx-sms": { + ...next.channels?.["telnyx-sms"], + enabled: true, + accounts: { + ...(next.channels?.["telnyx-sms"] as Record | undefined)?.accounts, + [accountId]: { + ...( + (next.channels?.["telnyx-sms"] as Record | undefined) + ?.accounts as Record | undefined + )?.[accountId], + enabled: true, + ...configPatch, + }, + }, + }, + }, + } as OpenClawConfig; + }, + }, + outbound: { + deliveryMode: "direct", + chunker: (text, limit) => getTelnyxSmsRuntime().channel.text.chunkMarkdownText(text, limit), + chunkerMode: "markdown", + textChunkLimit: 1600, + resolveTarget: ({ to, allowFrom, mode }) => { + const trimmed = to?.trim() ?? ""; + const allowListRaw = (allowFrom ?? 
[]).map((entry) => String(entry).trim()).filter(Boolean); + const allowList = allowListRaw + .filter((entry) => entry !== "*") + .map((entry) => normalizePhone(entry)) + .filter(Boolean); + + if (trimmed) { + const normalized = normalizePhone(trimmed); + if (!normalized || !looksLikePhone(normalized)) { + if ((mode === "implicit" || mode === "heartbeat") && allowList.length > 0) { + return { ok: true, to: allowList[0] }; + } + return { + ok: false, + error: missingTargetError( + "Telnyx SMS", + "<+1XXXXXXXXXX> or channels.telnyx-sms.dm.allowFrom[0]", + ), + }; + } + return { ok: true, to: normalized }; + } + + if (allowList.length > 0) { + return { ok: true, to: allowList[0] }; + } + return { + ok: false, + error: missingTargetError( + "Telnyx SMS", + "<+1XXXXXXXXXX> or channels.telnyx-sms.dm.allowFrom[0]", + ), + }; + }, + sendText: async ({ cfg, to, text, accountId }) => { + const account = resolveTelnyxSmsAccount({ cfg, accountId }); + const normalizedTo = normalizePhone(to); + const result = await sendTelnyxSms({ account, to: normalizedTo, text }); + return { + channel: "telnyx-sms", + messageId: result?.data?.id ?? "", + chatId: normalizedTo, + }; + }, + sendMedia: async ({ cfg, to, text, mediaUrl, accountId }) => { + if (!mediaUrl) { + throw new Error("Telnyx SMS mediaUrl is required."); + } + const account = resolveTelnyxSmsAccount({ cfg, accountId }); + const normalizedTo = normalizePhone(to); + const result = await sendTelnyxSms({ + account, + to: normalizedTo, + text, + mediaUrls: [mediaUrl], + }); + return { + channel: "telnyx-sms", + messageId: result?.data?.id ?? "", + chatId: normalizedTo, + }; + }, + }, + status: { + defaultRuntime: { + accountId: DEFAULT_ACCOUNT_ID, + running: false, + lastStartAt: null, + lastStopAt: null, + lastError: null, + }, + collectStatusIssues: (accounts) => + accounts.flatMap((entry) => { + const accountId = String(entry.accountId ?? 
DEFAULT_ACCOUNT_ID); + const enabled = entry.enabled !== false; + const configured = entry.configured === true; + if (!enabled || !configured) { + return []; + } + const issues = []; + if (!entry.phoneNumber) { + issues.push({ + channel: "telnyx-sms", + accountId, + kind: "config", + message: + "Telnyx phone number is missing (set channels.telnyx-sms.phoneNumber or TELNYX_PHONE_NUMBER).", + fix: "Set channels.telnyx-sms.phoneNumber or TELNYX_PHONE_NUMBER env var.", + }); + } + return issues; + }), + buildChannelSummary: ({ snapshot }) => ({ + configured: snapshot.configured ?? false, + credentialSource: snapshot.credentialSource ?? "none", + phoneNumber: snapshot.phoneNumber ?? null, + webhookPath: snapshot.webhookPath ?? null, + running: snapshot.running ?? false, + lastStartAt: snapshot.lastStartAt ?? null, + lastStopAt: snapshot.lastStopAt ?? null, + lastError: snapshot.lastError ?? null, + probe: snapshot.probe, + lastProbeAt: snapshot.lastProbeAt ?? null, + }), + probeAccount: async ({ account }) => probeTelnyxSms(account), + buildAccountSnapshot: ({ account, runtime, probe }) => ({ + accountId: account.accountId, + name: account.name, + enabled: account.enabled, + configured: account.credentialSource !== "none", + credentialSource: account.credentialSource, + phoneNumber: account.phoneNumber, + webhookPath: account.config.webhookPath, + running: runtime?.running ?? false, + lastStartAt: runtime?.lastStartAt ?? null, + lastStopAt: runtime?.lastStopAt ?? null, + lastError: runtime?.lastError ?? null, + lastInboundAt: runtime?.lastInboundAt ?? null, + lastOutboundAt: runtime?.lastOutboundAt ?? null, + dmPolicy: account.config.dm?.policy ?? 
"pairing", + probe, + }), + }, + gateway: { + startAccount: async (ctx) => { + const account = ctx.account; + ctx.log?.info(`[${account.accountId}] starting Telnyx SMS webhook`); + ctx.setStatus({ + accountId: account.accountId, + running: true, + lastStartAt: Date.now(), + webhookPath: resolveTelnyxSmsWebhookPath({ account }), + phoneNumber: account.phoneNumber, + }); + const unregister = await startTelnyxSmsMonitor({ + account, + config: ctx.cfg, + runtime: ctx.runtime, + abortSignal: ctx.abortSignal, + webhookPath: account.config.webhookPath, + statusSink: (patch) => ctx.setStatus({ accountId: account.accountId, ...patch }), + }); + return () => { + unregister?.(); + ctx.setStatus({ + accountId: account.accountId, + running: false, + lastStopAt: Date.now(), + }); + }; + }, + }, +}; diff --git a/extensions/telnyx-sms/src/monitor.ts b/extensions/telnyx-sms/src/monitor.ts new file mode 100644 index 000000000..6f2146636 --- /dev/null +++ b/extensions/telnyx-sms/src/monitor.ts @@ -0,0 +1,553 @@ +import type { IncomingMessage, ServerResponse } from "node:http"; +import type { OpenClawConfig } from "openclaw/plugin-sdk"; +import crypto from "node:crypto"; +import { createReplyPrefixOptions, normalizeE164 } from "openclaw/plugin-sdk"; +import type { ResolvedTelnyxSmsAccount } from "./accounts.js"; +import type { TelnyxSmsWebhookEvent } from "./types.js"; +import { sendTelnyxSms } from "./api.js"; +import { getTelnyxSmsRuntime } from "./runtime.js"; + +export type TelnyxSmsRuntimeEnv = { + log?: (message: string) => void; + error?: (message: string) => void; +}; + +export type TelnyxSmsMonitorOptions = { + account: ResolvedTelnyxSmsAccount; + config: OpenClawConfig; + runtime: TelnyxSmsRuntimeEnv; + abortSignal: AbortSignal; + webhookPath?: string; + statusSink?: (patch: { lastInboundAt?: number; lastOutboundAt?: number }) => void; +}; + +type TelnyxSmsCoreRuntime = ReturnType; + +type WebhookTarget = { + account: ResolvedTelnyxSmsAccount; + config: OpenClawConfig; + 
/**
 * Canonicalizes a webhook path so that registration and request lookup use the
 * same map key.
 *
 * - Surrounding whitespace is trimmed; a blank path becomes "/".
 * - A leading "/" is added when missing.
 * - Every trailing "/" is removed (the root path "/" is kept as is).
 *
 * @param raw - Configured webhook path or request pathname.
 * @returns The canonical path, always starting with "/".
 */
function normalizeWebhookPath(raw: string): string {
  const trimmed = raw.trim();
  if (!trimmed) {
    return "/";
  }
  const withSlash = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;

  // Drop all trailing slashes, not just one: "/hook//" and "/hook/" must map to the
  // same key. A plain scan is used instead of a regex because request pathnames are
  // untrusted input and this stays linear on long runs of "/".
  let end = withSlash.length;
  while (end > 1 && withSlash.charCodeAt(end - 1) === 47 /* "/" */) {
    end -= 1;
  }
  return withSlash.slice(0, end);
}
/** DER header of an SPKI-wrapped Ed25519 public key (RFC 8410); the 32 raw key bytes follow it. */
const TELNYX_ED25519_SPKI_PREFIX = Buffer.from("302a300506032b6570032100", "hex");
/** Length in bytes of a raw (unwrapped) Ed25519 public key. */
const TELNYX_ED25519_RAW_KEY_BYTES = 32;
/** Maximum accepted distance between the signed timestamp and the local clock. */
const TELNYX_MAX_TIMESTAMP_SKEW_MS = 5 * 60 * 1000;

/**
 * Verifies the Ed25519 signature Telnyx attaches to a webhook delivery.
 *
 * The signed message is `${timestamp}|${rawBody}`. The signature and the public
 * key are base64 strings. The public key may be either the raw 32-byte Ed25519
 * key or a DER/SPKI-encoded key; a raw key is wrapped in the SPKI header before
 * it is handed to `crypto.verify`, which cannot import raw key bytes directly.
 *
 * @param params.publicKey - Configured public key (base64). When absent,
 *   verification is skipped and the request is accepted.
 * @param params.signature - Value of the signature header (base64).
 * @param params.timestamp - Value of the timestamp header (Unix seconds).
 * @param params.rawBody - Exact request body as received.
 * @returns `{ ok: true }` when accepted; otherwise `{ ok: false, reason }`.
 *   Never throws: key/crypto failures are reported through `reason`.
 */
function verifyTelnyxWebhook(params: {
  publicKey: string | undefined;
  signature: string | undefined;
  timestamp: string | undefined;
  rawBody: string;
}): { ok: boolean; reason?: string } {
  const { publicKey, signature, timestamp, rawBody } = params;

  if (!publicKey) {
    // No public key configured — allow unsigned (warn in logs)
    return { ok: true, reason: "verification skipped (no public key configured)" };
  }

  if (!signature || !timestamp) {
    return { ok: false, reason: "Missing signature or timestamp header" };
  }

  try {
    const decodedKey = Buffer.from(publicKey, "base64");
    // A 32-byte key is the bare Ed25519 point; anything else is assumed to be SPKI DER already.
    const spkiKey =
      decodedKey.length === TELNYX_ED25519_RAW_KEY_BYTES
        ? Buffer.concat([TELNYX_ED25519_SPKI_PREFIX, decodedKey])
        : decodedKey;

    const isValid = crypto.verify(
      null, // Ed25519 doesn't use a digest
      Buffer.from(`${timestamp}|${rawBody}`),
      { key: spkiKey, format: "der", type: "spki" },
      Buffer.from(signature, "base64"),
    );
    if (!isValid) {
      return { ok: false, reason: "Invalid signature" };
    }

    // Replay protection: the signed timestamp must be within the allowed skew.
    // A non-numeric timestamp parses to NaN, and NaN compares false against any
    // threshold, so it has to be rejected explicitly.
    const eventTimeMs = Number.parseInt(timestamp, 10) * 1000;
    if (!Number.isFinite(eventTimeMs)) {
      return { ok: false, reason: "Invalid timestamp" };
    }
    if (Math.abs(Date.now() - eventTimeMs) > TELNYX_MAX_TIMESTAMP_SKEW_MS) {
      return { ok: false, reason: "Timestamp too old" };
    }

    return { ok: true };
  } catch (err) {
    return {
      ok: false,
      reason: `Verification error: ${err instanceof Error ? err.message : String(err)}`,
    };
  }
}
/**
 * Process one verified Telnyx webhook event and, when it is an inbound
 * SMS/MMS from an allowed sender, dispatch it to the routed agent.
 *
 * Steps, in order:
 *  1. Ignore anything that is not an inbound `message.received` event with a sender.
 *  2. Build the message body (text, or a placeholder for media-only MMS).
 *  3. Apply the account's DM policy (disabled / pairing / allow-list / open),
 *     sending a pairing reply when a new pairing request is created.
 *  4. Drop control commands from senders that are not command-authorized.
 *  5. Download the first media attachment (best effort).
 *  6. Build the inbound context, record session metadata, and dispatch the reply.
 *
 * @param event - Parsed webhook event body.
 * @param target - Webhook target (account, config, runtime, core) selected by the caller.
 */
async function processInboundMessage(
  event: TelnyxSmsWebhookEvent,
  target: WebhookTarget,
): Promise<void> {
  const { account, config, runtime, core, statusSink } = target;
  const data = event.data;
  if (!data) {
    return;
  }

  // Only process inbound messages
  if (data.event_type !== "message.received") {
    return;
  }

  const payload = data.payload;
  if (!payload) {
    return;
  }

  if (payload.direction !== "inbound") {
    return;
  }

  const fromPhone = payload.from?.phone_number?.trim();
  if (!fromPhone) {
    logVerbose(core, runtime, "skip message with no sender phone");
    return;
  }

  const messageText = (payload.text ?? "").trim();
  const mediaAttachments = payload.media ?? [];
  const hasMedia = mediaAttachments.length > 0;
  // A media-only MMS has no text; give it a non-empty placeholder body so it is
  // not discarded before the attachment handling below gets a chance to run.
  const mediaOnlyPlaceholder = "<media:attachment>";
  const rawBody = messageText || (hasMedia ? mediaOnlyPlaceholder : "");
  if (!rawBody) {
    return;
  }

  const normalizedPhone = normalizeE164(fromPhone);

  // DM policy check
  const dmPolicy = account.config.dm?.policy ?? "pairing";
  const configAllowFrom = (account.config.dm?.allowFrom ?? []).map((v) => String(v));

  const shouldComputeAuth = core.channel.commands.shouldComputeCommandAuthorized(rawBody, config);
  // The pairing store is only consulted when its contents can affect the outcome.
  const storeAllowFrom =
    dmPolicy !== "open" || shouldComputeAuth
      ? await core.channel.pairing.readAllowFromStore("telnyx-sms").catch(() => [])
      : [];
  const effectiveAllowFrom = [...configAllowFrom, ...storeAllowFrom];

  const senderAllowed = isSenderAllowed(normalizedPhone, effectiveAllowFrom);
  const useAccessGroups = config.commands?.useAccessGroups !== false;
  const commandAuthorized = shouldComputeAuth
    ? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
        useAccessGroups,
        authorizers: [{ configured: effectiveAllowFrom.length > 0, allowed: senderAllowed }],
      })
    : undefined;

  if (dmPolicy === "disabled" || account.config.dm?.enabled === false) {
    logVerbose(core, runtime, `Blocked Telnyx SMS from ${normalizedPhone} (dmPolicy=disabled)`);
    return;
  }

  if (dmPolicy !== "open" && !senderAllowed) {
    if (dmPolicy === "pairing") {
      const { code, created } = await core.channel.pairing.upsertPairingRequest({
        channel: "telnyx-sms",
        id: normalizedPhone,
        meta: { name: undefined, phone: normalizedPhone },
      });
      // Only reply when the request is new, so repeat messages do not re-send the code.
      if (created) {
        logVerbose(core, runtime, `telnyx-sms pairing request sender=${normalizedPhone}`);
        try {
          await sendTelnyxSms({
            account,
            to: normalizedPhone,
            text: core.channel.pairing.buildPairingReply({
              channel: "telnyx-sms",
              idLine: `Your phone number: ${normalizedPhone}`,
              code,
            }),
          });
          statusSink?.({ lastOutboundAt: Date.now() });
        } catch (err) {
          logVerbose(
            core,
            runtime,
            `pairing reply failed for ${normalizedPhone}: ${String(err)}`,
          );
        }
      }
    } else {
      logVerbose(
        core,
        runtime,
        `Blocked unauthorized Telnyx SMS sender ${normalizedPhone} (dmPolicy=${dmPolicy})`,
      );
    }
    return;
  }

  if (
    core.channel.commands.isControlCommandMessage(rawBody, config) &&
    commandAuthorized !== true
  ) {
    logVerbose(core, runtime, `telnyx-sms: drop control command from ${normalizedPhone}`);
    return;
  }

  const route = core.channel.routing.resolveAgentRoute({
    cfg: config,
    channel: "telnyx-sms",
    accountId: account.accountId,
    peer: {
      kind: "direct",
      id: normalizedPhone,
    },
  });

  // Handle media attachments (only the first attachment is downloaded)
  let mediaPath: string | undefined;
  let mediaType: string | undefined;
  const firstAttachment = mediaAttachments[0];
  if (firstAttachment?.url) {
    try {
      const maxBytes = (account.config.mediaMaxMb ?? 20) * 1024 * 1024;
      const loaded = await core.channel.media.fetchRemoteMedia(firstAttachment.url, { maxBytes });
      const saved = await core.channel.media.saveMediaBuffer(
        loaded.buffer,
        loaded.contentType ?? firstAttachment.content_type,
        "inbound",
        maxBytes,
      );
      mediaPath = saved.path;
      mediaType = saved.contentType;
    } catch (err) {
      // Best effort: the message is still delivered without the attachment.
      runtime.error?.(`telnyx-sms: failed downloading media: ${String(err)}`);
    }
  }

  const storePath = core.channel.session.resolveStorePath(config.session?.store, {
    agentId: route.agentId,
  });
  const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(config);
  const previousTimestamp = core.channel.session.readSessionUpdatedAt({
    storePath,
    sessionKey: route.sessionKey,
  });
  const body = core.channel.reply.formatAgentEnvelope({
    channel: "Telnyx SMS",
    from: normalizedPhone,
    timestamp: payload.received_at ? Date.parse(payload.received_at) : undefined,
    previousTimestamp,
    envelope: envelopeOptions,
    body: rawBody,
  });

  const ctxPayload = core.channel.reply.finalizeInboundContext({
    Body: body,
    RawBody: rawBody,
    CommandBody: rawBody,
    From: `telnyx-sms:${normalizedPhone}`,
    To: `telnyx-sms:${account.phoneNumber ?? ""}`,
    SessionKey: route.sessionKey,
    AccountId: route.accountId,
    ChatType: "direct",
    ConversationLabel: normalizedPhone,
    SenderName: undefined,
    SenderId: normalizedPhone,
    CommandAuthorized: commandAuthorized,
    Provider: "telnyx-sms",
    Surface: "telnyx-sms",
    MessageSid: payload.id,
    MessageSidFull: payload.id,
    MediaPath: mediaPath,
    MediaType: mediaType,
    MediaUrl: mediaPath,
    OriginatingChannel: "telnyx-sms",
    OriginatingTo: `telnyx-sms:${account.phoneNumber ?? ""}`,
  });

  // Fire-and-forget: a session metadata failure must not block the reply.
  void core.channel.session
    .recordSessionMetaFromInbound({
      storePath,
      sessionKey: ctxPayload.SessionKey ?? route.sessionKey,
      ctx: ctxPayload,
    })
    .catch((err) => {
      runtime.error?.(`telnyx-sms: failed updating session meta: ${String(err)}`);
    });

  const { onModelSelected, ...prefixOptions } = createReplyPrefixOptions({
    cfg: config,
    agentId: route.agentId,
    channel: "telnyx-sms",
    accountId: route.accountId,
  });

  await core.channel.reply.dispatchReplyWithBufferedBlockDispatcher({
    ctx: ctxPayload,
    cfg: config,
    dispatcherOptions: {
      ...prefixOptions,
      deliver: async (replyPayload) => {
        await deliverTelnyxSmsReply({
          payload: replyPayload,
          account,
          to: normalizedPhone,
          runtime,
          core,
          config,
          statusSink,
        });
      },
      onError: (err, info) => {
        runtime.error?.(
          `[${account.accountId}] Telnyx SMS ${info.kind} reply failed: ${String(err)}`,
        );
      },
    },
    replyOptions: {
      onModelSelected,
    },
  });
}
/**
 * Start receiving Telnyx SMS webhooks for one account.
 *
 * Registers the account as a webhook target under its resolved webhook path,
 * using the shared plugin runtime as `core`.
 *
 * @param options - Account, config, runtime, optional webhook path and status sink.
 * @returns A function that unregisters the webhook target again.
 */
export function monitorTelnyxSms(options: TelnyxSmsMonitorOptions): () => void {
  const { account, config, runtime, statusSink } = options;
  // The runtime lookup comes first so an uninitialized plugin fails before any registration.
  return registerWebhookTarget({
    account,
    config,
    runtime,
    core: getTelnyxSmsRuntime(),
    path: resolveWebhookPath(options.webhookPath),
    statusSink,
  });
}
// Module-level holder for the plugin runtime; null until the setter is called.
let activeRuntime: PluginRuntime | null = null;

/** Store the plugin runtime so later `getTelnyxSmsRuntime()` calls can return it. */
export function setTelnyxSmsRuntime(next: PluginRuntime) {
  activeRuntime = next;
}

/**
 * Return the runtime previously stored with `setTelnyxSmsRuntime`.
 *
 * @throws Error when no runtime has been stored yet.
 */
export function getTelnyxSmsRuntime(): PluginRuntime {
  if (activeRuntime) {
    return activeRuntime;
  }
  throw new Error("Telnyx SMS runtime not initialized");
}
*/ +export type TelnyxSmsSendResponse = { + data?: { + id?: string; + record_type?: string; + direction?: string; + from?: { phone_number?: string }; + to?: Array<{ phone_number?: string; status?: string }>; + text?: string; + media?: TelnyxSmsMedia[]; + type?: string; + parts?: number; + }; +}; diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index 04f50218f..656232d08 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -5,6 +5,15 @@ "label": "Provider", "help": "Use twilio, telnyx, or mock for dev/no-network." }, + "defaultAgentId": { + "label": "Default Agent ID", + "help": "Fallback agent for unmapped phone numbers (defaults to 'main')" + }, + "numbers": { + "label": "Phone Number Routing", + "help": "Map phone numbers to agents (E.164 format)", + "advanced": true + }, "fromNumber": { "label": "From Number", "placeholder": "+15550001234" @@ -41,6 +50,26 @@ "label": "Twilio Auth Token", "sensitive": true }, + "deepgram.apiKey": { + "label": "Deepgram API Key", + "sensitive": true + }, + "deepgram.telephonyProvider": { + "label": "Telephony Provider", + "advanced": true + }, + "deepgram.stt.model": { + "label": "Deepgram STT Model", + "advanced": true + }, + "deepgram.tts.model": { + "label": "Deepgram TTS Model", + "advanced": true + }, + "deepgram.language": { + "label": "Deepgram Language", + "advanced": true + }, "outbound.defaultMode": { "label": "Default Call Mode" }, @@ -168,7 +197,7 @@ }, "provider": { "type": "string", - "enum": ["telnyx", "twilio", "plivo", "mock"] + "enum": ["telnyx", "twilio", "plivo", "deepgram", "mock"] }, "telnyx": { "type": "object", @@ -209,6 +238,65 @@ } } }, + "deepgram": { + "type": "object", + "additionalProperties": false, + "properties": { + "apiKey": { "type": "string" }, + "telephonyProvider": { "type": "string", "enum": ["twilio"] }, + "stt": { + "type": "object", + "additionalProperties": false, + "properties": { 
+ "model": { "type": "string" } + } + }, + "tts": { + "type": "object", + "additionalProperties": false, + "properties": { + "model": { "type": "string" } + } + }, + "language": { "type": "string" }, + "latency": { + "type": "object", + "additionalProperties": false, + "properties": { + "fillerThresholdMs": { "type": "integer", "minimum": 0 }, + "fillerPhrases": { "type": "array", "items": { "type": "string" } } + } + }, + "fallback": { + "type": "object", + "additionalProperties": false, + "properties": { + "openclawTimeoutMs": { "type": "integer", "minimum": 1 }, + "cannedResponses": { "type": "array", "items": { "type": "string" } }, + "maxRetries": { "type": "integer", "minimum": 0 } + } + } + } + }, + "defaultAgentId": { + "type": "string", + "default": "main" + }, + "numbers": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "agentId": { "type": "string" }, + "direction": { + "type": "string", + "enum": ["inbound", "outbound", "both"], + "default": "both" + } + }, + "required": ["agentId"] + } + }, "fromNumber": { "type": "string", "pattern": "^\\+[1-9]\\d{1,14}$" @@ -547,6 +635,10 @@ "responseModel": { "type": "string" }, + "timezone": { + "type": "string", + "default": "UTC" + }, "responseSystemPrompt": { "type": "string" }, diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index cfe82b425..7592cfc2d 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -62,6 +62,48 @@ export const PlivoConfigSchema = z .strict(); export type PlivoConfig = z.infer; +// ----------------------------------------------------------------------------- +// Deepgram-Specific Configuration +// ----------------------------------------------------------------------------- + +export const DeepgramLatencyConfigSchema = z + .object({ + fillerThresholdMs: z.number().int().nonnegative().default(1500), + fillerPhrases: z.array(z.string()).default([]), + }) + .strict() + 
.default({ fillerThresholdMs: 1500, fillerPhrases: [] }); +export type DeepgramLatencyConfig = z.infer; + +export const DeepgramFallbackConfigSchema = z + .object({ + openclawTimeoutMs: z.number().int().positive().default(5000), + cannedResponses: z.array(z.string()).default([]), + maxRetries: z.number().int().nonnegative().default(2), + }) + .strict() + .default({ openclawTimeoutMs: 5000, cannedResponses: [], maxRetries: 2 }); +export type DeepgramFallbackConfig = z.infer; + +export const DeepgramConfigSchema = z + .object({ + apiKey: z.string().min(1).optional(), + telephonyProvider: z.enum(["twilio"]).default("twilio"), + stt: z + .object({ model: z.string().default("nova-3") }) + .strict() + .default({ model: "nova-3" }), + tts: z + .object({ model: z.string().default("aura-2-thalia-en") }) + .strict() + .default({ model: "aura-2-thalia-en" }), + language: z.string().default("en"), + latency: DeepgramLatencyConfigSchema, + fallback: DeepgramFallbackConfigSchema, + }) + .strict(); +export type DeepgramConfig = z.infer; + // ----------------------------------------------------------------------------- // STT/TTS Configuration // ----------------------------------------------------------------------------- @@ -242,6 +284,18 @@ export const VoiceCallWebhookSecurityConfigSchema = z .default({ allowedHosts: [], trustForwardingHeaders: false, trustedProxyIPs: [] }); export type WebhookSecurityConfig = z.infer; +// ----------------------------------------------------------------------------- +// Number Routing Configuration +// ----------------------------------------------------------------------------- + +export const NumberRoutingSchema = z + .object({ + agentId: z.string().min(1), + direction: z.enum(["inbound", "outbound", "both"]).default("both"), + }) + .strict(); +export type NumberRouting = z.infer; + // ----------------------------------------------------------------------------- // Outbound Call Configuration // 
----------------------------------------------------------------------------- @@ -306,8 +360,8 @@ export const VoiceCallConfigSchema = z /** Enable voice call functionality */ enabled: z.boolean().default(false), - /** Active provider (telnyx, twilio, plivo, or mock) */ - provider: z.enum(["telnyx", "twilio", "plivo", "mock"]).optional(), + /** Active provider (telnyx, twilio, plivo, deepgram, or mock) */ + provider: z.enum(["telnyx", "twilio", "plivo", "deepgram", "mock"]).optional(), /** Telnyx-specific configuration */ telnyx: TelnyxConfigSchema.optional(), @@ -318,6 +372,15 @@ export const VoiceCallConfigSchema = z /** Plivo-specific configuration */ plivo: PlivoConfigSchema.optional(), + /** Deepgram-specific configuration */ + deepgram: DeepgramConfigSchema.optional(), + + /** Default agent ID for unmapped numbers (defaults to "main") */ + defaultAgentId: z.string().default("main"), + + /** Map of E.164 phone numbers to agent routing config */ + numbers: z.record(z.string().regex(/^\+[1-9]\d{1,14}$/), NumberRoutingSchema).default({}), + /** Phone number to call from (E.164) */ fromNumber: E164Schema.optional(), @@ -384,6 +447,9 @@ export const VoiceCallConfigSchema = z /** Model for generating voice responses (e.g., "anthropic/claude-sonnet-4", "openai/gpt-4o") */ responseModel: z.string().default("openai/gpt-4o-mini"), + /** IANA timezone for voice responses (e.g. "America/New_York"). Times are presented in this zone. 
*/ + timezone: z.string().default("UTC"), + /** System prompt for voice responses */ responseSystemPrompt: z.string().optional(), @@ -394,6 +460,16 @@ export const VoiceCallConfigSchema = z export type VoiceCallConfig = z.infer; +// ----------------------------------------------------------------------------- +// Caller History Configuration +// ----------------------------------------------------------------------------- + +export type CallerHistoryConfig = { + lookbackDays: number; + maxSessions: number; + summaryTokens: number; +}; + // ----------------------------------------------------------------------------- // Configuration Helpers // ----------------------------------------------------------------------------- @@ -427,6 +503,21 @@ export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig resolved.plivo.authToken = resolved.plivo.authToken ?? process.env.PLIVO_AUTH_TOKEN; } + // Deepgram (hybrid mode: needs both Deepgram + Twilio credentials) + if (resolved.provider === "deepgram") { + resolved.deepgram = resolved.deepgram ?? ({} as any); + resolved.deepgram!.apiKey = resolved.deepgram!.apiKey ?? process.env.DEEPGRAM_API_KEY; + // Deepgram hybrid mode uses Twilio for telephony + resolved.twilio = resolved.twilio ?? {}; + resolved.twilio.accountSid = resolved.twilio.accountSid ?? process.env.TWILIO_ACCOUNT_SID; + resolved.twilio.authToken = resolved.twilio.authToken ?? process.env.TWILIO_AUTH_TOKEN; + } + + // Public URL from environment + if (!resolved.publicUrl && process.env.PUBLIC_URL) { + resolved.publicUrl = process.env.PUBLIC_URL; + } + // Tunnel Config resolved.tunnel = resolved.tunnel ?? 
/**
 * Pick the agent that should handle a call on the given phone number.
 *
 * The number is looked up as an exact key in `config.numbers`. The mapped agent
 * is used only when the entry's direction is `"both"` or equals `direction`;
 * in every other case `config.defaultAgentId` is returned.
 *
 * @param config - Voice call configuration holding `numbers` and `defaultAgentId`.
 * @param phoneNumber - Number to look up; may be undefined.
 * @param direction - Direction of the call being routed.
 * @returns The resolved agent ID.
 */
export function resolveAgentForNumber(
  config: VoiceCallConfig,
  phoneNumber: string | undefined,
  direction: "inbound" | "outbound",
): string {
  const routing = phoneNumber ? config.numbers?.[phoneNumber] : undefined;
  if (!routing) {
    return config.defaultAgentId;
  }
  const directionMatches = routing.direction === direction || routing.direction === "both";
  return directionMatches ? routing.agentId : config.defaultAgentId;
}
/**
 * Reverse lookup: find a configured phone number that belongs to an agent.
 *
 * Entries of `config.numbers` are scanned in their enumeration order and the
 * first number whose entry names `agentId` and whose direction is `"both"` or
 * equals `direction` is returned.
 *
 * @param config - Voice call configuration holding the `numbers` map.
 * @param agentId - Agent to find a number for.
 * @param direction - Direction the number must support.
 * @returns The matching phone number, or `undefined` when none is configured.
 */
export function resolveNumberForAgent(
  config: VoiceCallConfig,
  agentId: string,
  direction: "outbound" | "inbound",
): string | undefined {
  const match = Object.entries(config.numbers ?? {}).find(
    ([, routing]) =>
      routing.agentId === agentId &&
      (routing.direction === "both" || routing.direction === direction),
  );
  return match?.[0];
}
+ */ +export interface DeepgramMediaBridgeConfig { + /** DeepgramProvider instance for creating sessions */ + deepgramProvider: DeepgramProvider; + /** CallManager for event processing and speaking */ + manager: CallManager; + /** Validate whether to accept a media stream for the given call ID */ + shouldAcceptStream?: (params: { callId: string; streamSid: string; token?: string }) => boolean; + /** Callback when stream connects */ + onConnect?: (callId: string, streamSid: string) => void; + /** Callback when stream disconnects */ + onDisconnect?: (callId: string) => void; + /** Gateway URL for LLM proxy (e.g., "http://127.0.0.1:18789") */ + gatewayUrl?: string; + /** Gateway auth token */ + gatewayToken?: string; + /** Public URL of webhook server (for think.endpoint) */ + publicUrl?: string; + /** Core config for agent identity resolution */ + coreConfig?: CoreConfig; + /** Voice call config for number-to-agent routing */ + voiceCallConfig?: VoiceCallConfig; +} + +/** + * Active bridge session linking a Twilio stream to a Deepgram agent. + */ +interface BridgeSession { + callId: string; + streamSid: string; + ws: WebSocket; + client: DeepgramVoiceAgentClient; +} + +/** + * Bridges Twilio media streams to Deepgram Voice Agent sessions. + * + * Replaces MediaStreamHandler when in Deepgram hybrid mode. Instead of + * forwarding audio to OpenAI Realtime STT, it forwards to Deepgram's + * Voice Agent which handles STT, LLM, and TTS internally. + */ +export class DeepgramMediaBridge { + private wss: WebSocketServer | null = null; + private sessions = new Map(); + private config: DeepgramMediaBridgeConfig; + + constructor(config: DeepgramMediaBridgeConfig) { + this.config = config; + } + + /** + * Handle WebSocket upgrade for media stream connections. 
+ */ + handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void { + if (!this.wss) { + this.wss = new WebSocketServer({ noServer: true }); + this.wss.on("connection", (ws, req) => { + void this.handleConnection(ws, req); + }); + } + + this.wss.handleUpgrade(request, socket, head, (ws) => { + this.wss?.emit("connection", ws, request); + }); + } + + /** + * Handle new WebSocket connection from Twilio. + */ + private async handleConnection(ws: WebSocket, request: IncomingMessage): Promise { + let session: BridgeSession | null = null; + let mediaCount = 0; + // Try URL first (fallback), but Twilio sends token via start.customParameters + let streamToken = this.getStreamToken(request); + + ws.on("message", async (data: Buffer) => { + try { + const message = JSON.parse(data.toString()) as TwilioMediaMessage; + + switch (message.event) { + case "connected": + console.log("[DeepgramBridge] Twilio connected"); + break; + + case "start": + // Extract token from Twilio's customParameters (preferred over URL query) + if (message.start?.customParameters?.token) { + streamToken = message.start.customParameters.token; + console.log( + `[DeepgramBridge] Got token from customParameters: ${streamToken.substring(0, 8)}...`, + ); + } + session = await this.handleStart(ws, message, streamToken); + break; + + case "media": + if (session && message.media?.payload) { + const audioBuffer = Buffer.from(message.media.payload, "base64"); + if (!mediaCount) { + console.log(`[DeepgramBridge] First media packet: ${audioBuffer.length}B`); + } + mediaCount++; + this.config.deepgramProvider.sendAudio(session.callId, audioBuffer); + } + break; + + case "stop": + if (session) { + this.handleStop(session); + session = null; + } + break; + } + } catch (error) { + console.error("[DeepgramBridge] Error processing message:", error); + } + }); + + ws.on("close", () => { + if (session) { + this.handleStop(session); + } + }); + + ws.on("error", (error) => { + 
console.error("[DeepgramBridge] WebSocket error:", error); + }); + } + + /** + * Handle stream start event — create Deepgram session and wire bidirectional audio. + */ + private async handleStart( + ws: WebSocket, + message: TwilioMediaMessage, + streamToken?: string, + ): Promise { + const streamSid = message.streamSid || ""; + const callSid = message.start?.callSid || ""; + + console.log(`[DeepgramBridge] Stream started: ${streamSid} (call: ${callSid})`); + + if (!callSid) { + console.warn("[DeepgramBridge] Missing callSid; closing stream"); + ws.close(1008, "Missing callSid"); + return null; + } + + if ( + this.config.shouldAcceptStream && + !this.config.shouldAcceptStream({ callId: callSid, streamSid, token: streamToken }) + ) { + console.warn(`[DeepgramBridge] Rejecting stream for unknown call: ${callSid}`); + ws.close(1008, "Unknown call"); + return null; + } + + try { + // Build per-call overrides for gateway integration + const overrides = await this.buildSessionOverrides(callSid, message); + const client = await this.config.deepgramProvider.createSession(callSid, callSid, overrides); + + const session: BridgeSession = { callId: callSid, streamSid, ws, client }; + this.sessions.set(streamSid, session); + + // Bridge: Deepgram audio → Twilio + let audioChunks = 0; + let audioBytes = 0; + client.on("audio", (audio: Buffer) => { + audioChunks++; + audioBytes += audio.length; + if (audioChunks === 1 || audioChunks % 50 === 0) { + console.log( + `[DeepgramBridge] Audio chunk #${audioChunks}: ${audio.length}B (total: ${audioBytes}B) wsState=${ws.readyState}`, + ); + } + if (ws.readyState === WebSocket.OPEN) { + ws.send( + JSON.stringify({ + event: "media", + streamSid, + media: { payload: audio.toString("base64") }, + }), + ); + } + }); + + // Bridge: Deepgram conversation text → manager events + client.on("conversationText", (role: string, content: string) => { + console.log( + `[DeepgramBridge] ConversationText: role=${role} content="${content.substring(0, 
100)}"`, + ); + const event: NormalizedEvent = { + id: `dg-bridge-${crypto.randomUUID()}`, + callId: callSid, + providerCallId: callSid, + timestamp: Date.now(), + ...(role === "user" + ? { type: "call.speech" as const, transcript: content, isFinal: true, confidence: 1.0 } + : { type: "call.speaking" as const, text: content }), + }; + this.config.manager.processEvent(event); + }); + + // Barge-in: when user starts speaking, clear Twilio's audio buffer + client.on("userStartedSpeaking", () => { + console.log(`[DeepgramBridge] User started speaking (barge-in)`); + if (ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ event: "clear", streamSid })); + } + }); + + client.on("agentStartedSpeaking", (latency) => { + console.log( + `[DeepgramBridge] Agent started speaking (total=${latency?.total}ms tts=${latency?.tts}ms ttt=${latency?.ttt}ms)`, + ); + }); + + client.on("agentAudioDone", () => { + console.log( + `[DeepgramBridge] Agent audio done (sent ${audioChunks} chunks, ${audioBytes}B)`, + ); + }); + + client.on("agentThinking", () => { + console.log(`[DeepgramBridge] Agent thinking...`); + }); + + client.on("injectionRefused", (reason) => { + console.warn(`[DeepgramBridge] Injection refused: ${reason}`); + }); + + client.on("error", (error: Error) => { + console.error(`[DeepgramBridge] Deepgram error for ${callSid}:`, error.message); + }); + + // Notify connection + this.config.onConnect?.(callSid, streamSid); + + // Speak initial greeting via Deepgram agent (not TwilioProvider.playTts) + setTimeout(() => { + const call = this.config.manager.getCallByProviderCallId(callSid); + const initialMessage = + typeof call?.metadata?.initialMessage === "string" + ? 
call.metadata.initialMessage.trim() + : ""; + if (initialMessage && call?.metadata) { + delete call.metadata.initialMessage; + console.log(`[DeepgramBridge] Injecting initial greeting via Deepgram agent`); + client.injectAgentMessage(initialMessage); + } + }, 500); + + return session; + } catch (error) { + console.error(`[DeepgramBridge] Failed to create Deepgram session for ${callSid}:`, error); + ws.close(1011, "Deepgram session creation failed"); + return null; + } + } + + /** + * Build per-call session overrides for gateway integration. + * Routes LLM calls through the gateway's /v1/chat/completions endpoint. + */ + private async buildSessionOverrides( + callSid: string, + _message: TwilioMediaMessage, + ): Promise { + if (!this.config.gatewayUrl || !this.config.gatewayToken || !this.config.publicUrl) { + return undefined; + } + + // Get caller info from manager + const call = this.config.manager.getCallByProviderCallId(callSid); + const callerNumber = call?.from || "unknown"; + const calledNumber = call?.to; + + // Resolve agent ID from number routing config + const agentId = this.config.voiceCallConfig + ? resolveAgentForNumber(this.config.voiceCallConfig, calledNumber, "inbound") + : "main"; + + // Resolve agent identity for system prompt + let agentName = "Assistant"; + try { + if (this.config.coreConfig) { + const deps = await loadCoreAgentDeps(); + const identity = deps.resolveAgentIdentity(this.config.coreConfig, agentId); + if (identity?.name) { + agentName = identity.name; + } + } + } catch (err) { + console.warn("[DeepgramBridge] Failed to resolve agent identity:", err); + } + + console.log( + `[DeepgramBridge] Resolved agent: id=${agentId} name=${agentName} calledNumber=${calledNumber}`, + ); + + // Build system prompt with voice-specific behavioral instructions only. + // Agent identity comes from workspace files (SOUL.md, IDENTITY.md, BOOTSTRAP.md) + // loaded by the Pi agent's normal startup path. 
+ // Format current date/time in the configured timezone + const tz = this.config.voiceCallConfig?.timezone ?? "UTC"; + const now = new Date(); + const localTime = now.toLocaleString("en-US", { + timeZone: tz, + weekday: "long", + year: "numeric", + month: "long", + day: "numeric", + hour: "numeric", + minute: "2-digit", + timeZoneName: "short", + }); + + const systemPrompt = [ + `You are on a phone call. Keep responses brief and conversational (1-2 sentences max).`, + `Speak naturally as if in a real phone conversation.`, + `IMPORTANT: Your responses will be spoken aloud via text-to-speech. Do NOT use any text formatting — no markdown, no bullet points, no asterisks, no numbered lists, no headers. Write plain conversational sentences only.`, + `When you need to use a tool or look something up, ALWAYS say a brief acknowledgment first (e.g. "Let me check that for you" or "One moment") so the caller isn't waiting in silence.`, + `Today is ${localTime}. Always present times in this timezone.`, + `The caller's phone number is ${callerNumber}.`, + ].join("\n"); + + // Per-call session key — each call gets a fresh session. + // Agent identity persists via workspace files (SOUL.md, IDENTITY.md), not session history. + return { + systemPrompt, + llmProvider: "open_ai", + llmEndpoint: { + url: `${this.config.publicUrl}/v1/chat/completions`, + headers: { + Authorization: `Bearer ${this.config.gatewayToken}`, + "x-openclaw-session-key": `agent:${agentId}:voice:${callSid}`, + "x-openclaw-agent-id": agentId, + }, + }, + }; + } + + /** + * Handle stream stop event — close Deepgram session and clean up. 
+ */ + private handleStop(session: BridgeSession): void { + console.log(`[DeepgramBridge] Stream stopped: ${session.streamSid}`); + this.config.deepgramProvider.closeSession(session.callId); + this.sessions.delete(session.streamSid); + this.config.onDisconnect?.(session.callId); + } + + private getStreamToken(request: IncomingMessage): string | undefined { + console.log(`[DeepgramBridge] getStreamToken: url=${request.url} host=${request.headers.host}`); + if (!request.url || !request.headers.host) { + console.log(`[DeepgramBridge] getStreamToken: missing url or host`); + return undefined; + } + try { + const url = new URL(request.url, `http://${request.headers.host}`); + const token = url.searchParams.get("token") ?? undefined; + console.log( + `[DeepgramBridge] getStreamToken: parsed token=${token ? token.substring(0, 8) + "..." : "NONE"} fullUrl=${url.toString()}`, + ); + return token; + } catch (err) { + console.log(`[DeepgramBridge] getStreamToken: URL parse error: ${err}`); + return undefined; + } + } + + /** + * Close all active bridge sessions. + */ + closeAll(): void { + for (const session of this.sessions.values()) { + this.config.deepgramProvider.closeSession(session.callId); + session.ws.close(); + } + this.sessions.clear(); + } +} + +/** + * Twilio Media Stream message format (same protocol as MediaStreamHandler). 
+ */ +interface TwilioMediaMessage { + event: "connected" | "start" | "media" | "stop" | "mark" | "clear"; + sequenceNumber?: string; + streamSid?: string; + start?: { + streamSid: string; + accountSid: string; + callSid: string; + tracks: string[]; + customParameters?: Record; + mediaFormat: { + encoding: string; + sampleRate: number; + channels: number; + }; + }; + media?: { + track?: string; + chunk?: string; + timestamp?: string; + payload?: string; + }; + mark?: { + name: string; + }; +} diff --git a/extensions/voice-call/src/manager.ts b/extensions/voice-call/src/manager.ts index 0cfc9158e..c3af965cd 100644 --- a/extensions/voice-call/src/manager.ts +++ b/extensions/voice-call/src/manager.ts @@ -6,6 +6,7 @@ import path from "node:path"; import type { CallMode, VoiceCallConfig } from "./config.js"; import type { VoiceCallProvider } from "./providers/base.js"; import { isAllowlistedCaller, normalizePhoneNumber } from "./allowlist.js"; +import { resolveNumberForAgent } from "./config.js"; import { type CallId, type CallRecord, @@ -102,6 +103,7 @@ export class CallManager { typeof options === "string" ? { message: options } : (options ?? {}); const initialMessage = opts.message; const mode = opts.mode ?? this.config.outbound.defaultMode; + const agentId = opts.agentId; if (!this.provider) { return { callId: "", success: false, error: "Provider not initialized" }; } @@ -125,8 +127,14 @@ export class CallManager { } const callId = crypto.randomUUID(); + // Resolve from number: agent-specific number > config.fromNumber > mock fallback + const agentNumber = agentId + ? resolveNumberForAgent(this.config, agentId, "outbound") + : undefined; const from = - this.config.fromNumber || (this.provider?.name === "mock" ? "+15550000000" : undefined); + agentNumber || + this.config.fromNumber || + (this.provider?.name === "mock" ? 
"+15550000000" : undefined); if (!from) { return { callId: "", success: false, error: "fromNumber not configured" }; } @@ -145,6 +153,7 @@ export class CallManager { processedEventIds: [], metadata: { ...(initialMessage && { initialMessage }), + ...(agentId && { agentId }), mode, }, }; diff --git a/extensions/voice-call/src/manager/outbound.ts b/extensions/voice-call/src/manager/outbound.ts index 2f810fec6..ff0980833 100644 --- a/extensions/voice-call/src/manager/outbound.ts +++ b/extensions/voice-call/src/manager/outbound.ts @@ -1,6 +1,7 @@ import crypto from "node:crypto"; import type { CallMode } from "../config.js"; import type { CallManagerContext } from "./context.js"; +import { resolveNumberForAgent } from "../config.js"; import { TerminalStates, type CallId, @@ -29,6 +30,7 @@ export async function initiateCall( typeof options === "string" ? { message: options } : (options ?? {}); const initialMessage = opts.message; const mode = opts.mode ?? ctx.config.outbound.defaultMode; + const agentId = opts.agentId; if (!ctx.provider) { return { callId: "", success: false, error: "Provider not initialized" }; @@ -46,8 +48,12 @@ export async function initiateCall( } const callId = crypto.randomUUID(); + // Resolve from number: agent-specific number > config.fromNumber > mock fallback + const agentNumber = agentId ? resolveNumberForAgent(ctx.config, agentId, "outbound") : undefined; const from = - ctx.config.fromNumber || (ctx.provider?.name === "mock" ? "+15550000000" : undefined); + agentNumber || + ctx.config.fromNumber || + (ctx.provider?.name === "mock" ? 
"+15550000000" : undefined); if (!from) { return { callId: "", success: false, error: "fromNumber not configured" }; } @@ -65,6 +71,7 @@ export async function initiateCall( processedEventIds: [], metadata: { ...(initialMessage && { initialMessage }), + ...(agentId && { agentId }), mode, }, }; diff --git a/extensions/voice-call/src/manager/store.test.ts b/extensions/voice-call/src/manager/store.test.ts new file mode 100644 index 000000000..e0222b0d9 --- /dev/null +++ b/extensions/voice-call/src/manager/store.test.ts @@ -0,0 +1,293 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import type { CallerHistoryConfig } from "../config.js"; +import type { CallRecord } from "../types.js"; +import { getCallerHistory, getCallHistoryFromStore, loadActiveCallsFromStore } from "./store.js"; + +function makeTempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), "voice-call-store-test-")); +} + +function writeCallsJsonl(storePath: string, records: CallRecord[]): void { + const logPath = path.join(storePath, "calls.jsonl"); + const lines = records.map((r) => JSON.stringify(r)).join("\n") + "\n"; + fs.writeFileSync(logPath, lines); +} + +function makeCallRecord(overrides: Partial = {}): CallRecord { + return { + callId: "call-1", + providerCallId: "prov-1", + provider: "twilio", + direction: "inbound", + state: "completed", + from: "+15551234567", + to: "+15550000000", + startedAt: Date.now() - 60_000, + endedAt: Date.now(), + endReason: "completed", + transcript: [], + processedEventIds: [], + ...overrides, + }; +} + +describe("loadActiveCallsFromStore", () => { + let tempDir: string; + + afterEach(() => { + if (tempDir) fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it("returns empty maps when no file exists", () => { + tempDir = makeTempDir(); + const result = loadActiveCallsFromStore(tempDir); + + expect(result.activeCalls.size).toBe(0); + 
expect(result.providerCallIdMap.size).toBe(0); + expect(result.processedEventIds.size).toBe(0); + }); + + it("loads only non-terminal calls", () => { + tempDir = makeTempDir(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ callId: "active-1", state: "answered" }), + makeCallRecord({ callId: "done-1", state: "completed" }), + makeCallRecord({ callId: "active-2", state: "ringing", providerCallId: "prov-2" }), + ]); + + const result = loadActiveCallsFromStore(tempDir); + + expect(result.activeCalls.size).toBe(2); + expect(result.activeCalls.has("active-1")).toBe(true); + expect(result.activeCalls.has("active-2")).toBe(true); + expect(result.activeCalls.has("done-1")).toBe(false); + }); + + it("maps providerCallId to callId", () => { + tempDir = makeTempDir(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ callId: "call-a", providerCallId: "prov-a", state: "active" }), + ]); + + const result = loadActiveCallsFromStore(tempDir); + + expect(result.providerCallIdMap.get("prov-a")).toBe("call-a"); + }); + + it("skips invalid JSON lines", () => { + tempDir = makeTempDir(); + const logPath = path.join(tempDir, "calls.jsonl"); + fs.writeFileSync( + logPath, + `${JSON.stringify(makeCallRecord({ callId: "good", state: "active" }))}\n{INVALID_JSON}\n`, + ); + + const result = loadActiveCallsFromStore(tempDir); + + expect(result.activeCalls.size).toBe(1); + expect(result.activeCalls.has("good")).toBe(true); + }); +}); + +describe("getCallHistoryFromStore", () => { + let tempDir: string; + + afterEach(() => { + if (tempDir) fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it("returns empty array when no file exists", async () => { + tempDir = makeTempDir(); + const result = await getCallHistoryFromStore(tempDir); + + expect(result).toEqual([]); + }); + + it("returns records up to limit", async () => { + tempDir = makeTempDir(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ callId: "c1" }), + makeCallRecord({ callId: "c2" }), + makeCallRecord({ callId: "c3" }), 
+ ]); + + const result = await getCallHistoryFromStore(tempDir, 2); + + expect(result).toHaveLength(2); + expect(result[0]!.callId).toBe("c2"); + expect(result[1]!.callId).toBe("c3"); + }); +}); + +describe("getCallerHistory", () => { + let tempDir: string; + + const defaultOpts: CallerHistoryConfig = { + lookbackDays: 30, + maxSessions: 5, + summaryTokens: 512, + }; + + afterEach(() => { + if (tempDir) fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it("returns empty array when no store file exists", async () => { + tempDir = makeTempDir(); + const result = await getCallerHistory(tempDir, "+15551234567", defaultOpts); + + expect(result).toEqual([]); + }); + + it("filters by caller phone number", async () => { + tempDir = makeTempDir(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ callId: "c1", from: "+15551234567", state: "completed" }), + makeCallRecord({ callId: "c2", from: "+15559999999", state: "completed" }), + makeCallRecord({ callId: "c3", from: "+15551234567", state: "completed" }), + ]); + + const result = await getCallerHistory(tempDir, "+15551234567", defaultOpts); + + expect(result).toHaveLength(2); + const ids = result.map((r) => r.callId); + expect(ids).toContain("c1"); + expect(ids).toContain("c3"); + }); + + it("only returns terminal (completed) calls", async () => { + tempDir = makeTempDir(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ callId: "c1", from: "+15551234567", state: "completed" }), + makeCallRecord({ callId: "c2", from: "+15551234567", state: "active" }), + ]); + + const result = await getCallerHistory(tempDir, "+15551234567", defaultOpts); + + expect(result).toHaveLength(1); + expect(result[0]!.callId).toBe("c1"); + }); + + it("respects lookbackDays window", async () => { + tempDir = makeTempDir(); + const now = Date.now(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ + callId: "recent", + from: "+15551234567", + state: "completed", + startedAt: now - 1000 * 60 * 60, // 1 hour ago + }), + makeCallRecord({ + 
callId: "old", + from: "+15551234567", + state: "completed", + startedAt: now - 1000 * 60 * 60 * 24 * 60, // 60 days ago + }), + ]); + + const result = await getCallerHistory(tempDir, "+15551234567", { + ...defaultOpts, + lookbackDays: 30, + }); + + expect(result).toHaveLength(1); + expect(result[0]!.callId).toBe("recent"); + }); + + it("caps results at maxSessions", async () => { + tempDir = makeTempDir(); + const now = Date.now(); + writeCallsJsonl( + tempDir, + Array.from({ length: 10 }, (_, i) => + makeCallRecord({ + callId: `c${i}`, + from: "+15551234567", + state: "completed", + startedAt: now - i * 1000, + }), + ), + ); + + const result = await getCallerHistory(tempDir, "+15551234567", { + ...defaultOpts, + maxSessions: 3, + }); + + expect(result).toHaveLength(3); + }); + + it("returns most recent calls first", async () => { + tempDir = makeTempDir(); + const now = Date.now(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ + callId: "older", + from: "+15551234567", + state: "completed", + startedAt: now - 10_000, + }), + makeCallRecord({ + callId: "newer", + from: "+15551234567", + state: "completed", + startedAt: now - 1000, + }), + ]); + + const result = await getCallerHistory(tempDir, "+15551234567", defaultOpts); + + expect(result[0]!.callId).toBe("newer"); + expect(result[1]!.callId).toBe("older"); + }); + + it("builds transcript excerpts", async () => { + tempDir = makeTempDir(); + writeCallsJsonl(tempDir, [ + makeCallRecord({ + callId: "c1", + from: "+15551234567", + state: "completed", + transcript: [ + { timestamp: 1, speaker: "user", text: "Hello", isFinal: true }, + { timestamp: 2, speaker: "bot", text: "Hi there", isFinal: true }, + { timestamp: 3, speaker: "user", text: "partial...", isFinal: false }, + ], + }), + ]); + + const result = await getCallerHistory(tempDir, "+15551234567", defaultOpts); + + expect(result).toHaveLength(1); + // Only final transcripts should be included + expect(result[0]!.excerpt).toContain("Caller: Hello"); + 
/** Condensed view of one past call, as returned by {@link getCallerHistory}. */
export type CallerHistoryEntry = {
  callId: string;
  startedAt: number;
  endedAt?: number;
  direction: string;
  excerpt: string;
};

const MS_PER_DAY = 24 * 60 * 60 * 1000;

/** Rough characters-per-token ratio used to turn a token budget into a character budget. */
const CHARS_PER_TOKEN = 4;

/** True when a filesystem error means "the file is not there" rather than a real I/O failure. */
function isMissingFileError(error: unknown): boolean {
  if (typeof error !== "object" || error === null || !("code" in error)) {
    return false;
  }
  const code: unknown = error.code;
  return code === "ENOENT" || code === "ENOTDIR";
}

/**
 * Render the final transcript entries of a call as "Caller: …" / "Bot: …"
 * lines, truncated to at most `maxChars` characters plus a "..." suffix.
 *
 * Non-final (interim) transcript entries are skipped. Truncation never cuts
 * a surrogate pair in half, so the excerpt is always well-formed text.
 */
function buildExcerpt(call: CallRecord, maxChars: number): string {
  const full = call.transcript
    .filter((entry) => entry.isFinal)
    .map((entry) => `${entry.speaker === "user" ? "Caller" : "Bot"}: ${entry.text}`)
    .join("\n");
  if (full.length <= maxChars) {
    return full;
  }

  let end = Math.max(0, maxChars);
  if (end > 0) {
    // If the cut lands right after a high surrogate, drop it too; otherwise
    // the excerpt would end in a lone surrogate.
    const lastUnit = full.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      end -= 1;
    }
  }
  return full.slice(0, end) + "...";
}

/**
 * Return recent finished calls from `callerNumber`, most recent first.
 *
 * Reads `calls.jsonl` under `storePath` and keeps records that
 * - have `from` equal to `callerNumber` (exact string match),
 * - are in a terminal state, and
 * - started within the last `opts.lookbackDays` days.
 *
 * When the same `callId` appears on several matching lines, only the last
 * such line is used, so one call can never occupy more than one of the
 * `opts.maxSessions` slots. (NOTE(review): this assumes the store is an
 * append-only log where later lines supersede earlier ones — confirm against
 * `persistCallRecord`.)
 *
 * Lines that are not valid JSON or fail schema validation are skipped.
 *
 * @param storePath - Directory containing `calls.jsonl`.
 * @param callerNumber - Caller phone number to match against `from`.
 * @param opts - Look-back window, session cap, and per-excerpt token budget
 *   (converted to characters at ~4 chars per token).
 * @returns At most `opts.maxSessions` entries; empty when the store file does
 *   not exist.
 * @throws Any read error other than "file not found".
 */
export async function getCallerHistory(
  storePath: string,
  callerNumber: string,
  opts: CallerHistoryConfig,
): Promise<CallerHistoryEntry[]> {
  const logPath = path.join(storePath, "calls.jsonl");

  // Read directly instead of access()+readFile(): one syscall fewer and no
  // window in which the file can disappear between the check and the read.
  let content: string;
  try {
    content = await fsp.readFile(logPath, "utf-8");
  } catch (error) {
    if (isMissingFileError(error)) {
      return [];
    }
    throw error;
  }

  const cutoff = Date.now() - opts.lookbackDays * MS_PER_DAY;
  const maxChars = opts.summaryTokens * CHARS_PER_TOKEN;

  const latestByCallId = new Map<CallRecord["callId"], CallRecord>();
  for (const line of content.split("\n")) {
    if (line.trim() === "") {
      continue;
    }
    try {
      const call = CallRecordSchema.parse(JSON.parse(line));
      const matches =
        call.from === callerNumber && TerminalStates.has(call.state) && call.startedAt >= cutoff;
      if (matches) {
        latestByCallId.set(call.callId, call);
      }
    } catch {
      // Skip invalid lines.
    }
  }

  // A negative cap would make slice() drop items from the end instead of
  // limiting the count, so clamp it.
  const limit = Math.max(0, opts.maxSessions);

  return [...latestByCallId.values()]
    .sort((a, b) => b.startedAt - a.startedAt)
    .slice(0, limit)
    .map((call) => ({
      callId: call.callId,
      startedAt: call.startedAt,
      endedAt: call.endedAt,
      direction: call.direction,
      excerpt: buildExcerpt(call, maxChars),
    }));
}
unknown as DeepgramVoiceAgentClient; +} + +function createConfig(overrides: Partial = {}): DeepgramFallbackConfig { + return { + openclawTimeoutMs: 100, + cannedResponses: ["Could you say that again?", "One moment please..."], + maxRetries: 2, + deepgramFallbackPrompt: "You are a basic assistant.", + exitMessage: "Goodbye.", + ...overrides, + }; +} + +describe("FallbackManager", () => { + let events: FallbackEvent[]; + let hangups: string[]; + + beforeEach(() => { + events = []; + hangups = []; + }); + + function createManager(configOverrides: Partial = {}) { + return new FallbackManager({ + config: createConfig(configOverrides), + onHangup: (callId) => hangups.push(callId), + onFallbackEvent: (event) => events.push(event), + }); + } + + it("returns result on success", async () => { + const manager = createManager(); + const client = createMockClient(); + const handler = vi.fn().mockResolvedValue("ok"); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + const result = await wrapped("test_fn", { arg: "value" }, "fc-1"); + + expect(result).toBe("ok"); + expect(handler).toHaveBeenCalledWith("test_fn", { arg: "value" }, "fc-1"); + expect(events).toHaveLength(0); + }); + + it("tier 1: updates prompt on first failure when fallback prompt is set", async () => { + const manager = createManager(); + const client = createMockClient(); + const handler = vi.fn().mockRejectedValue(new Error("timeout")); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + const result = await wrapped("test_fn", {}, "fc-1"); + + const parsed = JSON.parse(result); + expect(parsed.tier).toBe(1); + expect(parsed.fallback).toBe(true); + expect(client.updatePrompt).toHaveBeenCalledWith("You are a basic assistant."); + expect(events).toHaveLength(1); + expect(events[0]!.tier).toBe(1); + }); + + it("tier 2: uses canned response on second failure", async () => { + const manager = createManager(); + const client = createMockClient(); + const handler = 
vi.fn().mockRejectedValue(new Error("timeout")); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + // First failure → tier 1 + await wrapped("test_fn", {}, "fc-1"); + // Second failure → tier 2 + const result = await wrapped("test_fn", {}, "fc-2"); + + const parsed = JSON.parse(result); + expect(parsed.tier).toBe(2); + expect(client.injectAgentMessage).toHaveBeenCalledWith("Could you say that again?"); + }); + + it("tier 2: cycles canned responses round-robin", async () => { + const manager = createManager(); + const client = createMockClient(); + const handler = vi.fn().mockRejectedValue(new Error("timeout")); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + await wrapped("fn", {}, "fc-1"); // tier 1 + await wrapped("fn", {}, "fc-2"); // tier 2: index 0 + await wrapped("fn", {}, "fc-3"); // tier 4 (maxRetries=2, 3rd failure exceeds) + + // With maxRetries=2, third consecutive failure triggers tier 4 + const calls = (client.injectAgentMessage as ReturnType).mock.calls; + expect(calls.some(([msg]: string[]) => msg === "Could you say that again?")).toBe(true); + }); + + it("tier 3: honest timeout when no fallback prompt or canned responses", async () => { + const manager = createManager({ + deepgramFallbackPrompt: undefined, + cannedResponses: [], + }); + const client = createMockClient(); + const handler = vi.fn().mockRejectedValue(new Error("timeout")); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + const result = await wrapped("test_fn", {}, "fc-1"); + + const parsed = JSON.parse(result); + expect(parsed.tier).toBe(3); + expect(client.injectAgentMessage).toHaveBeenCalledWith( + "I'm having trouble right now, one moment...", + ); + }); + + it("tier 4: graceful exit after maxRetries", async () => { + vi.useFakeTimers(); + const manager = createManager({ maxRetries: 1 }); + const client = createMockClient(); + const handler = vi.fn().mockRejectedValue(new Error("timeout")); + const wrapped 
= manager.wrapFunctionCall("call-1", client, handler); + + await wrapped("fn", {}, "fc-1"); // tier 1 + const result = await wrapped("fn", {}, "fc-2"); // tier 4 (maxRetries=1, 2nd failure) + + const parsed = JSON.parse(result); + expect(parsed.tier).toBe(4); + expect(parsed.hangup).toBe(true); + expect(client.injectAgentMessage).toHaveBeenCalledWith("Goodbye."); + + // Advance timer to trigger hangup callback + vi.advanceTimersByTime(3000); + expect(hangups).toContain("call-1"); + + vi.useRealTimers(); + }); + + it("resets failure count on success", async () => { + const manager = createManager(); + const client = createMockClient(); + let callCount = 0; + const handler = vi.fn().mockImplementation(() => { + callCount++; + if (callCount <= 1) return Promise.reject(new Error("fail")); + return Promise.resolve("ok"); + }); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + await wrapped("fn", {}, "fc-1"); // failure → tier 1 + await wrapped("fn", {}, "fc-2"); // success → resets count + + // Next failure should be tier 1 again (not tier 2) + callCount = 0; + const result = await wrapped("fn", {}, "fc-3"); // failure → tier 1 + const parsed = JSON.parse(result); + expect(parsed.tier).toBe(1); + }); + + it("times out slow function calls", async () => { + const manager = createManager({ openclawTimeoutMs: 50 }); + const client = createMockClient(); + const handler = vi + .fn() + .mockImplementation(() => new Promise((resolve) => setTimeout(resolve, 200))); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + const result = await wrapped("slow_fn", {}, "fc-1"); + + const parsed = JSON.parse(result); + expect(parsed.fallback).toBe(true); + }); + + it("cleanup removes call state", async () => { + const manager = createManager(); + const client = createMockClient(); + const handler = vi.fn().mockRejectedValue(new Error("fail")); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + await wrapped("fn", {}, 
"fc-1"); // failure, state exists + manager.cleanup("call-1"); + + // After cleanup, next failure should be fresh (tier 1 again) + const result = await wrapped("fn", {}, "fc-2"); + const parsed = JSON.parse(result); + expect(parsed.tier).toBe(1); + }); + + it("emits fallback events with correct metadata", async () => { + const manager = createManager(); + const client = createMockClient(); + const handler = vi.fn().mockRejectedValue(new Error("fail")); + const wrapped = manager.wrapFunctionCall("call-1", client, handler); + + await wrapped("my_tool", { x: 1 }, "fc-1"); + + expect(events).toHaveLength(1); + expect(events[0]).toMatchObject({ + callId: "call-1", + tier: 1, + functionName: "my_tool", + }); + expect(events[0]!.timestamp).toBeGreaterThan(0); + }); +}); diff --git a/extensions/voice-call/src/providers/deepgram-fallback.ts b/extensions/voice-call/src/providers/deepgram-fallback.ts new file mode 100644 index 000000000..17e8f4cfe --- /dev/null +++ b/extensions/voice-call/src/providers/deepgram-fallback.ts @@ -0,0 +1,280 @@ +/** + * Graceful Degradation Chain for Deepgram Voice Calls + * + * Implements a 4-tier fallback when the primary function call handler + * (OpenClaw tool execution) fails or times out: + * + * Tier 1 — Deepgram LLM fallback: Switch to Deepgram's built-in LLM + * with a basic fallback prompt so the agent can still converse. + * Tier 2 — Canned response: Inject a pre-configured canned response. + * Tier 3 — Honest timeout: "I'm having trouble right now, one moment..." + * Tier 4 — Graceful exit: After maxRetries, speak exit message and hang up. + * + * Consecutive successes reset the failure counter. 
const LOG_PREFIX = "[DeepgramFallback]";

const DEFAULT_TIMEOUT_MSG = "I'm having trouble right now, one moment...";

const DEFAULT_EXIT_MSG = "I apologize, I need to call you back. Goodbye.";

/** Default pause between speaking the exit message and requesting the hangup. */
const DEFAULT_HANGUP_DELAY_MS = 3000;

export type FallbackTier = 1 | 2 | 3 | 4;

export interface FallbackEvent {
  callId: string;
  tier: FallbackTier;
  functionName: string;
  message: string;
  timestamp: number;
}

export interface FallbackManagerOptions {
  config: DeepgramFallbackConfig;
  /** Called when a graceful exit (tier 4) triggers a hangup. */
  onHangup?: (callId: string) => void;
  /** Called when a fallback event occurs (for observability). */
  onFallbackEvent?: (event: FallbackEvent) => void;
  /**
   * Delay in milliseconds between injecting the tier-4 exit message and
   * invoking `onHangup`, giving the message time to be spoken.
   * Defaults to 3000.
   */
  hangupDelayMs?: number;
}

interface CallFallbackState {
  consecutiveFailures: number;
  cannedResponseIndex: number;
}

type FunctionCallHandler = (
  name: string,
  args: Record<string, unknown>,
  fnCallId: string,
) => Promise<string>;

/**
 * Applies a timeout to function-call handlers and escalates through the
 * fallback tiers when a handler fails or times out:
 *
 * 1. switch the agent to the configured fallback prompt,
 * 2. inject a canned response (round-robin),
 * 3. inject an honest "having trouble" message,
 * 4. inject the exit message and request a hangup after a short delay.
 *
 * Failure counts are tracked per call and reset by any success.
 */
export class FallbackManager {
  private readonly config: DeepgramFallbackConfig;
  private readonly callState = new Map<string, CallFallbackState>();
  /** Hangup timers scheduled by tier 4 that have not fired yet, by call ID. */
  private readonly pendingHangups = new Map<string, ReturnType<typeof setTimeout>>();
  private readonly options: FallbackManagerOptions;

  constructor(options: FallbackManagerOptions) {
    this.config = options.config;
    this.options = options;
  }

  /**
   * Wrap a function call handler with timeout and fallback logic.
   *
   * The returned handler resolves with the original handler's result when it
   * settles within `config.openclawTimeoutMs`. If the handler throws, rejects,
   * or times out, the returned handler resolves (it does not reject) with a
   * JSON string of the form `{ fallback: true, tier, message, ... }` after
   * performing the side effects of the selected tier.
   *
   * @param callId - Call whose failure counter this handler updates.
   * @param client - Voice agent client used to update the prompt or inject speech.
   * @param handler - The primary handler to protect.
   */
  wrapFunctionCall(
    callId: string,
    client: DeepgramVoiceAgentClient,
    handler: FunctionCallHandler,
  ): FunctionCallHandler {
    return async (name, args, fnCallId) => {
      try {
        const result = await this.withTimeout(
          handler(name, args, fnCallId),
          this.config.openclawTimeoutMs,
        );
        this.recordSuccess(callId);
        return result;
      } catch (error) {
        const msg = error instanceof Error ? error.message : String(error);
        console.warn(`${LOG_PREFIX} Function call "${name}" failed for call ${callId}: ${msg}`);

        return this.handleFailure(callId, client, name);
      }
    };
  }

  /**
   * Clean up state for a call that ended.
   *
   * Drops the failure counters and cancels a tier-4 hangup that is still
   * waiting to fire, so `onHangup` is not invoked for a call that is already
   * gone.
   */
  cleanup(callId: string): void {
    this.callState.delete(callId);

    const timer = this.pendingHangups.get(callId);
    if (timer !== undefined) {
      clearTimeout(timer);
      this.pendingHangups.delete(callId);
    }
  }

  // -------------------------------------------------------------------------
  // Internal
  // -------------------------------------------------------------------------

  /** Fetch the state for a call, creating it on first failure. */
  private getState(callId: string): CallFallbackState {
    let state = this.callState.get(callId);
    if (!state) {
      state = { consecutiveFailures: 0, cannedResponseIndex: 0 };
      this.callState.set(callId, state);
    }
    return state;
  }

  /**
   * Reset the failure streak after a success.
   *
   * Only touches existing state: a call that never failed (or was already
   * cleaned up) must not get a map entry that nothing would ever remove.
   */
  private recordSuccess(callId: string): void {
    const state = this.callState.get(callId);
    if (state) {
      state.consecutiveFailures = 0;
    }
  }

  private handleFailure(
    callId: string,
    client: DeepgramVoiceAgentClient,
    functionName: string,
  ): string {
    const state = this.getState(callId);
    state.consecutiveFailures++;

    const tier = this.determineTier(state);

    switch (tier) {
      case 1:
        return this.tier1FallbackPrompt(callId, client, functionName);
      case 2:
        return this.tier2CannedResponse(callId, client, functionName, state);
      case 3:
        return this.tier3HonestTimeout(callId, client, functionName);
      case 4:
        return this.tier4GracefulExit(callId, client, functionName);
    }
  }

  private determineTier(state: CallFallbackState): FallbackTier {
    const hasCanned = this.config.cannedResponses.length > 0;

    // First failure: try the fallback prompt if one is configured.
    if (state.consecutiveFailures <= 1 && this.config.deepgramFallbackPrompt) {
      return 1;
    }

    // First failure without a fallback prompt, or any later failure still
    // within the retry budget: canned response if available, else tier 3.
    if (state.consecutiveFailures <= 1 || state.consecutiveFailures <= this.config.maxRetries) {
      return hasCanned ? 2 : 3;
    }

    // Exceeded max retries → graceful exit
    return 4;
  }

  /**
   * Tier 1: Switch the agent to the configured fallback prompt so it can
   * keep conversing.
   */
  private tier1FallbackPrompt(
    callId: string,
    client: DeepgramVoiceAgentClient,
    functionName: string,
  ): string {
    const prompt = this.config.deepgramFallbackPrompt;
    console.log(`${LOG_PREFIX} Tier 1: Updating prompt for call ${callId}`);
    // determineTier only selects tier 1 when a prompt is configured.
    if (prompt) {
      client.updatePrompt(prompt);
    }

    this.emitEvent(callId, 1, functionName, "Switched to fallback LLM prompt");

    return JSON.stringify({
      fallback: true,
      tier: 1,
      message: "Primary tool execution failed. Using fallback mode.",
    });
  }

  /**
   * Tier 2: Inject a canned response.
   * Cycles through the configured responses round-robin.
   */
  private tier2CannedResponse(
    callId: string,
    client: DeepgramVoiceAgentClient,
    functionName: string,
    state: CallFallbackState,
  ): string {
    const responses = this.config.cannedResponses;
    // determineTier only selects tier 2 when the list is non-empty; the
    // default merely keeps the lookup total for the type checker.
    const response =
      responses[state.cannedResponseIndex % responses.length] ?? DEFAULT_TIMEOUT_MSG;
    state.cannedResponseIndex++;

    console.log(`${LOG_PREFIX} Tier 2: Canned response for call ${callId}: "${response}"`);
    client.injectAgentMessage(response);

    this.emitEvent(callId, 2, functionName, response);

    return JSON.stringify({
      fallback: true,
      tier: 2,
      message: response,
    });
  }

  /**
   * Tier 3: Honest timeout message.
   */
  private tier3HonestTimeout(
    callId: string,
    client: DeepgramVoiceAgentClient,
    functionName: string,
  ): string {
    console.log(`${LOG_PREFIX} Tier 3: Honest timeout for call ${callId}`);
    client.injectAgentMessage(DEFAULT_TIMEOUT_MSG);

    this.emitEvent(callId, 3, functionName, DEFAULT_TIMEOUT_MSG);

    return JSON.stringify({
      fallback: true,
      tier: 3,
      message: DEFAULT_TIMEOUT_MSG,
    });
  }

  /**
   * Tier 4: Graceful exit. Speak exit message and request hangup.
   *
   * The hangup is scheduled at most once per call: further failures while a
   * hangup is already pending still speak and report the exit message but do
   * not queue additional `onHangup` invocations.
   */
  private tier4GracefulExit(
    callId: string,
    client: DeepgramVoiceAgentClient,
    functionName: string,
  ): string {
    const exitMsg = this.config.exitMessage ?? DEFAULT_EXIT_MSG;

    console.log(`${LOG_PREFIX} Tier 4: Graceful exit for call ${callId}: "${exitMsg}"`);
    client.injectAgentMessage(exitMsg);

    this.emitEvent(callId, 4, functionName, exitMsg);

    if (!this.pendingHangups.has(callId)) {
      // Short delay to let the exit message be spoken before hanging up.
      const delayMs = this.options.hangupDelayMs ?? DEFAULT_HANGUP_DELAY_MS;
      const timer = setTimeout(() => {
        this.pendingHangups.delete(callId);
        this.options.onHangup?.(callId);
      }, delayMs);
      this.pendingHangups.set(callId, timer);
    }

    return JSON.stringify({
      fallback: true,
      tier: 4,
      message: exitMsg,
      hangup: true,
    });
  }

  private emitEvent(
    callId: string,
    tier: FallbackTier,
    functionName: string,
    message: string,
  ): void {
    this.options.onFallbackEvent?.({
      callId,
      tier,
      functionName,
      message,
      timestamp: Date.now(),
    });
  }

  /**
   * Settle with `promise`, or reject with a "Timed out" error if it has not
   * settled after `timeoutMs`. The timer is always cleared once either side
   * wins, so no timer outlives the call. The underlying work is not
   * cancelled; a late result is simply ignored.
   */
  private async withTimeout<T>(promise: Promise<T>, timeoutMs: number): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`Timed out after ${timeoutMs}ms`));
      }, timeoutMs);
    });

    try {
      return await Promise.race([promise, timeout]);
    } finally {
      clearTimeout(timer);
    }
  }
}
/**
 * Deepgram Voice Agent WebSocket Client
 *
 * Manages a WebSocket connection to the Deepgram Voice Agent API for
 * fully managed voice conversations. Deepgram handles STT, LLM, and TTS
 * internally; this client bridges telephony audio and function calls.
 *
 * Protocol: wss://agent.deepgram.com/v1/agent/converse
 *
 * @see https://developers.deepgram.com/docs/voice-agent
 */

import { EventEmitter } from "node:events";
import WebSocket from "ws";
import type { DeepgramLatencyConfig } from "../config.js";

// ---------------------------------------------------------------------------
// Types — Deepgram Voice Agent Protocol
// ---------------------------------------------------------------------------

/** Audio encoding formats supported by the Deepgram Voice Agent API. */
export type DeepgramAudioEncoding =
  | "linear16"
  | "mulaw"
  | "alaw"
  | "flac"
  | "opus"
  | "ogg-opus"
  | "speex"
  | "amr-nb"
  | "amr-wb";

/** Encodings accepted for agent audio output (a subset of the input encodings). */
export type DeepgramOutputEncoding = "linear16" | "mulaw" | "alaw";

/** LLM provider identifiers accepted by the Deepgram think stage. */
export type DeepgramThinkProvider =
  | "deepgram"
  | "open_ai"
  | "anthropic"
  | "google"
  | "groq"
  | "aws_bedrock"
  | "custom";

/** TTS provider identifiers accepted by the Deepgram speak stage. */
export type DeepgramSpeakProvider =
  | "deepgram"
  | "eleven_labs"
  | "cartesia"
  | "open_ai"
  | "aws_polly";

// -- Function definition for Settings ----------------------------------------

/** Parameter schema of a function exposed to the agent. */
export interface DeepgramFunctionParameter {
  type: string;
  properties: Record<string, unknown>;
  required?: string[];
}

/** HTTP endpoint Deepgram calls when a function is executed server-side. */
export interface DeepgramFunctionEndpoint {
  url: string;
  method: string;
  headers?: Record<string, string>;
}

/** A function the agent may call during the conversation. */
export interface DeepgramFunctionDef {
  name: string;
  description: string;
  parameters: DeepgramFunctionParameter;
  /** If provided, Deepgram executes the function server-side. Omit for client-side. */
  endpoint?: DeepgramFunctionEndpoint;
}

// -- Client → Server messages ------------------------------------------------

/** Initial configuration message sent right after the socket opens. */
export interface DeepgramSettings {
  type: "Settings";
  audio: {
    input: {
      encoding: DeepgramAudioEncoding;
      sample_rate: number;
    };
    output: {
      encoding: DeepgramOutputEncoding;
      sample_rate: number;
      container?: string;
      bitrate?: number;
    };
  };
  agent: {
    language?: string;
    listen: {
      provider: {
        type: "deepgram";
        model?: string;
        keyterms?: string[];
        smart_format?: boolean;
      };
    };
    think: {
      provider: {
        type: DeepgramThinkProvider;
        model?: string;
        temperature?: number;
      };
      prompt?: string;
      context_length?: number | "max";
      functions?: DeepgramFunctionDef[];
      endpoint?: { url: string; headers?: Record<string, string> };
    };
    speak: {
      provider: {
        type: DeepgramSpeakProvider;
        model?: string;
        model_id?: string;
        voice?: string;
        language_code?: string;
      };
      endpoint?: { url: string; headers?: Record<string, string> };
    };
    context?: {
      messages?: Array<{
        role: "user" | "assistant";
        content: string;
      }>;
    };
    greeting?: string;
  };
  tags?: string[];
  experimental?: boolean;
  mip_opt_out?: boolean;
  flags?: { history?: boolean };
}

export interface DeepgramUpdateInstructions {
  type: "UpdateInstructions";
  instructions: string;
}

export interface DeepgramUpdateSpeak {
  type: "UpdateSpeak";
  model: string;
}

export interface DeepgramInjectAgentMessage {
  type: "InjectAgentMessage";
  message: string;
}

export interface DeepgramInjectUserMessage {
  type: "InjectUserMessage";
  message: string;
}

export interface DeepgramClientFunctionCallResponse {
  type: "FunctionCallResponse";
  id: string;
  name: string;
  content: string;
}

export interface DeepgramKeepAlive {
  type: "KeepAlive";
}

/** Every JSON message this client sends, discriminated by `type`. */
export type DeepgramClientMessage =
  | DeepgramSettings
  | DeepgramUpdateInstructions
  | DeepgramUpdateSpeak
  | DeepgramInjectAgentMessage
  | DeepgramInjectUserMessage
  | DeepgramClientFunctionCallResponse
  | DeepgramKeepAlive;

// -- Server → Client messages ------------------------------------------------

export interface DeepgramWelcome {
  type: "Welcome";
  request_id: string;
}

export interface DeepgramSettingsApplied {
  type: "SettingsApplied";
}

export interface DeepgramUserStartedSpeaking {
  type: "UserStartedSpeaking";
}

export interface DeepgramAgentStartedSpeaking {
  type: "AgentStartedSpeaking";
  total_latency?: number;
  tts_latency?: number;
  ttt_latency?: number;
}

export interface DeepgramAgentThinking {
  type: "AgentThinking";
}

export interface DeepgramConversationText {
  type: "ConversationText";
  role: "user" | "assistant";
  content: string;
}

export interface DeepgramFunctionCallRequest {
  type: "FunctionCallRequest";
  functions: Array<{
    id: string;
    name: string;
    /** JSON-encoded arguments; the client parses this string before dispatch. */
    arguments: string;
    client_side: boolean;
  }>;
}

export interface DeepgramAgentAudioDone {
  type: "AgentAudioDone";
}

export interface DeepgramPromptUpdated {
  type: "PromptUpdated";
}

export interface DeepgramSpeakUpdated {
  type: "SpeakUpdated";
}

export interface DeepgramInjectionRefused {
  type: "InjectionRefused";
  reason?: string;
}

export interface DeepgramServerError {
  type: "Error";
  description: string;
  code?: string;
}

export interface DeepgramServerWarning {
  type: "Warning";
  description: string;
  code?: string;
}

/** Every JSON message this client handles, discriminated by `type`. */
export type DeepgramServerMessage =
  | DeepgramWelcome
  | DeepgramSettingsApplied
  | DeepgramUserStartedSpeaking
  | DeepgramAgentStartedSpeaking
  | DeepgramAgentThinking
  | DeepgramConversationText
  | DeepgramFunctionCallRequest
  | DeepgramAgentAudioDone
  | DeepgramPromptUpdated
  | DeepgramSpeakUpdated
  | DeepgramInjectionRefused
  | DeepgramServerError
  | DeepgramServerWarning;

// ---------------------------------------------------------------------------
// Client Configuration
// ---------------------------------------------------------------------------

export interface DeepgramVoiceAgentConfig {
  /** Deepgram API key. */
  apiKey: string;

  /** STT model (default: "nova-3"). */
  sttModel?: string;
  /** TTS model (default: "aura-2-thalia-en"). */
  ttsModel?: string;
  /** TTS provider (default: "deepgram"). */
  ttsProvider?: DeepgramSpeakProvider;

  /** LLM provider type (default: "open_ai"). */
  llmProvider?: DeepgramThinkProvider;
  /** LLM model (default: "gpt-4o-mini"). */
  llmModel?: string;
  /** LLM temperature (0-2, default: 0.7). */
  llmTemperature?: number;
  /** Custom LLM endpoint (for custom/self-hosted providers). */
  llmEndpoint?: { url: string; headers?: Record<string, string> };

  /** System prompt for the agent. */
  systemPrompt?: string;
  /** Greeting spoken when the agent connects. */
  greeting?: string;
  /** Language code (default: "en"). */
  language?: string;

  /** Client-side function definitions. */
  functions?: DeepgramFunctionDef[];

  /**
   * Callback invoked when Deepgram requests a client-side function call.
   * Must return the result as a string (JSON or plain text).
   */
  onFunctionCall?: (
    name: string,
    args: Record<string, unknown>,
    callId: string,
  ) => Promise<string>;

  /** Input audio encoding (default: "mulaw" — matches Twilio). */
  inputEncoding?: DeepgramAudioEncoding;
  /** Input audio sample rate (default: 8000 for telephony). */
  inputSampleRate?: number;
  /** Output audio encoding (default: "mulaw"). */
  outputEncoding?: DeepgramOutputEncoding;
  /** Output audio sample rate (default: 8000). */
  outputSampleRate?: number;

  /** Optional conversation context to restore. */
  context?: Array<{ role: "user" | "assistant"; content: string }>;

  /** Key terms to boost recognition accuracy. */
  keyterms?: string[];

  /** Latency-hiding filler configuration. */
  latency?: DeepgramLatencyConfig;

  /** Connection timeout in ms (default: 10000). */
  connectTimeoutMs?: number;
  /** Keep-alive interval in ms (default: 5000). */
  keepAliveIntervalMs?: number;
  /** Max reconnect attempts (default: 3). */
  maxReconnectAttempts?: number;
}

// ---------------------------------------------------------------------------
// Event map
// ---------------------------------------------------------------------------

/** Events emitted by the client; each value is the listener's argument tuple. */
export interface DeepgramVoiceAgentEvents {
  /** Synthesized audio from Deepgram TTS (ready to forward to telephony). */
  audio: [audio: Buffer];
  /** Transcript of user or agent speech. */
  conversationText: [role: "user" | "assistant", content: string];
  /** User started speaking (barge-in signal). */
  userStartedSpeaking: [];
  /** Agent started speaking (with latency metrics). */
  agentStartedSpeaking: [latency?: { total?: number; tts?: number; ttt?: number }];
  /** Agent thinking (processing). */
  agentThinking: [];
  /** Agent finished sending audio for current utterance. */
  agentAudioDone: [];
  /** Function call requested by the agent. */
  functionCall: [name: string, args: Record<string, unknown>, callId: string];
  /** Filler phrase injected while waiting for a function call result. */
  fillerInjected: [phrase: string];
  /** Prompt was updated successfully. */
  promptUpdated: [];
  /** Speak model was updated successfully. */
  speakUpdated: [];
  /** Injection was refused (user speaking or agent responding). */
  injectionRefused: [reason?: string];
  /** Connection established. */
  connected: [requestId: string];
  /** Settings applied by server. */
  settingsApplied: [];
  /** Warning from server. */
  warning: [description: string, code?: string];
  /** Error from server or connection. */
  error: [error: Error];
  /** Connection closed. */
  closed: [code: number, reason: string];
}
// ---------------------------------------------------------------------------
// Client
// ---------------------------------------------------------------------------

const LOG_PREFIX = "[DeepgramVoiceAgent]";
const AGENT_WS_URL = "wss://agent.deepgram.com/v1/agent/converse";

/** Config keys for which the constructor always supplies a default. */
type DefaultedConfigKey =
  | "sttModel"
  | "ttsModel"
  | "ttsProvider"
  | "llmProvider"
  | "llmModel"
  | "llmTemperature"
  | "systemPrompt"
  | "language"
  | "inputEncoding"
  | "inputSampleRate"
  | "outputEncoding"
  | "outputSampleRate"
  | "connectTimeoutMs"
  | "keepAliveIntervalMs"
  | "maxReconnectAttempts";

/** Config after defaults are applied: defaulted keys are required, the rest unchanged. */
type ResolvedConfig = Required<Pick<DeepgramVoiceAgentConfig, "apiKey" | DefaultedConfigKey>> &
  Omit<DeepgramVoiceAgentConfig, DefaultedConfigKey>;

/**
 * Parse the JSON-encoded arguments of a function call request.
 * Anything that is not a JSON object (invalid JSON, `null`, arrays,
 * primitives) yields an empty object so handlers always get a record.
 */
function parseFunctionArguments(raw: string): Record<string, unknown> {
  try {
    const parsed: unknown = JSON.parse(raw);
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      return parsed as Record<string, unknown>;
    }
  } catch {
    // Malformed JSON falls through to the empty-object default.
  }
  return {};
}

/**
 * WebSocket client for one Deepgram Voice Agent conversation.
 *
 * Sends `Settings` on open, emits binary frames as `audio`, dispatches JSON
 * frames as typed events, sends keep-alives, and reconnects with exponential
 * backoff when the socket closes without `close()` having been called.
 */
export class DeepgramVoiceAgentClient extends EventEmitter<DeepgramVoiceAgentEvents> {
  private ws: WebSocket | null = null;
  private connected = false;
  /** True once `close()` was called; suppresses reconnects. */
  private closed = false;
  private reconnectAttempts = 0;
  private keepAliveTimer: ReturnType<typeof setInterval> | null = null;
  private readonly config: ResolvedConfig;

  constructor(config: DeepgramVoiceAgentConfig) {
    super();
    this.config = {
      ...config,
      sttModel: config.sttModel ?? "nova-3",
      ttsModel: config.ttsModel ?? "aura-2-thalia-en",
      ttsProvider: config.ttsProvider ?? "deepgram",
      llmProvider: config.llmProvider ?? "open_ai",
      llmModel: config.llmModel ?? "gpt-4o-mini",
      llmTemperature: config.llmTemperature ?? 0.7,
      systemPrompt: config.systemPrompt ?? "",
      language: config.language ?? "en",
      inputEncoding: config.inputEncoding ?? "mulaw",
      inputSampleRate: config.inputSampleRate ?? 8000,
      outputEncoding: config.outputEncoding ?? "mulaw",
      outputSampleRate: config.outputSampleRate ?? 8000,
      connectTimeoutMs: config.connectTimeoutMs ?? 10_000,
      keepAliveIntervalMs: config.keepAliveIntervalMs ?? 5_000,
      maxReconnectAttempts: config.maxReconnectAttempts ?? 3,
    };
  }

  // -------------------------------------------------------------------------
  // Connection lifecycle
  // -------------------------------------------------------------------------

  /**
   * Open the connection. Resolves once the server confirms the settings and
   * rejects on timeout, socket error, or if the socket closes first.
   */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }

  /**
   * Open one socket and wire its handlers. The returned promise settles
   * exactly once: resolved on `SettingsApplied`; rejected on timeout, on a
   * socket error, or when the socket closes before settings were applied.
   */
  private doConnect(): Promise<void> {
    return new Promise<void>((resolve, reject) => {
      const ws = new WebSocket(AGENT_WS_URL, {
        headers: {
          Authorization: `Token ${this.config.apiKey}`,
        },
      });

      this.ws = ws;

      let settled = false;
      let timeout: ReturnType<typeof setTimeout> | undefined;
      const settle = (error?: Error): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timeout);
        if (error) reject(error);
        else resolve();
      };

      // The timeout guards the whole handshake (open + SettingsApplied), since
      // the promise only resolves after the settings are confirmed.
      timeout = setTimeout(() => {
        ws.terminate();
        settle(new Error(`${LOG_PREFIX} Connection timeout`));
      }, this.config.connectTimeoutMs);

      ws.on("open", () => {
        console.log(`${LOG_PREFIX} WebSocket connected`);
        this.connected = true;

        // Send settings immediately on open
        this.sendSettings();

        // Start keep-alive
        this.startKeepAlive();
      });

      ws.on("message", (data: Buffer | string, isBinary: boolean) => {
        if (isBinary) {
          // Binary frames are audio data from TTS
          const audioBuffer = Buffer.isBuffer(data) ? data : Buffer.from(data as string, "binary");
          this.emit("audio", audioBuffer);
          return;
        }

        // Text frames are JSON protocol messages
        try {
          const message = JSON.parse(data.toString()) as DeepgramServerMessage;
          this.handleServerMessage(message, () => {
            // Only a confirmed session counts as a successful (re)connect.
            // Resetting on "open" would let a server that accepts and then
            // drops the socket trigger an unbounded reconnect loop.
            this.reconnectAttempts = 0;
            settle();
          });
        } catch (e) {
          console.error(`${LOG_PREFIX} Failed to parse message:`, e);
        }
      });

      ws.on("error", (error) => {
        console.error(`${LOG_PREFIX} WebSocket error:`, error);
        this.emitError(error);
        settle(error);
      });

      ws.on("close", (code, reason) => {
        const reasonStr = reason?.toString() || "none";
        console.log(`${LOG_PREFIX} WebSocket closed (code: ${code}, reason: ${reasonStr})`);
        this.connected = false;
        this.stopKeepAlive();
        // No-op if already settled; otherwise the caller would wait forever.
        settle(
          new Error(`${LOG_PREFIX} Connection closed before settings were applied (code: ${code})`),
        );
        this.emit("closed", code, reasonStr);

        if (!this.closed) {
          void this.attemptReconnect();
        }
      });
    });
  }

  /**
   * Reconnect after an unexpected close, waiting 1s, 2s, 4s, … between
   * attempts. Gives up after `maxReconnectAttempts` or once `close()` ran.
   */
  private async attemptReconnect(): Promise<void> {
    if (this.closed) return;
    if (this.reconnectAttempts >= this.config.maxReconnectAttempts) {
      console.error(
        `${LOG_PREFIX} Max reconnect attempts (${this.config.maxReconnectAttempts}) reached`,
      );
      return;
    }

    this.reconnectAttempts++;
    const delay = 1000 * 2 ** (this.reconnectAttempts - 1);
    console.log(
      `${LOG_PREFIX} Reconnecting ${this.reconnectAttempts}/${this.config.maxReconnectAttempts} in ${delay}ms...`,
    );

    await new Promise<void>((resolve) => setTimeout(resolve, delay));
    if (this.closed) return;

    try {
      await this.doConnect();
      console.log(`${LOG_PREFIX} Reconnected successfully`);
    } catch (error) {
      // The failed socket's own "close" event schedules the next attempt.
      console.error(`${LOG_PREFIX} Reconnect failed:`, error);
    }
  }

  /**
   * Emit `error` only when someone listens. An `error` event without a
   * listener makes EventEmitter throw, which inside a socket callback would
   * surface as an uncaught exception. Callers log before invoking this.
   */
  private emitError(error: Error): void {
    if (this.listenerCount("error") > 0) {
      this.emit("error", error);
    }
  }

  // -------------------------------------------------------------------------
  // Server message handling
  // -------------------------------------------------------------------------

  /**
   * Translate one server message into the matching event.
   *
   * @param msg Parsed server message.
   * @param onSettingsApplied Invoked when the message is `SettingsApplied`.
   */
  private handleServerMessage(
    msg: DeepgramServerMessage,
    onSettingsApplied?: (value: void) => void,
  ): void {
    switch (msg.type) {
      case "Welcome":
        console.log(`${LOG_PREFIX} Welcome (request_id: ${msg.request_id})`);
        this.emit("connected", msg.request_id);
        break;

      case "SettingsApplied":
        console.log(`${LOG_PREFIX} Settings applied`);
        this.emit("settingsApplied");
        // Resolve the connect() promise once settings are confirmed
        onSettingsApplied?.();
        break;

      case "UserStartedSpeaking":
        this.emit("userStartedSpeaking");
        break;

      case "AgentStartedSpeaking":
        this.emit("agentStartedSpeaking", {
          total: msg.total_latency,
          tts: msg.tts_latency,
          ttt: msg.ttt_latency,
        });
        break;

      case "AgentThinking":
        this.emit("agentThinking");
        break;

      case "ConversationText":
        this.emit("conversationText", msg.role, msg.content);
        break;

      case "FunctionCallRequest":
        this.handleFunctionCallRequest(msg);
        break;

      case "AgentAudioDone":
        this.emit("agentAudioDone");
        break;

      case "PromptUpdated":
        this.emit("promptUpdated");
        break;

      case "SpeakUpdated":
        this.emit("speakUpdated");
        break;

      case "InjectionRefused":
        console.warn(`${LOG_PREFIX} Injection refused: ${msg.reason ?? "unknown"}`);
        this.emit("injectionRefused", msg.reason);
        break;

      case "Error":
        console.error(`${LOG_PREFIX} Server error: ${msg.description} (code: ${msg.code})`);
        this.emitError(new Error(`Deepgram error: ${msg.description}`));
        break;

      case "Warning":
        console.warn(`${LOG_PREFIX} Server warning: ${msg.description} (code: ${msg.code})`);
        this.emit("warning", msg.description, msg.code);
        break;
    }
  }

  // -------------------------------------------------------------------------
  // Function calling
  // -------------------------------------------------------------------------

  /**
   * Emit `functionCall` for every client-side function in the request and,
   * when an `onFunctionCall` callback is configured, execute it.
   * Server-side functions are skipped.
   */
  private handleFunctionCallRequest(msg: DeepgramFunctionCallRequest): void {
    for (const fn of msg.functions) {
      if (!fn.client_side) continue;

      const parsedArgs = parseFunctionArguments(fn.arguments);

      this.emit("functionCall", fn.name, parsedArgs, fn.id);

      // If an onFunctionCall callback is configured, execute automatically
      if (this.config.onFunctionCall) {
        this.executeFunctionCallWithFiller(fn.id, fn.name, parsedArgs);
      }
    }
  }

  /**
   * Execute a function call with optional filler phrase injection.
   *
   * If the call takes longer than `fillerThresholdMs`, a randomly selected
   * filler phrase is injected via `InjectAgentMessage` so the caller hears
   * something while waiting. Exactly one `FunctionCallResponse` is sent:
   * the callback's result, or `{"error": "<message>"}` if it threw or rejected.
   */
  private executeFunctionCallWithFiller(
    fnId: string,
    fnName: string,
    args: Record<string, unknown>,
  ): void {
    const handler = this.config.onFunctionCall;
    if (!handler) return;

    const latency = this.config.latency;
    const thresholdMs = latency?.fillerThresholdMs ?? 0;
    const phrases = latency?.fillerPhrases ?? [];

    let fillerTimer: ReturnType<typeof setTimeout> | null = null;

    // Start a filler timer only when we have a threshold and phrases.
    if (thresholdMs > 0 && phrases.length > 0) {
      fillerTimer = setTimeout(() => {
        const phrase = phrases[Math.floor(Math.random() * phrases.length)];
        if (phrase === undefined) return;
        console.log(`${LOG_PREFIX} Injecting filler for "${fnName}": "${phrase}"`);
        this.injectAgentMessage(phrase);
        this.emit("fillerInjected", phrase);
      }, thresholdMs);
    }

    // The async wrapper turns a synchronous throw from the callback into the
    // same error response as a rejection.
    void (async () => {
      let content: string;
      try {
        content = await handler(fnName, args, fnId);
      } catch (error: unknown) {
        const errorMsg = error instanceof Error ? error.message : String(error);
        console.error(`${LOG_PREFIX} Function call "${fnName}" failed:`, errorMsg);
        content = JSON.stringify({ error: errorMsg });
      } finally {
        // Clearing the timer guarantees no filler is injected after completion.
        if (fillerTimer) clearTimeout(fillerTimer);
      }
      this.sendFunctionCallResponse(fnId, fnName, content);
    })();
  }

  // -------------------------------------------------------------------------
  // Client → Server messages
  // -------------------------------------------------------------------------

  /** Serialize and send a protocol message; silently dropped unless the socket is open. */
  private sendJson(message: DeepgramClientMessage): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(message));
    }
  }

  /** Build the `Settings` message from the resolved config and send it. */
  private sendSettings(): void {
    const settings: DeepgramSettings = {
      type: "Settings",
      audio: {
        input: {
          encoding: this.config.inputEncoding,
          sample_rate: this.config.inputSampleRate,
        },
        output: {
          encoding: this.config.outputEncoding,
          sample_rate: this.config.outputSampleRate,
          container: "none",
        },
      },
      agent: {
        language: this.config.language,
        listen: {
          provider: {
            type: "deepgram",
            model: this.config.sttModel,
            keyterms: this.config.keyterms,
          },
        },
        think: {
          provider: {
            type: this.config.llmProvider,
            model: this.config.llmModel,
            temperature: this.config.llmTemperature,
          },
          // Empty strings/arrays are omitted from the payload.
          prompt: this.config.systemPrompt || undefined,
          functions: this.config.functions?.length ? this.config.functions : undefined,
          endpoint: this.config.llmEndpoint,
        },
        speak: {
          provider: {
            type: this.config.ttsProvider,
            model: this.config.ttsModel,
          },
        },
        greeting: this.config.greeting || undefined,
        context: this.config.context?.length ? { messages: this.config.context } : undefined,
      },
    };

    this.sendJson(settings);
  }

  /**
   * Forward raw telephony audio to Deepgram.
   * Send the audio buffer directly as a binary WebSocket frame.
   * Dropped silently unless the socket is open.
   */
  sendAudio(audio: Buffer): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(audio);
    }
  }

  /**
   * Update the system prompt mid-conversation (e.g. for agent handoffs).
   */
  updatePrompt(instructions: string): void {
    this.sendJson({ type: "UpdateInstructions", instructions });
  }

  /**
   * Update the TTS voice/model mid-conversation.
   */
  updateSpeak(model: string): void {
    this.sendJson({ type: "UpdateSpeak", model });
  }

  /**
   * Inject a message as if the agent said it.
   * Will be refused if user is speaking or agent is already responding.
   */
  injectAgentMessage(message: string): void {
    this.sendJson({ type: "InjectAgentMessage", message });
  }

  /**
   * Inject a message as if the user said it.
   */
  injectUserMessage(message: string): void {
    this.sendJson({ type: "InjectUserMessage", message });
  }

  /**
   * Send a function call response back to Deepgram.
   */
  sendFunctionCallResponse(id: string, name: string, content: string): void {
    this.sendJson({ type: "FunctionCallResponse", id, name, content });
  }

  // -------------------------------------------------------------------------
  // Keep-alive
  // -------------------------------------------------------------------------

  /** (Re)start the periodic `KeepAlive` message. */
  private startKeepAlive(): void {
    this.stopKeepAlive();
    this.keepAliveTimer = setInterval(() => {
      this.sendJson({ type: "KeepAlive" });
    }, this.config.keepAliveIntervalMs);
  }

  /** Stop the keep-alive interval if one is running. */
  private stopKeepAlive(): void {
    if (this.keepAliveTimer) {
      clearInterval(this.keepAliveTimer);
      this.keepAliveTimer = null;
    }
  }

  // -------------------------------------------------------------------------
  // Shutdown
  // -------------------------------------------------------------------------

  /** Close the WebSocket connection and disable reconnects. */
  close(): void {
    this.closed = true;
    this.stopKeepAlive();
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }

  /** Check if the client is currently connected. */
  isConnected(): boolean {
    return this.connected;
  }
}
import { describe, expect, it, vi, beforeEach } from "vitest";
import type { DeepgramConfig } from "../config.js";
import { DeepgramProvider } from "./deepgram.js";

// Mock the DeepgramVoiceAgentClient so tests don't require a WebSocket.
// The factory is async so it can load EventEmitter with a dynamic import
// instead of CommonJS `require`, which an ES module test file should not use.
vi.mock("./deepgram-voice-agent.js", async () => {
  const { EventEmitter } = await import("node:events");

  class MockDeepgramVoiceAgentClient extends EventEmitter {
    connected = false;
    closed = false;

    async connect() {
      this.connected = true;
      this.emit("connected", "test-request-id");
      this.emit("settingsApplied");
    }

    sendAudio = vi.fn();
    updatePrompt = vi.fn();
    updateSpeak = vi.fn();
    injectAgentMessage = vi.fn();
    injectUserMessage = vi.fn();
    sendFunctionCallResponse = vi.fn();

    close() {
      this.closed = true;
      this.connected = false;
    }

    isConnected() {
      return this.connected;
    }
  }

  return { DeepgramVoiceAgentClient: MockDeepgramVoiceAgentClient };
});

/** Minimal valid provider config shared by all tests. */
function createConfig(): DeepgramConfig {
  return {
    apiKey: "test-key",
    stt: { model: "nova-3" },
    tts: { model: "aura-2-thalia-en" },
    language: "en",
    fallback: {
      openclawTimeoutMs: 5000,
      cannedResponses: [],
      maxRetries: 2,
    },
    latency: {
      fillerThresholdMs: 1500,
      fillerPhrases: [],
    },
  };
}

/** Start a call on `provider` with the fixture numbers used throughout the suite. */
function startCall(provider: DeepgramProvider, callId = "call-1") {
  return provider.initiateCall({
    callId,
    from: "+15551234567",
    to: "+15559876543",
    webhookUrl: "https://example.com/webhook",
  });
}

describe("DeepgramProvider", () => {
  it("requires apiKey", () => {
    expect(
      () => new DeepgramProvider({ ...createConfig(), apiKey: undefined } as DeepgramConfig),
    ).toThrow("Deepgram API key is required");
  });

  it("has name 'deepgram'", () => {
    const provider = new DeepgramProvider(createConfig());
    expect(provider.name).toBe("deepgram");
  });

  it("verifyWebhook always returns ok", () => {
    const provider = new DeepgramProvider(createConfig());
    const result = provider.verifyWebhook({
      headers: {},
      rawBody: "",
      url: "",
      method: "POST",
    });
    expect(result.ok).toBe(true);
  });

  it("parseWebhookEvent returns empty events", () => {
    const provider = new DeepgramProvider(createConfig());
    const result = provider.parseWebhookEvent({
      headers: {},
      rawBody: "",
      url: "",
      method: "POST",
    });
    expect(result.events).toEqual([]);
    expect(result.statusCode).toBe(200);
  });

  it("initiateCall creates a session and connects", async () => {
    const provider = new DeepgramProvider(createConfig());
    const result = await startCall(provider);

    expect(result.providerCallId).toBe("dg-call-1");
    expect(result.status).toBe("initiated");
    expect(provider.getClient("call-1")).toBeDefined();
  });

  it("hangupCall closes the session", async () => {
    const provider = new DeepgramProvider(createConfig());
    await startCall(provider);

    const client = provider.getClient("call-1");
    expect(client).toBeDefined();

    await provider.hangupCall({
      callId: "call-1",
      providerCallId: "dg-call-1",
      reason: "completed",
    });

    expect(provider.getClient("call-1")).toBeUndefined();
  });

  it("playTts injects agent message", async () => {
    const provider = new DeepgramProvider(createConfig());
    await startCall(provider);

    await provider.playTts({
      callId: "call-1",
      providerCallId: "dg-call-1",
      text: "Hello there!",
    });

    const client = provider.getClient("call-1")!;
    expect(client.injectAgentMessage).toHaveBeenCalledWith("Hello there!");
  });

  it("playTts throws for unknown call", async () => {
    const provider = new DeepgramProvider(createConfig());
    await expect(
      provider.playTts({
        callId: "unknown",
        providerCallId: "dg-unknown",
        text: "Hello",
      }),
    ).rejects.toThrow("No active Deepgram session");
  });

  it("startListening and stopListening are no-ops", async () => {
    const provider = new DeepgramProvider(createConfig());
    // Should not throw
    await provider.startListening({ callId: "call-1", providerCallId: "dg-call-1" });
    await provider.stopListening({ callId: "call-1", providerCallId: "dg-call-1" });
  });

  it("sendAudio forwards to client", async () => {
    const provider = new DeepgramProvider(createConfig());
    await startCall(provider);

    const audio = Buffer.from([1, 2, 3]);
    provider.sendAudio("call-1", audio);

    const client = provider.getClient("call-1")!;
    expect(client.sendAudio).toHaveBeenCalledWith(audio);
  });

  it("updatePrompt forwards to client", async () => {
    const provider = new DeepgramProvider(createConfig());
    await startCall(provider);

    provider.updatePrompt("call-1", "New prompt");

    const client = provider.getClient("call-1")!;
    expect(client.updatePrompt).toHaveBeenCalledWith("New prompt");
  });

  it("closeAll closes all sessions", async () => {
    const provider = new DeepgramProvider(createConfig());
    await startCall(provider, "call-1");
    await startCall(provider, "call-2");

    provider.closeAll();

    expect(provider.getClient("call-1")).toBeUndefined();
    expect(provider.getClient("call-2")).toBeUndefined();
  });

  it("emits NormalizedEvent for user speech", async () => {
    const onEvent = vi.fn();
    const provider = new DeepgramProvider(createConfig(), { onEvent });

    await startCall(provider);

    const client = provider.getClient("call-1")!;
    client.emit("conversationText", "user", "Hello there");

    expect(onEvent).toHaveBeenCalledWith(
      "call-1",
      expect.objectContaining({
        type: "call.speech",
        transcript: "Hello there",
        isFinal: true,
      }),
    );
  });

  it("emits NormalizedEvent for agent speech", async () => {
    const onEvent = vi.fn();
    const provider = new DeepgramProvider(createConfig(), { onEvent });

    await startCall(provider);

    const client = provider.getClient("call-1")!;
    client.emit("conversationText", "assistant", "Hi, how can I help?");

    expect(onEvent).toHaveBeenCalledWith(
      "call-1",
      expect.objectContaining({
        type: "call.speaking",
        text: "Hi, how can I help?",
      }),
    );
  });

  it("forwards audio events", async () => {
    const onAudio = vi.fn();
    const provider = new DeepgramProvider(createConfig(), { onAudio });

    await startCall(provider);

    const client = provider.getClient("call-1")!;
    const audioBuffer = Buffer.from([1, 2, 3]);
    client.emit("audio", audioBuffer);

    expect(onAudio).toHaveBeenCalledWith("call-1", audioBuffer);
  });
});
Voice Agent Provider + * + * Implements VoiceCallProvider by wrapping DeepgramVoiceAgentClient. + * Unlike Twilio/Telnyx/Plivo which handle telephony + media separately, + * Deepgram manages the full voice pipeline (STT → LLM → TTS) internally. + * + * This provider: + * - Manages DeepgramVoiceAgentClient instances per active call + * - Bridges audio between telephony (Twilio media streams) and Deepgram + * - Translates Deepgram events into NormalizedEvent format + * - Handles function calls from Deepgram by delegating to a callback + */ + +import crypto from "node:crypto"; +import type { DeepgramConfig } from "../config.js"; +import type { + HangupCallInput, + InitiateCallInput, + InitiateCallResult, + NormalizedEvent, + PlayTtsInput, + ProviderWebhookParseResult, + StartListeningInput, + StopListeningInput, + WebhookContext, + WebhookVerificationResult, +} from "../types.js"; +import type { VoiceCallProvider } from "./base.js"; +import { FallbackManager, type FallbackEvent } from "./deepgram-fallback.js"; +import { + DeepgramVoiceAgentClient, + type DeepgramFunctionDef, + type DeepgramVoiceAgentConfig, +} from "./deepgram-voice-agent.js"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface DeepgramSessionOverrides { + systemPrompt?: string; + llmProvider?: DeepgramVoiceAgentConfig["llmProvider"]; + llmEndpoint?: { url: string; headers?: Record }; + greeting?: string; +} + +export interface DeepgramProviderOptions { + /** LLM provider for the think stage (default: "open_ai") */ + llmProvider?: DeepgramVoiceAgentConfig["llmProvider"]; + /** LLM model (default: "gpt-4o-mini") */ + llmModel?: string; + /** Custom LLM endpoint (for self-hosted or custom providers) */ + llmEndpoint?: { url: string; headers?: Record }; + /** System prompt for the agent */ + systemPrompt?: string; + /** Greeting spoken when agent connects */ + 
greeting?: string; + /** Client-side function definitions */ + functions?: DeepgramFunctionDef[]; + /** + * Callback invoked when Deepgram requests a client-side function call. + * Returns the result as a string. + */ + onFunctionCall?: (name: string, args: Record, callId: string) => Promise; + /** Callback invoked when an event is emitted for a call */ + onEvent?: (callId: string, event: NormalizedEvent) => void; + /** Callback invoked when synthesized audio is received for a call */ + onAudio?: (callId: string, audio: Buffer) => void; + /** Key terms for speech recognition boosting */ + keyterms?: string[]; + /** Callback invoked on fallback events (for observability/logging). */ + onFallbackEvent?: (event: FallbackEvent) => void; +} + +interface ActiveSession { + client: DeepgramVoiceAgentClient; + callId: string; + providerCallId: string; +} + +// --------------------------------------------------------------------------- +// Provider +// --------------------------------------------------------------------------- + +export class DeepgramProvider implements VoiceCallProvider { + readonly name = "deepgram" as const; + + private readonly apiKey: string; + private readonly dgConfig: DeepgramConfig; + private readonly options: DeepgramProviderOptions; + private readonly sessions = new Map(); + private readonly fallback: FallbackManager; + + constructor(config: DeepgramConfig, options: DeepgramProviderOptions = {}) { + if (!config.apiKey) { + throw new Error("Deepgram API key is required"); + } + this.apiKey = config.apiKey; + this.dgConfig = config; + this.options = options; + this.fallback = new FallbackManager({ + config: config.fallback, + onHangup: (callId) => this.closeSession(callId), + onFallbackEvent: options.onFallbackEvent, + }); + } + + // ----------------------------------------------------------------------- + // VoiceCallProvider interface + // ----------------------------------------------------------------------- + + /** + * Deepgram voice agent uses 
WebSocket, not webhooks. + * Always returns ok for compatibility with the provider interface. + */ + verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult { + return { ok: true }; + } + + /** + * Deepgram events arrive via WebSocket, not HTTP webhooks. + * This is a no-op for direct Deepgram mode; events are emitted through + * the DeepgramVoiceAgentClient EventEmitter instead. + */ + parseWebhookEvent(_ctx: WebhookContext): ProviderWebhookParseResult { + return { events: [], statusCode: 200 }; + } + + /** + * Create a new Deepgram Voice Agent session for a call. + * + * In a typical telephony flow, the actual phone call is initiated by the + * telephony provider (Twilio/Telnyx). This method creates the Deepgram + * agent that handles the voice AI pipeline for that call. + */ + async initiateCall(input: InitiateCallInput): Promise { + const providerCallId = `dg-${input.callId}`; + + const client = this.createClient(input.callId); + + const session: ActiveSession = { + client, + callId: input.callId, + providerCallId, + }; + this.sessions.set(input.callId, session); + + this.wireEvents(session); + + try { + await client.connect(); + } catch (error) { + this.sessions.delete(input.callId); + throw error; + } + + return { providerCallId, status: "initiated" }; + } + + /** + * Hang up by closing the Deepgram Voice Agent WebSocket. + */ + async hangupCall(input: HangupCallInput): Promise { + const session = this.sessions.get(input.callId); + if (session) { + session.client.close(); + this.sessions.delete(input.callId); + } + } + + /** + * Inject text as an agent message. + * Deepgram handles TTS internally; this sends text to be spoken. 
+ */ + async playTts(input: PlayTtsInput): Promise { + const session = this.sessions.get(input.callId); + if (!session) { + throw new Error(`No active Deepgram session for call ${input.callId}`); + } + session.client.injectAgentMessage(input.text); + } + + /** + * No-op: Deepgram Voice Agent handles listening automatically via its + * built-in STT pipeline. The agent is always listening for user speech. + */ + async startListening(_input: StartListeningInput): Promise { + // Deepgram handles STT automatically + } + + /** + * No-op: Deepgram Voice Agent handles listening automatically. + */ + async stopListening(_input: StopListeningInput): Promise { + // Deepgram handles STT automatically + } + + // ----------------------------------------------------------------------- + // Public API (beyond VoiceCallProvider) + // ----------------------------------------------------------------------- + + /** + * Get the active DeepgramVoiceAgentClient for a call. + */ + getClient(callId: string): DeepgramVoiceAgentClient | undefined { + return this.sessions.get(callId)?.client; + } + + /** + * Create a Deepgram Voice Agent session for an existing call + * (e.g. inbound call already connected via Twilio). + */ + async createSession( + callId: string, + providerCallId: string, + overrides?: DeepgramSessionOverrides, + ): Promise { + const client = this.createClient(callId, overrides); + + const session: ActiveSession = { + client, + callId, + providerCallId, + }; + this.sessions.set(callId, session); + this.wireEvents(session); + + await client.connect(); + return client; + } + + /** + * Send raw audio to the Deepgram agent for a call. + * Audio is forwarded from telephony media streams. + */ + sendAudio(callId: string, audio: Buffer): void { + this.sessions.get(callId)?.client.sendAudio(audio); + } + + /** + * Update the system prompt mid-call (e.g. agent handoff). 
+ */ + updatePrompt(callId: string, instructions: string): void { + this.sessions.get(callId)?.client.updatePrompt(instructions); + } + + /** + * Update the TTS voice/model mid-call. + */ + updateSpeak(callId: string, model: string): void { + this.sessions.get(callId)?.client.updateSpeak(model); + } + + /** + * Close a specific session. + */ + closeSession(callId: string): void { + const session = this.sessions.get(callId); + if (session) { + session.client.close(); + this.fallback.cleanup(callId); + this.sessions.delete(callId); + } + } + + /** + * Close all active sessions. + */ + closeAll(): void { + for (const session of this.sessions.values()) { + session.client.close(); + this.fallback.cleanup(session.callId); + } + this.sessions.clear(); + } + + // ----------------------------------------------------------------------- + // Internal + // ----------------------------------------------------------------------- + + private createClient( + callId: string, + overrides?: DeepgramSessionOverrides, + ): DeepgramVoiceAgentClient { + // Wrap function call handler with fallback degradation chain. + // The fallback wrapper adds timeout + 4-tier escalation. Then the + // client's built-in filler injection (from latency config) wraps + // around it so callers hear something while waiting. + let onFunctionCall = this.options.onFunctionCall; + + // We need a reference to the client for fallback to inject messages, + // but the client needs the wrapped handler at construction time. + // Use a deferred approach: create a mutable wrapper, create the client, + // then wire up the fallback which needs the client reference. + const wrappedRef: { + fn?: (name: string, args: Record, id: string) => Promise; + } = {}; + + const client = new DeepgramVoiceAgentClient({ + apiKey: this.apiKey, + sttModel: this.dgConfig.stt.model, + ttsModel: this.dgConfig.tts.model, + language: this.dgConfig.language, + latency: this.dgConfig.latency, + llmProvider: overrides?.llmProvider ?? 
this.options.llmProvider, + llmModel: this.options.llmModel, + llmEndpoint: overrides?.llmEndpoint ?? this.options.llmEndpoint, + systemPrompt: overrides?.systemPrompt ?? this.options.systemPrompt, + greeting: overrides?.greeting ?? this.options.greeting, + functions: this.options.functions, + keyterms: this.options.keyterms, + inputEncoding: "mulaw", + inputSampleRate: 8000, + outputEncoding: "mulaw", + outputSampleRate: 8000, + onFunctionCall: onFunctionCall + ? (name, args, fnCallId) => wrappedRef.fn!(name, args, fnCallId) + : undefined, + }); + + if (onFunctionCall) { + wrappedRef.fn = this.fallback.wrapFunctionCall(callId, client, onFunctionCall); + } + + return client; + } + + private wireEvents(session: ActiveSession): void { + const { client, callId, providerCallId } = session; + + const makeBase = () => ({ + id: crypto.randomUUID(), + callId, + providerCallId, + timestamp: Date.now(), + }); + + // Forward synthesized audio + client.on("audio", (audio) => { + this.options.onAudio?.(callId, audio); + }); + + // Conversation text → NormalizedEvent + client.on("conversationText", (role, content) => { + if (role === "user") { + const event: NormalizedEvent = { + ...makeBase(), + type: "call.speech", + transcript: content, + isFinal: true, + confidence: 1.0, + }; + this.options.onEvent?.(callId, event); + } else { + const event: NormalizedEvent = { + ...makeBase(), + type: "call.speaking", + text: content, + }; + this.options.onEvent?.(callId, event); + } + }); + + // Error → NormalizedEvent + client.on("error", (error) => { + const event: NormalizedEvent = { + ...makeBase(), + type: "call.error", + error: error.message, + retryable: true, + }; + this.options.onEvent?.(callId, event); + }); + + // Connection closed → clean up + client.on("closed", () => { + this.fallback.cleanup(callId); + this.sessions.delete(callId); + }); + } +} diff --git a/extensions/voice-call/src/providers/twilio.test.ts b/extensions/voice-call/src/providers/twilio.test.ts index 
36b25005f..ea3a71412 100644 --- a/extensions/voice-call/src/providers/twilio.test.ts +++ b/extensions/voice-call/src/providers/twilio.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest"; import type { WebhookContext } from "../types.js"; import { TwilioProvider } from "./twilio.js"; -const STREAM_URL_PREFIX = "wss://example.ngrok.app/voice/stream?token="; +const STREAM_URL = "wss://example.ngrok.app/voice/stream"; function createProvider(): TwilioProvider { return new TwilioProvider( @@ -30,7 +30,8 @@ describe("TwilioProvider", () => { const result = provider.parseWebhookEvent(ctx); - expect(result.providerResponseBody).toContain(STREAM_URL_PREFIX); + expect(result.providerResponseBody).toContain(STREAM_URL); + expect(result.providerResponseBody).toContain('"); }); @@ -54,7 +55,8 @@ describe("TwilioProvider", () => { const result = provider.parseWebhookEvent(ctx); - expect(result.providerResponseBody).toContain(STREAM_URL_PREFIX); + expect(result.providerResponseBody).toContain(STREAM_URL); + expect(result.providerResponseBody).toContain('"); }); }); diff --git a/extensions/voice-call/src/providers/twilio.ts b/extensions/voice-call/src/providers/twilio.ts index b1f03b211..1e90d1fb6 100644 --- a/extensions/voice-call/src/providers/twilio.ts +++ b/extensions/voice-call/src/providers/twilio.ts @@ -351,8 +351,10 @@ export class TwilioProvider implements VoiceCallProvider { // Conversation mode: return streaming TwiML immediately for outbound calls. if (isOutbound) { - const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null; - return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML; + const streamInfo = callSid ? this.getStreamUrlForCall(callSid) : null; + return streamInfo + ? this.getStreamConnectXml(streamInfo.url, streamInfo.token) + : TwilioProvider.PAUSE_TWIML; } } @@ -364,8 +366,10 @@ export class TwilioProvider implements VoiceCallProvider { // Handle subsequent webhook requests (status callbacks, etc.) 
// For inbound calls, answer immediately with stream if (direction === "inbound") { - const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null; - return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML; + const streamInfo = callSid ? this.getStreamUrlForCall(callSid) : null; + return streamInfo + ? this.getStreamConnectXml(streamInfo.url, streamInfo.token) + : TwilioProvider.PAUSE_TWIML; } // For outbound calls, only connect to stream when call is in-progress @@ -373,8 +377,10 @@ export class TwilioProvider implements VoiceCallProvider { return TwilioProvider.EMPTY_TWIML; } - const streamUrl = callSid ? this.getStreamUrlForCall(callSid) : null; - return streamUrl ? this.getStreamConnectXml(streamUrl) : TwilioProvider.PAUSE_TWIML; + const streamInfo = callSid ? this.getStreamUrlForCall(callSid) : null; + return streamInfo + ? this.getStreamConnectXml(streamInfo.url, streamInfo.token) + : TwilioProvider.PAUSE_TWIML; } /** @@ -411,15 +417,13 @@ export class TwilioProvider implements VoiceCallProvider { return token; } - private getStreamUrlForCall(callSid: string): string | null { + private getStreamUrlForCall(callSid: string): { url: string; token: string } | null { const baseUrl = this.getStreamUrl(); if (!baseUrl) { return null; } const token = this.getStreamAuthToken(callSid); - const url = new URL(baseUrl); - url.searchParams.set("token", token); - return url.toString(); + return { url: baseUrl, token }; } /** @@ -428,11 +432,13 @@ export class TwilioProvider implements VoiceCallProvider { * * @param streamUrl - WebSocket URL (wss://...) for the media stream */ - getStreamConnectXml(streamUrl: string): string { + getStreamConnectXml(streamUrl: string, token?: string): string { + const paramXml = token ? 
`\n ` : ""; return ` - + ${paramXml} + `; } diff --git a/extensions/voice-call/src/response-generator.ts b/extensions/voice-call/src/response-generator.ts index a13ebc372..c5bc4c25e 100644 --- a/extensions/voice-call/src/response-generator.ts +++ b/extensions/voice-call/src/response-generator.ts @@ -5,6 +5,7 @@ import crypto from "node:crypto"; import type { VoiceCallConfig } from "./config.js"; +import { resolveAgentForNumber } from "./config.js"; import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js"; export type VoiceResponseParams = { @@ -16,6 +17,8 @@ export type VoiceResponseParams = { callId: string; /** Caller's phone number */ from: string; + /** Called phone number (for agent routing) */ + calledNumber?: string; /** Conversation transcript */ transcript: Array<{ speaker: "user" | "bot"; text: string }>; /** Latest user message */ @@ -39,7 +42,7 @@ type SessionEntry = { export async function generateVoiceResponse( params: VoiceResponseParams, ): Promise { - const { voiceConfig, callId, from, transcript, userMessage, coreConfig } = params; + const { voiceConfig, callId, from, calledNumber, transcript, userMessage, coreConfig } = params; if (!coreConfig) { return { text: null, error: "Core config unavailable for voice response" }; @@ -56,10 +59,12 @@ export async function generateVoiceResponse( } const cfg = coreConfig; - // Build voice-specific session key based on phone number - const normalizedPhone = from.replace(/\D/g, ""); - const sessionKey = `voice:${normalizedPhone}`; - const agentId = "main"; + // Resolve agent ID from number routing config + const agentId = resolveAgentForNumber(voiceConfig, calledNumber, "inbound"); + + // Per-call session key — each call gets a fresh session. + // Agent identity persists via workspace files (SOUL.md, IDENTITY.md), not session history. 
+ const sessionKey = `agent:${agentId}:voice:${callId}`; // Resolve paths const storePath = deps.resolveStorePath(cfg.session?.store, { agentId }); @@ -97,14 +102,23 @@ export async function generateVoiceResponse( // Resolve thinking level const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model }); - // Resolve agent identity for personalized prompt - const identity = deps.resolveAgentIdentity(cfg, agentId); - const agentName = identity?.name?.trim() || "assistant"; - - // Build system prompt with conversation history + // Build system prompt with voice-specific behavioral instructions only. + // Agent identity comes from workspace files (SOUL.md, IDENTITY.md, BOOTSTRAP.md) + // loaded by the Pi agent's normal startup path. + const tz = voiceConfig.timezone ?? "UTC"; + const localTime = new Date().toLocaleString("en-US", { + timeZone: tz, + weekday: "long", + year: "numeric", + month: "long", + day: "numeric", + hour: "numeric", + minute: "2-digit", + timeZoneName: "short", + }); const basePrompt = voiceConfig.responseSystemPrompt ?? - `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`; + `You are on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. IMPORTANT: Your responses will be spoken aloud via text-to-speech. Do NOT use any text formatting — no markdown, no bullet points, no asterisks, no numbered lists, no headers. Write plain conversational sentences only. When you need to use a tool or look something up, ALWAYS say a brief acknowledgment first so the caller isn't waiting in silence. Today is ${localTime}. Always present times in this timezone. The caller's phone number is ${from}. 
You have access to tools - use them when helpful.`; let extraSystemPrompt = basePrompt; if (transcript.length > 0) { diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 6d37d8ac2..46e01e48c 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -3,7 +3,9 @@ import type { CoreConfig } from "./core-bridge.js"; import type { VoiceCallProvider } from "./providers/base.js"; import type { TelephonyTtsRuntime } from "./telephony-tts.js"; import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js"; +import { DeepgramMediaBridge } from "./deepgram-media-bridge.js"; import { CallManager } from "./manager.js"; +import { DeepgramProvider } from "./providers/deepgram.js"; import { MockProvider } from "./providers/mock.js"; import { PlivoProvider } from "./providers/plivo.js"; import { TelnyxProvider } from "./providers/telnyx.js"; @@ -86,6 +88,20 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider { webhookSecurity: config.webhookSecurity, }, ); + case "deepgram": + return new TwilioProvider( + { + accountSid: config.twilio?.accountSid, + authToken: config.twilio?.authToken, + }, + { + allowNgrokFreeTierLoopbackBypass, + publicUrl: config.publicUrl, + skipVerification: config.skipSignatureVerification, + streamPath: config.streaming?.streamPath || "/voice/stream", + webhookSecurity: config.webhookSecurity, + }, + ); case "mock": return new MockProvider(); default: @@ -184,6 +200,47 @@ export async function createVoiceCallRuntime(params: { } } + // Deepgram hybrid mode: Twilio handles telephony, Deepgram handles voice AI + if (config.provider === "deepgram" && config.deepgram) { + const deepgramProvider = new DeepgramProvider(config.deepgram); + const twilioProvider = provider as TwilioProvider; + + // Ensure public URL is set for stream URL generation + if (publicUrl && !twilioProvider.getPublicUrl()) { + twilioProvider.setPublicUrl(publicUrl); + } + + const 
gatewayPort = process.env.OPENCLAW_GATEWAY_PORT || "18789"; + const gatewayToken = process.env.OPENCLAW_GATEWAY_TOKEN || ""; + const gatewayUrl = `http://127.0.0.1:${gatewayPort}`; + + const bridge = new DeepgramMediaBridge({ + deepgramProvider, + manager, + gatewayUrl, + gatewayToken, + publicUrl: publicUrl ?? undefined, + coreConfig, + voiceCallConfig: config, + shouldAcceptStream: ({ callId, token }) => { + const call = manager.getCallByProviderCallId(callId); + if (!call) return false; + if (!twilioProvider.isValidStreamToken(callId, token)) { + console.warn(`[voice-call] Rejecting media stream: invalid token for ${callId}`); + return false; + } + return true; + }, + }); + + webhookServer.setDeepgramMediaBridge(bridge); + if (gatewayToken) { + webhookServer.setGatewayConfig(gatewayUrl, gatewayToken); + } + + log.info("[voice-call] Deepgram hybrid mode enabled (Twilio + Deepgram + Gateway)"); + } + manager.initialize(provider, webhookUrl); const stop = async () => { diff --git a/extensions/voice-call/src/types.ts b/extensions/voice-call/src/types.ts index 38091baa4..006cc5c35 100644 --- a/extensions/voice-call/src/types.ts +++ b/extensions/voice-call/src/types.ts @@ -5,7 +5,7 @@ import type { CallMode } from "./config.js"; // Provider Identifiers // ----------------------------------------------------------------------------- -export const ProviderNameSchema = z.enum(["telnyx", "twilio", "plivo", "mock"]); +export const ProviderNameSchema = z.enum(["telnyx", "twilio", "plivo", "deepgram", "mock"]); export type ProviderName = z.infer; // ----------------------------------------------------------------------------- @@ -242,6 +242,8 @@ export type OutboundCallOptions = { message?: string; /** Call mode (overrides config default) */ mode?: CallMode; + /** Agent ID for outbound call (resolves from number from config.numbers) */ + agentId?: string; }; // ----------------------------------------------------------------------------- diff --git 
a/extensions/voice-call/src/voice-functions.test.ts b/extensions/voice-call/src/voice-functions.test.ts new file mode 100644 index 000000000..334509f2c --- /dev/null +++ b/extensions/voice-call/src/voice-functions.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from "vitest"; +import { + buildVoiceAgentFunctions, + sessionsSpawnFunction, + sessionsSendFunction, + sessionStatusFunction, + voiceHandoffFunction, +} from "./voice-functions.js"; + +describe("voice function definitions", () => { + it("buildVoiceAgentFunctions returns all four functions", () => { + const fns = buildVoiceAgentFunctions(); + + expect(fns).toHaveLength(4); + const names = fns.map((f) => f.name); + expect(names).toContain("sessions_spawn"); + expect(names).toContain("sessions_send"); + expect(names).toContain("session_status"); + expect(names).toContain("voice_handoff"); + }); + + it("sessions_spawn has required 'task' parameter", () => { + expect(sessionsSpawnFunction.name).toBe("sessions_spawn"); + expect(sessionsSpawnFunction.parameters.required).toContain("task"); + expect(sessionsSpawnFunction.parameters.properties.task).toBeDefined(); + expect(sessionsSpawnFunction.parameters.properties.task.type).toBe("string"); + }); + + it("sessions_send has required 'message' parameter", () => { + expect(sessionsSendFunction.name).toBe("sessions_send"); + expect(sessionsSendFunction.parameters.required).toContain("message"); + expect(sessionsSendFunction.parameters.properties.message).toBeDefined(); + expect(sessionsSendFunction.parameters.properties.sessionKey).toBeDefined(); + }); + + it("session_status has optional parameters only", () => { + expect(sessionStatusFunction.name).toBe("session_status"); + expect(sessionStatusFunction.parameters.required).toBeUndefined(); + expect(sessionStatusFunction.parameters.properties.sessionKey).toBeDefined(); + expect(sessionStatusFunction.parameters.properties.model).toBeDefined(); + }); + + it("voice_handoff has required 'targetAgentId' 
parameter", () => { + expect(voiceHandoffFunction.name).toBe("voice_handoff"); + expect(voiceHandoffFunction.parameters.required).toContain("targetAgentId"); + expect(voiceHandoffFunction.parameters.properties.targetAgentId).toBeDefined(); + expect(voiceHandoffFunction.parameters.properties.contextSummary).toBeDefined(); + }); + + it("all functions have descriptions", () => { + const fns = buildVoiceAgentFunctions(); + for (const fn of fns) { + expect(fn.description).toBeTruthy(); + expect(fn.description.length).toBeGreaterThan(10); + } + }); + + it("all functions have object parameter type", () => { + const fns = buildVoiceAgentFunctions(); + for (const fn of fns) { + expect(fn.parameters.type).toBe("object"); + } + }); +}); diff --git a/extensions/voice-call/src/voice-functions.ts b/extensions/voice-call/src/voice-functions.ts new file mode 100644 index 000000000..fa2397fa1 --- /dev/null +++ b/extensions/voice-call/src/voice-functions.ts @@ -0,0 +1,116 @@ +/** + * Deepgram Voice Agent function definitions for OpenClaw session tools. + * + * Maps sessions_spawn, sessions_send, and session_status to Deepgram + * client-side function definitions so the voice agent can interact with + * other agents and sessions during a call. + */ + +import type { DeepgramFunctionDef } from "./providers/deepgram-voice-agent.js"; + +/** + * Function definition for sessions_spawn — spawns a background sub-agent. + */ +export const sessionsSpawnFunction: DeepgramFunctionDef = { + name: "sessions_spawn", + description: + "Spawn a background sub-agent to handle a task asynchronously. 
The result will be announced when ready.", + parameters: { + type: "object", + properties: { + task: { + type: "string", + description: "The task description for the sub-agent to perform.", + }, + label: { + type: "string", + description: "Optional short label for identifying this task.", + }, + agentId: { + type: "string", + description: "Optional agent ID to route the task to a specific agent.", + }, + }, + required: ["task"], + }, +}; + +/** + * Function definition for sessions_send — sends a message to another session. + */ +export const sessionsSendFunction: DeepgramFunctionDef = { + name: "sessions_send", + description: + "Send a message to another active session. Use sessionKey or label to identify the target.", + parameters: { + type: "object", + properties: { + message: { + type: "string", + description: "The message to send to the target session.", + }, + sessionKey: { + type: "string", + description: "The session key of the target session.", + }, + label: { + type: "string", + description: "The label of the target session (alternative to sessionKey).", + }, + }, + required: ["message"], + }, +}; + +/** + * Function definition for session_status — queries session status. + */ +export const sessionStatusFunction: DeepgramFunctionDef = { + name: "session_status", + description: + "Show session status including model, usage, and cost. Optionally change the model override.", + parameters: { + type: "object", + properties: { + sessionKey: { + type: "string", + description: "Optional session key to query (defaults to current session).", + }, + model: { + type: "string", + description: 'Optional model override to set (use "default" to reset).', + }, + }, + }, +}; + +/** + * Function definition for voice_handoff — transfers a call to another agent. + */ +export const voiceHandoffFunction: DeepgramFunctionDef = { + name: "voice_handoff", + description: + "Transfer the current call to another agent. 
Use when the caller needs a different department or specialist.", + parameters: { + type: "object", + properties: { + targetAgentId: { + type: "string", + description: "The agent ID to hand the call off to (e.g. 'billing', 'support').", + }, + contextSummary: { + type: "string", + description: "Brief summary of the conversation so far for the receiving agent.", + }, + }, + required: ["targetAgentId"], + }, +}; + +/** + * Returns the set of Deepgram function definitions for inter-agent + * communication tools available to voice sessions. + */ +export function buildVoiceAgentFunctions(): DeepgramFunctionDef[] { + return [sessionsSpawnFunction, sessionsSendFunction, sessionStatusFunction, voiceHandoffFunction]; +} diff --git a/extensions/voice-call/src/voice-session-bridge.test.ts b/extensions/voice-call/src/voice-session-bridge.test.ts new file mode 100644 index 000000000..b82bb36aa --- /dev/null +++ b/extensions/voice-call/src/voice-session-bridge.test.ts @@ -0,0 +1,114 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import type { DeepgramVoiceAgentClient } from "./providers/deepgram-voice-agent.js"; +import { + registerVoiceSession, + unregisterVoiceSession, + injectMessageToVoiceSession, + isActiveVoiceSession, + getActiveVoiceSessionKeys, +} from "./voice-session-bridge.js"; + +function createMockClient(connected = true): DeepgramVoiceAgentClient { + return { + isConnected: vi.fn().mockReturnValue(connected), + injectAgentMessage: vi.fn(), + } as unknown as DeepgramVoiceAgentClient; +} + +describe("voice-session-bridge", () => { + afterEach(() => { + // Clean up all registered sessions between tests + for (const key of getActiveVoiceSessionKeys()) { + unregisterVoiceSession(key); + } + }); + + it("registerVoiceSession adds a session and getActiveVoiceSessionKeys returns it", () => { + const client = createMockClient(); + registerVoiceSession("agent:main:voice-call:123:456", client, "call-1"); + + const keys = getActiveVoiceSessionKeys(); + 
expect(keys).toContain("agent:main:voice-call:123:456"); + }); + + it("unregisterVoiceSession removes a session", () => { + const client = createMockClient(); + registerVoiceSession("agent:main:voice-call:123:456", client, "call-1"); + unregisterVoiceSession("agent:main:voice-call:123:456"); + + const keys = getActiveVoiceSessionKeys(); + expect(keys).not.toContain("agent:main:voice-call:123:456"); + }); + + it("isActiveVoiceSession returns true for connected session", () => { + const client = createMockClient(true); + registerVoiceSession("agent:main:voice-call:123:456", client, "call-1"); + + expect(isActiveVoiceSession("agent:main:voice-call:123:456")).toBe(true); + }); + + it("isActiveVoiceSession returns false for disconnected session", () => { + const client = createMockClient(false); + registerVoiceSession("agent:main:voice-call:123:456", client, "call-1"); + + expect(isActiveVoiceSession("agent:main:voice-call:123:456")).toBe(false); + }); + + it("isActiveVoiceSession returns false for unknown session", () => { + expect(isActiveVoiceSession("agent:main:voice-call:999:999")).toBe(false); + }); + + it("injectMessageToVoiceSession injects into connected session", () => { + const client = createMockClient(true); + registerVoiceSession("agent:main:voice-call:123:456", client, "call-1"); + + const result = injectMessageToVoiceSession( + "agent:main:voice-call:123:456", + "Hello from subagent", + ); + + expect(result).toBe(true); + expect(client.injectAgentMessage).toHaveBeenCalledWith("Hello from subagent"); + }); + + it("injectMessageToVoiceSession returns false for disconnected session", () => { + const client = createMockClient(false); + registerVoiceSession("agent:main:voice-call:123:456", client, "call-1"); + + const result = injectMessageToVoiceSession("agent:main:voice-call:123:456", "Hello"); + + expect(result).toBe(false); + }); + + it("injectMessageToVoiceSession returns false for unknown session", () => { + const result = 
injectMessageToVoiceSession("agent:main:voice-call:999:999", "Hello"); + + expect(result).toBe(false); + }); + + it("multiple sessions can be registered", () => { + const client1 = createMockClient(); + const client2 = createMockClient(); + registerVoiceSession("agent:billing:voice-call:100:200", client1, "call-1"); + registerVoiceSession("agent:support:voice-call:300:400", client2, "call-2"); + + const keys = getActiveVoiceSessionKeys(); + expect(keys).toHaveLength(2); + expect(keys).toContain("agent:billing:voice-call:100:200"); + expect(keys).toContain("agent:support:voice-call:300:400"); + }); + + it("registering same key replaces previous session", () => { + const client1 = createMockClient(); + const client2 = createMockClient(); + registerVoiceSession("agent:main:voice-call:123:456", client1, "call-1"); + registerVoiceSession("agent:main:voice-call:123:456", client2, "call-2"); + + const keys = getActiveVoiceSessionKeys(); + expect(keys).toHaveLength(1); + + injectMessageToVoiceSession("agent:main:voice-call:123:456", "Hello"); + expect(client2.injectAgentMessage).toHaveBeenCalledWith("Hello"); + expect(client1.injectAgentMessage).not.toHaveBeenCalled(); + }); +}); diff --git a/extensions/voice-call/src/voice-session-bridge.ts b/extensions/voice-call/src/voice-session-bridge.ts new file mode 100644 index 000000000..95e5308e0 --- /dev/null +++ b/extensions/voice-call/src/voice-session-bridge.ts @@ -0,0 +1,80 @@ +/** + * Voice Session Bridge + * + * Maintains a registry of active voice sessions so that subagent results + * and inter-agent messages can be injected into live Deepgram Voice Agent + * conversations via `InjectAgentMessage`. + * + * Text agents can send messages to voice sessions using `sessions_send` + * with the standard session key (e.g. `agent:main:voice-call::`). + * The gateway will route the message to the embedded Pi agent, which will + * produce a response. 
This bridge allows that response to be spoken aloud + * by injecting it into the active Deepgram connection. + */ + +import type { DeepgramVoiceAgentClient } from "./providers/deepgram-voice-agent.js"; + +type VoiceSessionEntry = { + client: DeepgramVoiceAgentClient; + callId: string; + registeredAt: number; +}; + +const activeVoiceSessions = new Map(); + +/** + * Register a live voice session so external messages can be injected. + * + * @param sessionKey - The canonical session key (e.g. `agent:main:voice-call:1234:5678`) + * @param client - The active DeepgramVoiceAgentClient for this session + * @param callId - The call ID associated with this session + */ +export function registerVoiceSession( + sessionKey: string, + client: DeepgramVoiceAgentClient, + callId: string, +): void { + activeVoiceSessions.set(sessionKey, { + client, + callId, + registeredAt: Date.now(), + }); +} + +/** + * Unregister a voice session (call ended or client disconnected). + */ +export function unregisterVoiceSession(sessionKey: string): void { + activeVoiceSessions.delete(sessionKey); +} + +/** + * Inject a message into an active voice session. + * + * Used by subagent announce flows and inter-agent sends to speak results + * to the caller. Returns true if the message was injected, false if no + * active voice session was found. + */ +export function injectMessageToVoiceSession(sessionKey: string, message: string): boolean { + const entry = activeVoiceSessions.get(sessionKey); + if (!entry || !entry.client.isConnected()) { + return false; + } + entry.client.injectAgentMessage(message); + return true; +} + +/** + * Check whether a session key corresponds to an active voice session. + */ +export function isActiveVoiceSession(sessionKey: string): boolean { + const entry = activeVoiceSessions.get(sessionKey); + return !!entry && entry.client.isConnected(); +} + +/** + * Get all currently active voice session keys. 
+ */ +export function getActiveVoiceSessionKeys(): string[] { + return Array.from(activeVoiceSessions.keys()); +} diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index 99f14a468..40e4a7468 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -3,6 +3,7 @@ import http from "node:http"; import { URL } from "node:url"; import type { VoiceCallConfig } from "./config.js"; import type { CoreConfig } from "./core-bridge.js"; +import type { DeepgramMediaBridge } from "./deepgram-media-bridge.js"; import type { CallManager } from "./manager.js"; import type { MediaStreamConfig } from "./media-stream.js"; import type { VoiceCallProvider } from "./providers/base.js"; @@ -26,6 +27,16 @@ export class VoiceCallWebhookServer { /** Media stream handler for bidirectional audio (when streaming enabled) */ private mediaStreamHandler: MediaStreamHandler | null = null; + /** Deepgram media bridge for hybrid mode */ + private deepgramBridge: DeepgramMediaBridge | null = null; + /** Gateway URL for LLM proxy */ + private gatewayUrl: string | null = null; + /** Gateway auth token */ + private gatewayToken: string | null = null; + /** Filler threshold (ms) — inject filler phrase if first SSE chunk takes longer */ + private fillerThresholdMs: number = 0; + /** Filler phrases to randomly pick from when threshold fires */ + private fillerPhrases: string[] = []; constructor( config: VoiceCallConfig, @@ -38,6 +49,12 @@ export class VoiceCallWebhookServer { this.provider = provider; this.coreConfig = coreConfig ?? null; + // Store filler config from deepgram latency settings + if (config.deepgram?.latency) { + this.fillerThresholdMs = config.deepgram.latency.fillerThresholdMs ?? 0; + this.fillerPhrases = config.deepgram.latency.fillerPhrases ?? 
[]; + } + // Initialize media stream handler if streaming is enabled if (config.streaming?.enabled) { this.initializeMediaStreaming(); @@ -51,6 +68,21 @@ export class VoiceCallWebhookServer { return this.mediaStreamHandler; } + /** + * Set the Deepgram media bridge for hybrid mode WS routing. + */ + setDeepgramMediaBridge(bridge: DeepgramMediaBridge): void { + this.deepgramBridge = bridge; + } + + /** + * Configure gateway proxy settings. + */ + setGatewayConfig(url: string, token: string): void { + this.gatewayUrl = url; + this.gatewayToken = token; + } + /** * Initialize media streaming with OpenAI Realtime STT. */ @@ -173,18 +205,23 @@ export class VoiceCallWebhookServer { }); // Handle WebSocket upgrades for media streams - if (this.mediaStreamHandler) { - this.server.on("upgrade", (request, socket, head) => { - const url = new URL(request.url || "/", `http://${request.headers.host}`); - - if (url.pathname === streamPath) { + this.server.on("upgrade", (request, socket, head) => { + const url = new URL(request.url || "/", `http://${request.headers.host}`); + + if (url.pathname === streamPath) { + if (this.deepgramBridge) { + console.log("[voice-call] WebSocket upgrade for Deepgram media bridge"); + this.deepgramBridge.handleUpgrade(request, socket, head); + } else if (this.mediaStreamHandler) { console.log("[voice-call] WebSocket upgrade for media stream"); - this.mediaStreamHandler?.handleUpgrade(request, socket, head); + this.mediaStreamHandler.handleUpgrade(request, socket, head); } else { socket.destroy(); } - }); - } + } else { + socket.destroy(); + } + }); this.server.on("error", reject); @@ -225,6 +262,12 @@ export class VoiceCallWebhookServer { ): Promise { const url = new URL(req.url || "/", `http://${req.headers.host}`); + // Gateway LLM proxy for Deepgram think.endpoint + if (url.pathname === "/v1/chat/completions" && req.method === "POST") { + await this.handleGatewayProxy(req, res); + return; + } + // Check path if 
(!url.pathname.startsWith(webhookPath)) { res.statusCode = 404; @@ -368,6 +411,7 @@ export class VoiceCallWebhookServer { coreConfig: this.coreConfig, callId, from: call.from, + calledNumber: call.to, transcript: call.transcript, userMessage, }); @@ -385,6 +429,169 @@ export class VoiceCallWebhookServer { console.error(`[voice-call] Auto-response error:`, err); } } + + /** + * Proxy /v1/chat/completions requests to the gateway. + * Used by Deepgram's think.endpoint to route LLM calls through the gateway. + * + * Supports SSE streaming: when the gateway returns `text/event-stream`, + * chunks are piped through in real-time so Deepgram can begin TTS immediately. + * If the first chunk takes longer than `fillerThresholdMs`, a filler phrase + * is injected so the caller hears an acknowledgment instead of silence. + */ + private async handleGatewayProxy( + req: http.IncomingMessage, + res: http.ServerResponse, + ): Promise { + if (!this.gatewayUrl || !this.gatewayToken) { + res.statusCode = 503; + res.end("Gateway not configured"); + return; + } + + let body: string; + try { + body = await this.readBody(req, MAX_WEBHOOK_BODY_BYTES); + } catch (err) { + if (err instanceof Error && err.message === "PayloadTooLarge") { + res.statusCode = 413; + res.end("Payload Too Large"); + return; + } + throw err; + } + + // Force streaming so the gateway uses its SSE path + try { + const parsedBody: Record = JSON.parse(body); + parsedBody.stream = true; + body = JSON.stringify(parsedBody); + } catch { + /* keep original body if parse fails */ + } + + // Forward session key and agent ID from incoming headers + const sessionKey = req.headers["x-openclaw-session-key"] as string | undefined; + const agentId = req.headers["x-openclaw-agent-id"] as string | undefined; + + const proxyHeaders: Record = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.gatewayToken}`, + }; + if (sessionKey) proxyHeaders["x-openclaw-session-key"] = sessionKey; + if (agentId) 
proxyHeaders["x-openclaw-agent-id"] = agentId; + + try { + console.log(`[voice-call] Proxying /v1/chat/completions (streaming) to ${this.gatewayUrl}`); + const proxyRes = await fetch(`${this.gatewayUrl}/v1/chat/completions`, { + method: "POST", + headers: proxyHeaders, + body, + }); + + res.statusCode = proxyRes.status; + const contentType = proxyRes.headers.get("content-type"); + if (contentType) res.setHeader("Content-Type", contentType); + + // SSE streaming — pipe chunks through in real-time + if (contentType?.includes("text/event-stream") && proxyRes.body) { + res.setHeader("Cache-Control", "no-cache"); + res.setHeader("Connection", "keep-alive"); + res.flushHeaders(); + + const reader = proxyRes.body.getReader(); + let firstChunkSeen = false; + let fillerSent = false; + let skipNextRoleChunk = false; + + const threshold = this.fillerThresholdMs; + const phrases = this.fillerPhrases; + + // Set up filler timer — fires if no data arrives before threshold + const fillerTimer = + threshold > 0 && phrases.length > 0 + ? 
setTimeout(() => { + if (!firstChunkSeen) { + const phrase = phrases[Math.floor(Math.random() * phrases.length)]; + const id = `filler_${Date.now()}`; + const created = Math.floor(Date.now() / 1000); + res.write( + `data: ${JSON.stringify({ + id, + object: "chat.completion.chunk", + created, + model: "filler", + choices: [{ index: 0, delta: { role: "assistant" } }], + })}\n\n`, + ); + res.write( + `data: ${JSON.stringify({ + id, + object: "chat.completion.chunk", + created, + model: "filler", + choices: [{ index: 0, delta: { content: phrase }, finish_reason: null }], + })}\n\n`, + ); + fillerSent = true; + skipNextRoleChunk = true; + console.log(`[voice-call] Injected filler phrase: "${phrase}"`); + } + }, threshold) + : null; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + if (!firstChunkSeen) { + firstChunkSeen = true; + if (fillerTimer) clearTimeout(fillerTimer); + } + + // If filler was sent, skip the gateway's initial role chunk to avoid duplication + if (skipNextRoleChunk && fillerSent) { + const text = new TextDecoder().decode(value); + // Check if this chunk contains only the role delta (no content) + if (text.includes('"role"') && !text.includes('"content"')) { + skipNextRoleChunk = false; + continue; + } + skipNextRoleChunk = false; + } + + res.write(value); + } + } finally { + if (fillerTimer) clearTimeout(fillerTimer); + res.end(); + } + } else { + // Non-streaming — buffer and forward + const responseBody = await proxyRes.text(); + res.end(responseBody); + } + } catch (err) { + console.error("[voice-call] Gateway proxy error:", err); + if (!res.headersSent) { + res.statusCode = 502; + res.end("Bad Gateway"); + } else { + // Mid-stream error — write an SSE error event and close + try { + res.write( + `data: ${JSON.stringify({ + error: { message: "Gateway proxy error", type: "proxy_error" }, + })}\n\n`, + ); + } catch { + /* response may already be closed */ + } + res.end(); + } + } + } } /** diff --git 
a/src/agents/anthropic-models.ts b/src/agents/anthropic-models.ts new file mode 100644 index 000000000..9023ea24d --- /dev/null +++ b/src/agents/anthropic-models.ts @@ -0,0 +1,142 @@ +import type { ModelDefinitionConfig } from "../config/types.js"; + +export const ANTHROPIC_BASE_URL = "https://api.anthropic.com"; + +// Anthropic uses per-token pricing that varies by model. +// Set to 0 as costs vary by model; override in models.json for accurate costs. +export const ANTHROPIC_DEFAULT_COST = { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, +}; + +/** + * Static catalog of common Anthropic chat models. + * Serves as a fallback when the Anthropic API is unreachable. + */ +export const ANTHROPIC_MODEL_CATALOG = [ + { + id: "claude-opus-4-6", + name: "Claude Opus 4.6", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 200000, + maxTokens: 32000, + }, + { + id: "claude-sonnet-4-5-20250929", + name: "Claude Sonnet 4.5", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 200000, + maxTokens: 16384, + }, + { + id: "claude-haiku-4-5-20251001", + name: "Claude Haiku 4.5", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 200000, + maxTokens: 8192, + }, +] as const; + +export type AnthropicCatalogEntry = (typeof ANTHROPIC_MODEL_CATALOG)[number]; + +export function buildAnthropicModelDefinition(entry: AnthropicCatalogEntry): ModelDefinitionConfig { + return { + id: entry.id, + name: entry.name, + reasoning: entry.reasoning, + input: [...entry.input], + cost: ANTHROPIC_DEFAULT_COST, + contextWindow: entry.contextWindow, + maxTokens: entry.maxTokens, + }; +} + +// Prefixes for non-chat models to filter out from discovery +const NON_CHAT_PREFIXES = ["claude-1", "claude-instant"]; + +// Anthropic API response types +interface AnthropicModel { + id: string; + type: string; + display_name?: string; +} + +interface AnthropicModelsResponse { + data: AnthropicModel[]; +} + +/** + * Discover models from 
Anthropic API with fallback to static catalog. + * Requires an API key for authentication. + */ +export async function discoverAnthropicModels(params: { + apiKey: string; +}): Promise { + // Skip API discovery in test environment + if (process.env.NODE_ENV === "test" || process.env.VITEST) { + return ANTHROPIC_MODEL_CATALOG.map(buildAnthropicModelDefinition); + } + + try { + const response = await fetch(`${ANTHROPIC_BASE_URL}/v1/models`, { + headers: { + "x-api-key": params.apiKey, + "anthropic-version": "2023-06-01", + }, + signal: AbortSignal.timeout(5000), + }); + + if (!response.ok) { + console.warn( + `[anthropic-models] Failed to discover models: HTTP ${response.status}, using static catalog`, + ); + return ANTHROPIC_MODEL_CATALOG.map(buildAnthropicModelDefinition); + } + + const data = (await response.json()) as AnthropicModelsResponse; + if (!Array.isArray(data.data) || data.data.length === 0) { + console.warn("[anthropic-models] No models found from API, using static catalog"); + return ANTHROPIC_MODEL_CATALOG.map(buildAnthropicModelDefinition); + } + + // Filter out legacy non-chat models + const chatModels = data.data.filter( + (m) => !NON_CHAT_PREFIXES.some((prefix) => m.id.startsWith(prefix)), + ); + + // Merge discovered models with catalog metadata + const catalogById = new Map( + ANTHROPIC_MODEL_CATALOG.map((m) => [m.id, m]), + ); + const models: ModelDefinitionConfig[] = []; + + for (const apiModel of chatModels) { + const catalogEntry = catalogById.get(apiModel.id); + if (catalogEntry) { + // Use catalog metadata for known models + models.push(buildAnthropicModelDefinition(catalogEntry)); + } else { + // Create definition for newly discovered models not in catalog + models.push({ + id: apiModel.id, + name: apiModel.display_name ?? apiModel.id, + reasoning: false, + input: ["text", "image"], + cost: ANTHROPIC_DEFAULT_COST, + contextWindow: 200000, + maxTokens: 8192, + }); + } + } + + return models.length > 0 ? 
models : ANTHROPIC_MODEL_CATALOG.map(buildAnthropicModelDefinition); + } catch (error) { + console.warn(`[anthropic-models] Discovery failed: ${String(error)}, using static catalog`); + return ANTHROPIC_MODEL_CATALOG.map(buildAnthropicModelDefinition); + } +} diff --git a/src/agents/model-catalog.ts b/src/agents/model-catalog.ts index 3ae2a1204..b89178663 100644 --- a/src/agents/model-catalog.ts +++ b/src/agents/model-catalog.ts @@ -61,7 +61,7 @@ export async function loadModelCatalog(params?: { }); try { const cfg = params?.config ?? loadConfig(); - await ensureOpenClawModelsJson(cfg); + const { activeProviders } = await ensureOpenClawModelsJson(cfg); // IMPORTANT: keep the dynamic import *inside* the try/catch. // If this fails once (e.g. during a pnpm install that temporarily swaps node_modules), // we must not poison the cache with a rejected promise (otherwise all channel handlers @@ -95,12 +95,16 @@ export async function loadModelCatalog(params?: { models.push({ id, name, provider, contextWindow, reasoning, input }); } - if (models.length === 0) { + // Filter to only providers with active credentials. + const filtered = + activeProviders.size > 0 ? models.filter((m) => activeProviders.has(m.provider)) : models; + + if (filtered.length === 0) { // If we found nothing, don't cache this result so we can try again. 
modelCatalogPromise = null; } - return sortModels(models); + return sortModels(filtered); } catch (error) { if (!hasLoggedModelCatalogError) { hasLoggedModelCatalogError = true; diff --git a/src/agents/models-config.providers.ts b/src/agents/models-config.providers.ts index d4ae66cc0..1ea7ebc30 100644 --- a/src/agents/models-config.providers.ts +++ b/src/agents/models-config.providers.ts @@ -4,6 +4,7 @@ import { DEFAULT_COPILOT_API_BASE_URL, resolveCopilotApiToken, } from "../providers/github-copilot-token.js"; +import { discoverAnthropicModels, ANTHROPIC_BASE_URL } from "./anthropic-models.js"; import { ensureAuthProfileStore, listProfilesForProvider } from "./auth-profiles.js"; import { discoverBedrockModels } from "./bedrock-discovery.js"; import { @@ -11,6 +12,7 @@ import { resolveCloudflareAiGatewayBaseUrl, } from "./cloudflare-ai-gateway.js"; import { resolveAwsSdkEnvVarName, resolveEnvApiKey } from "./model-auth.js"; +import { discoverOpenAiModels, OPENAI_BASE_URL } from "./openai-models.js"; import { buildSyntheticModelDefinition, SYNTHETIC_BASE_URL, @@ -405,6 +407,24 @@ async function buildVeniceProvider(): Promise { }; } +async function buildOpenAiProvider(apiKey: string): Promise { + const models = await discoverOpenAiModels({ apiKey }); + return { + baseUrl: OPENAI_BASE_URL, + api: "openai-completions", + models, + }; +} + +async function buildAnthropicProvider(apiKey: string): Promise { + const models = await discoverAnthropicModels({ apiKey }); + return { + baseUrl: ANTHROPIC_BASE_URL, + api: "anthropic-messages", + models, + }; +} + async function buildOllamaProvider(): Promise { const models = await discoverOllamaModels(); return { @@ -485,6 +505,31 @@ export async function resolveImplicitProviders(params: { providers.venice = { ...(await buildVeniceProvider()), apiKey: veniceKey }; } + const openaiEnv = resolveEnvApiKey("openai"); + const openaiProfileKey = resolveApiKeyFromProfiles({ provider: "openai", store: authStore }); + const 
openaiApiKeyValue = openaiEnv?.apiKey ?? openaiProfileKey; + const openaiProviderKey = resolveEnvApiKeyVarName("openai") ?? openaiProfileKey; + if (openaiApiKeyValue && openaiProviderKey) { + providers.openai = { + ...(await buildOpenAiProvider(openaiApiKeyValue)), + apiKey: openaiProviderKey, + }; + } + + const anthropicEnv = resolveEnvApiKey("anthropic"); + const anthropicProfileKey = resolveApiKeyFromProfiles({ + provider: "anthropic", + store: authStore, + }); + const anthropicApiKeyValue = anthropicEnv?.apiKey ?? anthropicProfileKey; + const anthropicProviderKey = resolveEnvApiKeyVarName("anthropic") ?? anthropicProfileKey; + if (anthropicApiKeyValue && anthropicProviderKey) { + providers.anthropic = { + ...(await buildAnthropicProvider(anthropicApiKeyValue)), + apiKey: anthropicProviderKey, + }; + } + const qwenProfiles = listProfilesForProvider(authStore, "qwen-portal"); if (qwenProfiles.length > 0) { providers["qwen-portal"] = { diff --git a/src/agents/models-config.ts b/src/agents/models-config.ts index b322f7d61..7e6f10551 100644 --- a/src/agents/models-config.ts +++ b/src/agents/models-config.ts @@ -84,7 +84,7 @@ async function readJson(pathname: string): Promise { export async function ensureOpenClawModelsJson( config?: OpenClawConfig, agentDirOverride?: string, -): Promise<{ agentDir: string; wrote: boolean }> { +): Promise<{ agentDir: string; wrote: boolean; activeProviders: Set }> { const cfg = config ?? loadConfig(); const agentDir = agentDirOverride?.trim() ? agentDirOverride.trim() : resolveOpenClawAgentDir(); @@ -106,8 +106,10 @@ export async function ensureOpenClawModelsJson( providers["github-copilot"] = implicitCopilot; } - if (Object.keys(providers).length === 0) { - return { agentDir, wrote: false }; + const activeProviders = new Set(Object.keys(providers)); + + if (activeProviders.size === 0) { + return { agentDir, wrote: false, activeProviders }; } const mode = cfg.models?.mode ?? 
DEFAULT_MODE; @@ -138,10 +140,10 @@ export async function ensureOpenClawModelsJson( } if (existingRaw === next) { - return { agentDir, wrote: false }; + return { agentDir, wrote: false, activeProviders }; } await fs.mkdir(agentDir, { recursive: true, mode: 0o700 }); await fs.writeFile(targetPath, next, { mode: 0o600 }); - return { agentDir, wrote: true }; + return { agentDir, wrote: true, activeProviders }; } diff --git a/src/agents/openai-models.ts b/src/agents/openai-models.ts new file mode 100644 index 000000000..d67ce8510 --- /dev/null +++ b/src/agents/openai-models.ts @@ -0,0 +1,214 @@ +import type { ModelDefinitionConfig } from "../config/types.js"; + +export const OPENAI_BASE_URL = "https://api.openai.com/v1"; + +// OpenAI uses per-token pricing that varies by model. +// Set to 0 as costs vary by model; override in models.json for accurate costs. +export const OPENAI_DEFAULT_COST = { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, +}; + +/** + * Static catalog of common OpenAI chat models. + * Serves as a fallback when the OpenAI API is unreachable. 
+ */ +export const OPENAI_MODEL_CATALOG = [ + { + id: "gpt-4.1", + name: "GPT-4.1", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 1047576, + maxTokens: 32768, + }, + { + id: "gpt-4.1-mini", + name: "GPT-4.1 Mini", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 1047576, + maxTokens: 32768, + }, + { + id: "gpt-4.1-nano", + name: "GPT-4.1 Nano", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 1047576, + maxTokens: 32768, + }, + { + id: "gpt-4o", + name: "GPT-4o", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 128000, + maxTokens: 16384, + }, + { + id: "gpt-4o-mini", + name: "GPT-4o Mini", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 128000, + maxTokens: 16384, + }, + { + id: "o4-mini", + name: "o4-mini", + reasoning: true, + input: ["text", "image"] as const, + contextWindow: 200000, + maxTokens: 100000, + }, + { + id: "o3", + name: "o3", + reasoning: true, + input: ["text", "image"] as const, + contextWindow: 200000, + maxTokens: 100000, + }, + { + id: "o3-mini", + name: "o3-mini", + reasoning: true, + input: ["text"] as const, + contextWindow: 200000, + maxTokens: 100000, + }, + { + id: "o1", + name: "o1", + reasoning: true, + input: ["text", "image"] as const, + contextWindow: 200000, + maxTokens: 100000, + }, + { + id: "o1-mini", + name: "o1-mini", + reasoning: true, + input: ["text"] as const, + contextWindow: 128000, + maxTokens: 65536, + }, + { + id: "chatgpt-4o-latest", + name: "ChatGPT-4o Latest", + reasoning: false, + input: ["text", "image"] as const, + contextWindow: 128000, + maxTokens: 16384, + }, +] as const; + +export type OpenAiCatalogEntry = (typeof OPENAI_MODEL_CATALOG)[number]; + +export function buildOpenAiModelDefinition(entry: OpenAiCatalogEntry): ModelDefinitionConfig { + return { + id: entry.id, + name: entry.name, + reasoning: entry.reasoning, + input: [...entry.input], + cost: OPENAI_DEFAULT_COST, + 
contextWindow: entry.contextWindow, + maxTokens: entry.maxTokens, + }; +} + +// Prefixes for non-chat models to filter out from discovery +const NON_CHAT_PREFIXES = ["dall-e", "whisper", "tts", "text-embedding", "babbage", "davinci"]; + +// OpenAI API response types +interface OpenAiModel { + id: string; + object: string; + owned_by: string; +} + +interface OpenAiModelsResponse { + data: OpenAiModel[]; +} + +/** + * Discover models from OpenAI API with fallback to static catalog. + * Requires an API key for authentication. + * + * Note: `baseUrl` should be the root URL without `/v1` (e.g. `https://api.openai.com`), + * since this function appends `/v1/models` itself. This differs from `OPENAI_BASE_URL` + * which includes `/v1` for use as the completions base URL. + */ +export async function discoverOpenAiModels(params: { + apiKey: string; + baseUrl?: string; +}): Promise { + // Skip API discovery in test environment + if (process.env.NODE_ENV === "test" || process.env.VITEST) { + return OPENAI_MODEL_CATALOG.map(buildOpenAiModelDefinition); + } + + const baseUrl = params.baseUrl ?? 
"https://api.openai.com"; + + try { + const response = await fetch(`${baseUrl}/v1/models`, { + headers: { + Authorization: `Bearer ${params.apiKey}`, + }, + signal: AbortSignal.timeout(5000), + }); + + if (!response.ok) { + console.warn( + `[openai-models] Failed to discover models: HTTP ${response.status}, using static catalog`, + ); + return OPENAI_MODEL_CATALOG.map(buildOpenAiModelDefinition); + } + + const data = (await response.json()) as OpenAiModelsResponse; + if (!Array.isArray(data.data) || data.data.length === 0) { + console.warn("[openai-models] No models found from API, using static catalog"); + return OPENAI_MODEL_CATALOG.map(buildOpenAiModelDefinition); + } + + // Filter to chat-capable models only + const chatModels = data.data.filter( + (m) => !NON_CHAT_PREFIXES.some((prefix) => m.id.startsWith(prefix)), + ); + + // Merge discovered models with catalog metadata + const catalogById = new Map( + OPENAI_MODEL_CATALOG.map((m) => [m.id, m]), + ); + const models: ModelDefinitionConfig[] = []; + + for (const apiModel of chatModels) { + const catalogEntry = catalogById.get(apiModel.id); + if (catalogEntry) { + // Use catalog metadata for known models + models.push(buildOpenAiModelDefinition(catalogEntry)); + } else { + // Create definition for newly discovered models not in catalog + const isReasoning = /^o\d/.test(apiModel.id); + + models.push({ + id: apiModel.id, + name: apiModel.id, + reasoning: isReasoning, + input: ["text"], + cost: OPENAI_DEFAULT_COST, + contextWindow: 128000, + maxTokens: 16384, + }); + } + } + + return models.length > 0 ? 
models : OPENAI_MODEL_CATALOG.map(buildOpenAiModelDefinition); + } catch (error) { + console.warn(`[openai-models] Discovery failed: ${String(error)}, using static catalog`); + return OPENAI_MODEL_CATALOG.map(buildOpenAiModelDefinition); + } +} diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index f195150a0..8aab70f04 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -166,7 +166,11 @@ export async function runEmbeddedAttempt( let restoreSkillEnv: (() => void) | undefined; process.chdir(effectiveWorkspace); try { - const shouldLoadSkillEntries = !params.skillsSnapshot || !params.skillsSnapshot.resolvedSkills; + // When sandboxed with isolated workspace, reload skills from the sandbox dir + // so paths point inside the sandbox (not the gateway container's filesystem). + const isSandboxedIsolated = !!sandbox?.enabled && sandbox.workspaceAccess !== "rw"; + const shouldLoadSkillEntries = + isSandboxedIsolated || !params.skillsSnapshot || !params.skillsSnapshot.resolvedSkills; const skillEntries = shouldLoadSkillEntries ? loadWorkspaceSkillEntries(effectiveWorkspace) : []; @@ -181,7 +185,7 @@ export async function runEmbeddedAttempt( }); const skillsPrompt = resolveSkillsPromptForRun({ - skillsSnapshot: params.skillsSnapshot, + skillsSnapshot: isSandboxedIsolated ? undefined : params.skillsSnapshot, entries: shouldLoadSkillEntries ? 
skillEntries : undefined, config: params.config, workspaceDir: effectiveWorkspace, diff --git a/src/agents/tools/web-search.test.ts b/src/agents/tools/web-search.test.ts index 447e53102..19ba3228b 100644 --- a/src/agents/tools/web-search.test.ts +++ b/src/agents/tools/web-search.test.ts @@ -8,6 +8,7 @@ const { resolveGrokApiKey, resolveGrokModel, resolveGrokInlineCitations, + resolvePerplexitySearchDomainFilter, } = __testing; describe("web_search perplexity baseUrl defaults", () => { @@ -75,6 +76,33 @@ describe("web_search freshness normalization", () => { }); }); +describe("web_search perplexity domain filter normalization", () => { + it("lowercases and sorts domains", () => { + expect( + resolvePerplexitySearchDomainFilter({ + searchDomainFilter: ["Reddit.com", "arxiv.org", "Github.COM"], + }), + ).toEqual(["arxiv.org", "github.com", "reddit.com"]); + }); + + it("filters out empty/whitespace-only entries", () => { + expect( + resolvePerplexitySearchDomainFilter({ + searchDomainFilter: ["example.com", " ", ""], + }), + ).toEqual(["example.com"]); + }); + + it("returns undefined for empty array", () => { + expect(resolvePerplexitySearchDomainFilter({ searchDomainFilter: [] })).toBeUndefined(); + }); + + it("returns undefined when not configured", () => { + expect(resolvePerplexitySearchDomainFilter({})).toBeUndefined(); + expect(resolvePerplexitySearchDomainFilter(undefined)).toBeUndefined(); + }); +}); + describe("web_search grok config resolution", () => { it("uses config apiKey when provided", () => { expect(resolveGrokApiKey({ apiKey: "xai-test-key" })).toBe("xai-test-key"); diff --git a/src/agents/tools/web-search.ts b/src/agents/tools/web-search.ts index 5653952a9..d91222c1d 100644 --- a/src/agents/tools/web-search.ts +++ b/src/agents/tools/web-search.ts @@ -66,6 +66,12 @@ const WebSearchSchema = Type.Object({ "Filter results by discovery time (Brave only). 
Values: 'pd' (past 24h), 'pw' (past week), 'pm' (past month), 'py' (past year), or date range 'YYYY-MM-DDtoYYYY-MM-DD'.", }), ), + recency: Type.Optional( + Type.String({ + description: + "Filter by recency (Perplexity only). Values: 'hour', 'day', 'week', 'month', 'year'.", + }), + ), }); type WebSearchConfig = NonNullable["web"] extends infer Web @@ -91,6 +97,11 @@ type PerplexityConfig = { apiKey?: string; baseUrl?: string; model?: string; + searchContextSize?: string; + searchRecencyFilter?: string; + searchDomainFilter?: string[]; + returnRelatedQuestions?: boolean; + systemPrompt?: string; }; type PerplexityApiKeySource = "config" | "perplexity_env" | "openrouter_env" | "none"; @@ -118,6 +129,7 @@ type PerplexitySearchResponse = { }; }>; citations?: string[]; + related_questions?: string[]; }; type PerplexityBaseUrlHint = "direct" | "openrouter"; @@ -277,6 +289,48 @@ function resolvePerplexityModel(perplexity?: PerplexityConfig): string { return fromConfig || DEFAULT_PERPLEXITY_MODEL; } +const PERPLEXITY_RECENCY_VALUES = new Set(["hour", "day", "week", "month", "year"]); + +function resolvePerplexitySearchContextSize(perplexity?: PerplexityConfig): string { + const fromConfig = perplexity?.searchContextSize?.trim().toLowerCase() ?? 
""; + if (fromConfig === "low" || fromConfig === "medium" || fromConfig === "high") { + return fromConfig; + } + return "high"; +} + +function resolvePerplexitySearchRecencyFilter( + perplexity?: PerplexityConfig, + agentArg?: string, +): string | undefined { + const value = + agentArg?.trim().toLowerCase() || perplexity?.searchRecencyFilter?.trim().toLowerCase(); + if (value && PERPLEXITY_RECENCY_VALUES.has(value)) { + return value; + } + return undefined; +} + +function resolvePerplexitySearchDomainFilter(perplexity?: PerplexityConfig): string[] | undefined { + const domains = perplexity?.searchDomainFilter; + if (!Array.isArray(domains) || domains.length === 0) { + return undefined; + } + return domains + .map((d) => d.trim().toLowerCase()) + .filter(Boolean) + .toSorted(); +} + +function resolvePerplexitySystemPrompt(perplexity?: PerplexityConfig): string | undefined { + const prompt = perplexity?.systemPrompt?.trim(); + return prompt || undefined; +} + +function resolvePerplexityReturnRelatedQuestions(perplexity?: PerplexityConfig): boolean { + return perplexity?.returnRelatedQuestions === true; +} + function resolveGrokConfig(search?: WebSearchConfig): GrokConfig { if (!search || typeof search !== "object") { return {}; @@ -375,9 +429,38 @@ async function runPerplexitySearch(params: { baseUrl: string; model: string; timeoutSeconds: number; -}): Promise<{ content: string; citations: string[] }> { + searchContextSize?: string; + searchRecencyFilter?: string; + searchDomainFilter?: string[]; + returnRelatedQuestions?: boolean; + systemPrompt?: string; +}): Promise<{ content: string; citations: string[]; relatedQuestions?: string[] }> { const endpoint = `${params.baseUrl.replace(/\/$/, "")}/chat/completions`; + const messages: Array<{ role: string; content: string }> = []; + if (params.systemPrompt) { + messages.push({ role: "system", content: params.systemPrompt }); + } + messages.push({ role: "user", content: params.query }); + + const body: Record = { + model: 
params.model, + messages, + web_search_options: { + search_context_size: params.searchContextSize ?? "high", + }, + }; + + if (params.searchRecencyFilter) { + body.search_recency_filter = params.searchRecencyFilter; + } + if (params.searchDomainFilter?.length) { + body.search_domain_filter = params.searchDomainFilter; + } + if (params.returnRelatedQuestions) { + body.return_related_questions = true; + } + const res = await fetch(endpoint, { method: "POST", headers: { @@ -386,15 +469,7 @@ async function runPerplexitySearch(params: { "HTTP-Referer": "https://openclaw.ai", "X-Title": "OpenClaw Web Search", }, - body: JSON.stringify({ - model: params.model, - messages: [ - { - role: "user", - content: params.query, - }, - ], - }), + body: JSON.stringify(body), signal: withTimeout(undefined, params.timeoutSeconds * 1000), }); @@ -406,8 +481,9 @@ async function runPerplexitySearch(params: { const data = (await res.json()) as PerplexitySearchResponse; const content = data.choices?.[0]?.message?.content ?? "No response"; const citations = data.citations ?? []; + const relatedQuestions = data.related_questions; - return { content, citations }; + return { content, citations, relatedQuestions }; } async function runGrokSearch(params: { @@ -472,6 +548,11 @@ async function runWebSearch(params: { freshness?: string; perplexityBaseUrl?: string; perplexityModel?: string; + perplexitySearchContextSize?: string; + perplexitySearchRecencyFilter?: string; + perplexitySearchDomainFilter?: string[]; + perplexityReturnRelatedQuestions?: boolean; + perplexitySystemPrompt?: string; grokModel?: string; grokInlineCitations?: boolean; }): Promise> { @@ -479,7 +560,7 @@ async function runWebSearch(params: { params.provider === "brave" ? `${params.provider}:${params.query}:${params.count}:${params.country || "default"}:${params.search_lang || "default"}:${params.ui_lang || "default"}:${params.freshness || "default"}` : params.provider === "perplexity" - ? 
`${params.provider}:${params.query}:${params.perplexityBaseUrl ?? DEFAULT_PERPLEXITY_BASE_URL}:${params.perplexityModel ?? DEFAULT_PERPLEXITY_MODEL}` + ? `${params.provider}:${params.query}:${params.perplexityBaseUrl ?? DEFAULT_PERPLEXITY_BASE_URL}:${params.perplexityModel ?? DEFAULT_PERPLEXITY_MODEL}:${params.perplexitySearchContextSize ?? "high"}:${params.perplexitySearchRecencyFilter ?? "none"}:${params.perplexitySearchDomainFilter?.join(",") ?? "none"}:${String(params.perplexityReturnRelatedQuestions ?? false)}:${params.perplexitySystemPrompt ?? "none"}` : `${params.provider}:${params.query}:${params.grokModel ?? DEFAULT_GROK_MODEL}:${String(params.grokInlineCitations ?? false)}`, ); const cached = readCache(SEARCH_CACHE, cacheKey); @@ -490,22 +571,41 @@ async function runWebSearch(params: { const start = Date.now(); if (params.provider === "perplexity") { - const { content, citations } = await runPerplexitySearch({ + const { content, citations, relatedQuestions } = await runPerplexitySearch({ query: params.query, apiKey: params.apiKey, baseUrl: params.perplexityBaseUrl ?? DEFAULT_PERPLEXITY_BASE_URL, model: params.perplexityModel ?? DEFAULT_PERPLEXITY_MODEL, timeoutSeconds: params.timeoutSeconds, + searchContextSize: params.perplexitySearchContextSize, + searchRecencyFilter: params.perplexitySearchRecencyFilter, + searchDomainFilter: params.perplexitySearchDomainFilter, + returnRelatedQuestions: params.perplexityReturnRelatedQuestions, + systemPrompt: params.perplexitySystemPrompt, }); - const payload = { + const searchParams: Record = { + searchContextSize: params.perplexitySearchContextSize ?? 
"high", + }; + if (params.perplexitySearchRecencyFilter) { + searchParams.searchRecencyFilter = params.perplexitySearchRecencyFilter; + } + if (params.perplexitySearchDomainFilter?.length) { + searchParams.searchDomainFilter = params.perplexitySearchDomainFilter; + } + + const payload: Record = { query: params.query, provider: params.provider, model: params.perplexityModel ?? DEFAULT_PERPLEXITY_MODEL, + searchParams, tookMs: Date.now() - start, content: wrapWebContent(content), citations, }; + if (relatedQuestions?.length) { + payload.relatedQuestions = relatedQuestions; + } writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); return payload; } @@ -608,7 +708,7 @@ export function createWebSearchTool(options?: { const description = provider === "perplexity" - ? "Search the web using Perplexity Sonar (direct or via OpenRouter). Returns AI-synthesized answers with citations from real-time web search." + ? "Search the web using Perplexity Sonar. Returns AI-synthesized answers with citations from real-time web search. Supports recency filter (hour/day/week/month/year) via the recency parameter." : provider === "grok" ? "Search the web using xAI Grok. Returns AI-synthesized answers with citations from real-time web search." : "Search the web using Brave Search API. Supports region-specific and localized search via country and language parameters. 
Returns titles, URLs, and snippets for fast research."; @@ -655,6 +755,21 @@ export function createWebSearchTool(options?: { docs: "https://docs.openclaw.ai/tools/web", }); } + const rawRecency = readStringParam(params, "recency"); + if (rawRecency && provider !== "perplexity") { + return jsonResult({ + error: "unsupported_recency", + message: "recency is only supported by the Perplexity web_search provider.", + docs: "https://docs.openclaw.ai/tools/web", + }); + } + if (rawRecency && !PERPLEXITY_RECENCY_VALUES.has(rawRecency.trim().toLowerCase())) { + return jsonResult({ + error: "invalid_recency", + message: "recency must be one of hour, day, week, month, year.", + docs: "https://docs.openclaw.ai/tools/web", + }); + } const result = await runWebSearch({ query, count: resolveSearchCount(count, DEFAULT_SEARCH_COUNT), @@ -672,6 +787,14 @@ export function createWebSearchTool(options?: { perplexityAuth?.apiKey, ), perplexityModel: resolvePerplexityModel(perplexityConfig), + perplexitySearchContextSize: resolvePerplexitySearchContextSize(perplexityConfig), + perplexitySearchRecencyFilter: resolvePerplexitySearchRecencyFilter( + perplexityConfig, + rawRecency ?? 
undefined, + ), + perplexitySearchDomainFilter: resolvePerplexitySearchDomainFilter(perplexityConfig), + perplexityReturnRelatedQuestions: resolvePerplexityReturnRelatedQuestions(perplexityConfig), + perplexitySystemPrompt: resolvePerplexitySystemPrompt(perplexityConfig), grokModel: resolveGrokModel(grokConfig), grokInlineCitations: resolveGrokInlineCitations(grokConfig), }); @@ -687,4 +810,5 @@ export const __testing = { resolveGrokApiKey, resolveGrokModel, resolveGrokInlineCitations, + resolvePerplexitySearchDomainFilter, } as const; diff --git a/src/commands/agents.config.ts b/src/commands/agents.config.ts index fe0907c04..0a316e67e 100644 --- a/src/commands/agents.config.ts +++ b/src/commands/agents.config.ts @@ -137,6 +137,7 @@ export function applyAgentConfig( workspace?: string; agentDir?: string; model?: string; + agentType?: "text" | "voice"; }, ): OpenClawConfig { const agentId = normalizeAgentId(params.agentId); @@ -150,6 +151,7 @@ export function applyAgentConfig( ...(params.workspace ? { workspace: params.workspace } : {}), ...(params.agentDir ? { agentDir: params.agentDir } : {}), ...(params.model ? { model: params.model } : {}), + ...(params.agentType ? 
{ agentType: params.agentType } : {}), }; const nextList = [...list]; if (index >= 0) { diff --git a/src/config/schema.ts b/src/config/schema.ts index 605c3b247..186234d78 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -486,6 +486,16 @@ const FIELD_HELP: Record = { "Perplexity base URL override (default: https://openrouter.ai/api/v1 or https://api.perplexity.ai).", "tools.web.search.perplexity.model": 'Perplexity model override (default: "perplexity/sonar-pro").', + "tools.web.search.perplexity.searchContextSize": + 'How much web content to retrieve: "low", "medium", or "high" (default: "high").', + "tools.web.search.perplexity.searchRecencyFilter": + 'Default recency filter: "hour", "day", "week", "month", or "year".', + "tools.web.search.perplexity.searchDomainFilter": + 'Domain allowlist/denylist array (prefix with "-" to exclude, e.g. ["-reddit.com"]).', + "tools.web.search.perplexity.returnRelatedQuestions": + "Return related follow-up questions from Perplexity (default: false).", + "tools.web.search.perplexity.systemPrompt": + "System prompt for Perplexity response formatting (does not affect search, only LLM output).", "tools.web.fetch.enabled": "Enable the web_fetch tool (lightweight HTTP fetch).", "tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).", "tools.web.fetch.maxCharsCap": diff --git a/src/config/types.agents.ts b/src/config/types.agents.ts index ad4fa7853..bbcb17cee 100644 --- a/src/config/types.agents.ts +++ b/src/config/types.agents.ts @@ -21,6 +21,7 @@ export type AgentModelConfig = export type AgentConfig = { id: string; default?: boolean; + agentType?: "text" | "voice"; name?: string; workspace?: string; agentDir?: string; diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index d5292e7c2..40a375dc3 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -354,6 +354,16 @@ export type ToolsConfig = { baseUrl?: string; /** Model to use (defaults to 
"perplexity/sonar-pro"). */ model?: string; + /** How much web content to retrieve: "low", "medium", or "high" (default: "high"). */ + searchContextSize?: "low" | "medium" | "high"; + /** Default recency filter: "hour", "day", "week", "month", or "year". */ + searchRecencyFilter?: "hour" | "day" | "week" | "month" | "year"; + /** Domain allowlist/denylist (prefix with "-" to exclude, e.g. ["-reddit.com"]). */ + searchDomainFilter?: string[]; + /** Return related follow-up questions (default: false). */ + returnRelatedQuestions?: boolean; + /** System prompt for response formatting (does NOT affect search, only LLM output). */ + systemPrompt?: string; }; /** Grok-specific configuration (used when provider="grok"). */ grok?: { diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 035c3b23b..199ff63c3 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -181,6 +181,21 @@ export const ToolsWebSearchSchema = z apiKey: z.string().optional(), baseUrl: z.string().optional(), model: z.string().optional(), + searchContextSize: z + .union([z.literal("low"), z.literal("medium"), z.literal("high")]) + .optional(), + searchRecencyFilter: z + .union([ + z.literal("hour"), + z.literal("day"), + z.literal("week"), + z.literal("month"), + z.literal("year"), + ]) + .optional(), + searchDomainFilter: z.array(z.string()).optional(), + returnRelatedQuestions: z.boolean().optional(), + systemPrompt: z.string().optional(), }) .strict() .optional(), @@ -441,6 +456,7 @@ export const AgentEntrySchema = z .object({ id: z.string(), default: z.boolean().optional(), + agentType: z.union([z.literal("text"), z.literal("voice")]).optional(), name: z.string().optional(), workspace: z.string().optional(), agentDir: z.string().optional(), diff --git a/src/gateway/protocol/schema/agents-models-skills.ts b/src/gateway/protocol/schema/agents-models-skills.ts index aaa886dd5..bf8b57a77 100644 --- 
a/src/gateway/protocol/schema/agents-models-skills.ts +++ b/src/gateway/protocol/schema/agents-models-skills.ts @@ -8,6 +8,7 @@ export const ModelChoiceSchema = Type.Object( provider: NonEmptyString, contextWindow: Type.Optional(Type.Integer({ minimum: 1 })), reasoning: Type.Optional(Type.Boolean()), + input: Type.Optional(Type.Array(Type.Union([Type.Literal("text"), Type.Literal("image")]))), }, { additionalProperties: false }, ); @@ -16,6 +17,7 @@ export const AgentSummarySchema = Type.Object( { id: NonEmptyString, name: Type.Optional(NonEmptyString), + agentType: Type.Optional(Type.Union([Type.Literal("text"), Type.Literal("voice")])), identity: Type.Optional( Type.Object( { @@ -50,6 +52,7 @@ export const AgentsCreateParamsSchema = Type.Object( workspace: NonEmptyString, emoji: Type.Optional(Type.String()), avatar: Type.Optional(Type.String()), + agentType: Type.Optional(Type.Union([Type.Literal("text"), Type.Literal("voice")])), }, { additionalProperties: false }, ); @@ -71,6 +74,7 @@ export const AgentsUpdateParamsSchema = Type.Object( workspace: Type.Optional(NonEmptyString), model: Type.Optional(NonEmptyString), avatar: Type.Optional(Type.String()), + agentType: Type.Optional(Type.Union([Type.Literal("text"), Type.Literal("voice")])), }, { additionalProperties: false }, ); @@ -164,7 +168,10 @@ export const AgentsFilesSetResultSchema = Type.Object( { additionalProperties: false }, ); -export const ModelsListParamsSchema = Type.Object({}, { additionalProperties: false }); +export const ModelsListParamsSchema = Type.Object( + { refresh: Type.Optional(Type.Boolean()) }, + { additionalProperties: false }, +); export const ModelsListResultSchema = Type.Object( { diff --git a/src/gateway/server-dictation.test.ts b/src/gateway/server-dictation.test.ts new file mode 100644 index 000000000..9291ca7d5 --- /dev/null +++ b/src/gateway/server-dictation.test.ts @@ -0,0 +1,19 @@ +import { describe, expect, it, vi } from "vitest"; + +// Mock the imports 
+vi.mock("./server-dictation.js", async () => { + const actual = await vi.importActual("./server-dictation.js"); + return actual; +}); + +describe("server-dictation", () => { + it("module exports createDictationUpgradeHandler", async () => { + const mod = await import("./server-dictation.js"); + expect(typeof mod.createDictationUpgradeHandler).toBe("function"); + }); + + it("module exports DICTATION_PATH constant", async () => { + const mod = await import("./server-dictation.js"); + expect(mod.DICTATION_PATH).toBe("/dictation/stream"); + }); +}); diff --git a/src/gateway/server-dictation.ts b/src/gateway/server-dictation.ts new file mode 100644 index 000000000..db2948b6a --- /dev/null +++ b/src/gateway/server-dictation.ts @@ -0,0 +1,223 @@ +import type { IncomingMessage } from "node:http"; +import type { Duplex } from "node:stream"; +import type { WebSocketServer } from "ws"; +import { WebSocket } from "ws"; +import type { createSubsystemLogger } from "../logging/subsystem.js"; +import { resolveEnvApiKey } from "../agents/model-auth.js"; + +type SubsystemLogger = ReturnType; + +const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v2/listen"; +const DICTATION_PATH = "/dictation/stream"; + +type DictationParams = { + model?: string; + language?: string; + sampleRate?: string; +}; + +function buildDeepgramUrl(params: DictationParams): string { + const url = new URL(DEEPGRAM_WS_URL); + url.searchParams.set("model", params.model || "flux-general-en"); + url.searchParams.set("encoding", "linear16"); + url.searchParams.set("sample_rate", params.sampleRate || "16000"); + // v2/listen only accepts: model, encoding, sample_rate, eot_threshold, eager_eot_threshold, eot_timeout_ms + // Do NOT set: interim_results, punctuate, smart_format, language (these are v1-only) + return url.toString(); +} + +function resolveDeepgramApiKey(): string | null { + const envResult = resolveEnvApiKey("deepgram"); + return envResult?.apiKey ?? 
null; +} + +function isJsonMessage(data: Buffer | ArrayBuffer | Buffer[]): boolean { + if (Buffer.isBuffer(data)) { + // 0x7b is ASCII code for '{' - indicates start of JSON + return data.length > 0 && data[0] === 0x7b; + } + if (data instanceof ArrayBuffer) { + const view = new Uint8Array(data); + return view.length > 0 && view[0] === 0x7b; + } + if (Array.isArray(data) && data.length > 0) { + return isJsonMessage(data[0]); + } + return false; +} + +export type DictationUpgradeHandler = ( + req: IncomingMessage, + socket: Duplex, + head: Buffer, + wss: WebSocketServer, +) => boolean; + +export function createDictationUpgradeHandler(opts: { + log: SubsystemLogger; +}): DictationUpgradeHandler { + const { log } = opts; + + return (req: IncomingMessage, socket: Duplex, head: Buffer, wss: WebSocketServer): boolean => { + const reqUrl = new URL(req.url ?? "/", "http://localhost"); + if (reqUrl.pathname !== DICTATION_PATH) { + return false; + } + + const apiKey = resolveDeepgramApiKey(); + if (!apiKey) { + log.warn("dictation: DEEPGRAM_API_KEY not configured"); + socket.write("HTTP/1.1 503 Service Unavailable\r\n\r\n"); + socket.destroy(); + return true; + } + + wss.handleUpgrade(req, socket, head, (clientWs) => { + log.info("dictation: client connected"); + + const params: DictationParams = { + model: reqUrl.searchParams.get("model") ?? undefined, + language: reqUrl.searchParams.get("language") ?? undefined, + sampleRate: reqUrl.searchParams.get("sample_rate") ?? 
undefined, + }; + + const deepgramUrl = buildDeepgramUrl(params); + log.info(`dictation: connecting to ${deepgramUrl}`); + const deepgramWs = new WebSocket(deepgramUrl, { + headers: { + Authorization: `Token ${apiKey}`, + }, + }); + + let deepgramReady = false; + const pendingAudio: Buffer[] = []; + let pendingBytes = 0; + const MAX_PENDING_BYTES = 256 * 1024; // 256 KB (~8s of 16kHz mono PCM) + const CONNECT_TIMEOUT_MS = 10_000; + + // Close client if Deepgram doesn't connect in time + const connectTimer = setTimeout(() => { + if (!deepgramReady) { + log.error("dictation: Deepgram connection timeout"); + if (clientWs.readyState === WebSocket.OPEN) { + clientWs.send( + JSON.stringify({ type: "Error", message: "Transcription service timeout" }), + ); + clientWs.close(1011, "Deepgram connection timeout"); + } + deepgramWs.close(); + } + }, CONNECT_TIMEOUT_MS); + + deepgramWs.on("open", () => { + clearTimeout(connectTimer); + log.info("dictation: connected to Deepgram"); + deepgramReady = true; + // Flush any pending audio + for (const chunk of pendingAudio) { + deepgramWs.send(chunk); + } + pendingAudio.length = 0; + pendingBytes = 0; + }); + + deepgramWs.on("message", (data: Buffer | ArrayBuffer | Buffer[]) => { + // Forward Deepgram responses to client + if (clientWs.readyState === WebSocket.OPEN) { + const message = Buffer.isBuffer(data) + ? data.toString("utf8") + : Array.isArray(data) + ? Buffer.concat(data).toString("utf8") + : Buffer.from(data).toString("utf8"); + clientWs.send(message); + } + }); + + deepgramWs.on("unexpected-response", (_req, res) => { + let body = ""; + res.on("data", (chunk: Buffer) => { + body += chunk.toString(); + }); + res.on("end", () => { + log.error(`dictation: Deepgram rejected (${res.statusCode}): ${body}`); + }); + }); + + deepgramWs.on("error", (err: Error & { code?: string }) => { + clearTimeout(connectTimer); + log.error(`dictation: Deepgram error: ${err.message} (code: ${err.code ?? 
"none"})`); + if (clientWs.readyState === WebSocket.OPEN) { + clientWs.send(JSON.stringify({ type: "Error", message: "Transcription service error" })); + clientWs.close(1011, "Deepgram error"); + } + }); + + deepgramWs.on("close", (code, reason) => { + clearTimeout(connectTimer); + log.info(`dictation: Deepgram closed (${code}): ${reason.toString()}`); + if (clientWs.readyState === WebSocket.OPEN) { + clientWs.close(1000, "Deepgram closed"); + } + }); + + clientWs.on("message", (data: Buffer | ArrayBuffer | Buffer[]) => { + // Handle control messages (JSON starting with '{') + if (isJsonMessage(data)) { + try { + const dataStr = Buffer.isBuffer(data) + ? data.toString() + : data instanceof ArrayBuffer + ? Buffer.from(data).toString() + : Buffer.concat(data).toString(); + const msg = JSON.parse(dataStr) as { type?: string }; + if (msg.type === "CloseStream" || msg.type === "Finalize") { + if (deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(JSON.stringify(msg)); + } + return; + } + if (msg.type === "KeepAlive") { + if (deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(JSON.stringify(msg)); + } + return; + } + } catch { + // Not valid JSON, treat as audio + } + } + + // Forward audio data to Deepgram + const audioData = Buffer.isBuffer(data) + ? data + : data instanceof ArrayBuffer + ? 
Buffer.from(data) + : Buffer.concat(data); + if (deepgramReady && deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(audioData); + } else if (pendingBytes < MAX_PENDING_BYTES) { + // Buffer audio until Deepgram is ready (capped to prevent memory growth) + pendingAudio.push(audioData); + pendingBytes += audioData.length; + } + }); + + clientWs.on("close", () => { + log.info("dictation: client disconnected"); + if (deepgramWs.readyState === WebSocket.OPEN) { + deepgramWs.send(JSON.stringify({ type: "CloseStream" })); + deepgramWs.close(); + } + }); + + clientWs.on("error", (err) => { + log.error(`dictation: client error: ${err.message}`); + deepgramWs.close(); + }); + }); + + return true; + }; +} + +export { DICTATION_PATH }; diff --git a/src/gateway/server-http.ts b/src/gateway/server-http.ts index 66a6f725a..182e00457 100644 --- a/src/gateway/server-http.ts +++ b/src/gateway/server-http.ts @@ -9,6 +9,7 @@ import { import { createServer as createHttpsServer } from "node:https"; import type { CanvasHostHandler } from "../canvas-host/server.js"; import type { createSubsystemLogger } from "../logging/subsystem.js"; +import type { DictationUpgradeHandler } from "./server-dictation.js"; import type { GatewayWsClient } from "./server/ws-types.js"; import { resolveAgentAvatar } from "../agents/identity-avatar.js"; import { @@ -415,8 +416,9 @@ export function attachGatewayUpgradeHandler(opts: { canvasHost: CanvasHostHandler | null; clients: Set; resolvedAuth: ResolvedGatewayAuth; + dictationHandler?: DictationUpgradeHandler; }) { - const { httpServer, wss, canvasHost, clients, resolvedAuth } = opts; + const { httpServer, wss, canvasHost, clients, resolvedAuth, dictationHandler } = opts; httpServer.on("upgrade", (req, socket, head) => { void (async () => { if (canvasHost) { @@ -440,6 +442,10 @@ export function attachGatewayUpgradeHandler(opts: { return; } } + // Check if this is a dictation WebSocket request + if (dictationHandler?.(req, socket, head, wss)) { + 
return; + } wss.handleUpgrade(req, socket, head, (ws) => { wss.emit("connection", ws, req); }); diff --git a/src/gateway/server-methods/agents.ts b/src/gateway/server-methods/agents.ts index d0f3589d3..a521aadac 100644 --- a/src/gateway/server-methods/agents.ts +++ b/src/gateway/server-methods/agents.ts @@ -5,6 +5,7 @@ import { listAgentIds, resolveAgentDir, resolveAgentWorkspaceDir, + resolveDefaultAgentId, } from "../../agents/agent-scope.js"; import { DEFAULT_AGENTS_FILENAME, @@ -219,6 +220,7 @@ export const agentsHandlers: GatewayRequestHandlers = { } const workspaceDir = resolveUserPath(String(params.workspace ?? "").trim()); + const agentType = params.agentType === "voice" ? ("voice" as const) : undefined; // Resolve agentDir against the config we're about to persist (vs the pre-write config), // so subsequent resolutions can't disagree about the agent's directory. @@ -226,6 +228,7 @@ export const agentsHandlers: GatewayRequestHandlers = { agentId, name: rawName, workspace: workspaceDir, + ...(agentType ? { agentType } : {}), }); const agentDir = resolveAgentDir(nextConfig, agentId); nextConfig = applyAgentConfig(nextConfig, { agentId, agentDir }); @@ -236,6 +239,28 @@ export const agentsHandlers: GatewayRequestHandlers = { await ensureAgentWorkspace({ dir: workspaceDir, ensureBootstrapFiles: !skipBootstrap }); await fs.mkdir(resolveSessionTranscriptsDirForAgent(agentId), { recursive: true }); + // Symlink USER.md from the default agent's workspace so new agents inherit + // the user profile automatically. If the user later needs per-agent overrides, + // they can replace the symlink with a regular file. + // TODO: consider a fallback-chain approach instead (read default agent's USER.md + // at runtime if the agent's own copy is empty/missing) for more flexibility. 
+ try { + const defaultAgentId = normalizeAgentId(resolveDefaultAgentId(cfg)); + if (agentId !== defaultAgentId) { + const defaultWorkspace = resolveAgentWorkspaceDir(cfg, defaultAgentId); + const defaultUserPath = path.join(defaultWorkspace, DEFAULT_USER_FILENAME); + const newUserPath = path.join(workspaceDir, DEFAULT_USER_FILENAME); + const defaultUserStat = await fs.stat(defaultUserPath).catch(() => null); + if (defaultUserStat?.isFile()) { + // Remove the blank bootstrap USER.md and replace with a symlink. + await fs.unlink(newUserPath).catch(() => {}); + await fs.symlink(defaultUserPath, newUserPath); + } + } + } catch { + // Non-fatal: agent still works, just won't have inherited user profile. + } + await writeConfigFile(nextConfig); // Always write Name to IDENTITY.md; optionally include emoji/avatar. @@ -287,6 +312,8 @@ export const agentsHandlers: GatewayRequestHandlers = { const model = resolveOptionalStringParam(params.model); const avatar = resolveOptionalStringParam(params.avatar); + const agentType = + params.agentType === "text" || params.agentType === "voice" ? params.agentType : undefined; const nextConfig = applyAgentConfig(cfg, { agentId, @@ -295,6 +322,7 @@ export const agentsHandlers: GatewayRequestHandlers = { : {}), ...(workspaceDir ? { workspace: workspaceDir } : {}), ...(model ? { model } : {}), + ...(agentType ? 
{ agentType } : {}), }); await writeConfigFile(nextConfig); diff --git a/src/gateway/server-methods/config.ts b/src/gateway/server-methods/config.ts index 05a534454..aad598c21 100644 --- a/src/gateway/server-methods/config.ts +++ b/src/gateway/server-methods/config.ts @@ -179,38 +179,36 @@ export const configHandlers: GatewayRequestHandlers = { respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, parsedRes.error)); return; } - const validated = validateConfigObjectWithPlugins(parsedRes.parsed); - if (!validated.ok) { + let restoredParsed: unknown; + try { + restoredParsed = restoreRedactedValues(parsedRes.parsed, snapshot.config); + } catch (err) { respond( false, undefined, - errorShape(ErrorCodes.INVALID_REQUEST, "invalid config", { - details: { issues: validated.issues }, - }), + errorShape(ErrorCodes.INVALID_REQUEST, String(err instanceof Error ? err.message : err)), ); return; } - let restored: typeof validated.config; - try { - restored = restoreRedactedValues( - validated.config, - snapshot.config, - ) as typeof validated.config; - } catch (err) { + const validated = validateConfigObjectWithPlugins(restoredParsed); + if (!validated.ok) { + console.error("[config.set] validation failed:", JSON.stringify(validated.issues, null, 2)); respond( false, undefined, - errorShape(ErrorCodes.INVALID_REQUEST, String(err instanceof Error ? 
err.message : err)), + errorShape(ErrorCodes.INVALID_REQUEST, "invalid config", { + details: { issues: validated.issues }, + }), ); return; } - await writeConfigFile(restored); + await writeConfigFile(validated.config); respond( true, { ok: true, path: CONFIG_PATH, - config: redactConfigObject(restored), + config: redactConfigObject(validated.config), }, undefined, ); diff --git a/src/gateway/server-methods/models.ts b/src/gateway/server-methods/models.ts index 68eca48a1..2f55d4405 100644 --- a/src/gateway/server-methods/models.ts +++ b/src/gateway/server-methods/models.ts @@ -20,7 +20,8 @@ export const modelsHandlers: GatewayRequestHandlers = { return; } try { - const models = await context.loadGatewayModelCatalog(); + const refresh = (params as { refresh?: boolean }).refresh === true; + const models = await context.loadGatewayModelCatalog({ useCache: !refresh }); respond(true, { models }, undefined); } catch (err) { respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, String(err))); diff --git a/src/gateway/server-methods/types.ts b/src/gateway/server-methods/types.ts index aa26b232f..7d7315b0e 100644 --- a/src/gateway/server-methods/types.ts +++ b/src/gateway/server-methods/types.ts @@ -28,7 +28,7 @@ export type GatewayRequestContext = { deps: ReturnType; cron: CronService; cronStorePath: string; - loadGatewayModelCatalog: () => Promise; + loadGatewayModelCatalog: (opts?: { useCache?: boolean }) => Promise; getHealthCache: () => HealthSummary | null; refreshHealthSnapshot: (opts?: { probe?: boolean }) => Promise; logHealth: { error: (message: string) => void }; diff --git a/src/gateway/server-model-catalog.ts b/src/gateway/server-model-catalog.ts index 7f72fbc4e..5a26c5b36 100644 --- a/src/gateway/server-model-catalog.ts +++ b/src/gateway/server-model-catalog.ts @@ -14,6 +14,8 @@ export function __resetModelCatalogCacheForTest() { resetModelCatalogCacheForTest(); } -export async function loadGatewayModelCatalog(): Promise { - return await 
loadModelCatalog({ config: loadConfig() }); +export async function loadGatewayModelCatalog(opts?: { + useCache?: boolean; +}): Promise { + return await loadModelCatalog({ config: loadConfig(), useCache: opts?.useCache }); } diff --git a/src/gateway/server-runtime-state.ts b/src/gateway/server-runtime-state.ts index 0312fc2e1..6faa4c2c6 100644 --- a/src/gateway/server-runtime-state.ts +++ b/src/gateway/server-runtime-state.ts @@ -21,6 +21,7 @@ import { createToolEventRecipientRegistry, } from "./server-chat.js"; import { MAX_PAYLOAD_BYTES } from "./server-constants.js"; +import { createDictationUpgradeHandler } from "./server-dictation.js"; import { attachGatewayUpgradeHandler, createGatewayHttpServer } from "./server-http.js"; import { createGatewayHooksRequestHandler } from "./server/hooks.js"; import { listenGatewayHttpServer } from "./server/http-listen.js"; @@ -48,6 +49,7 @@ export async function createGatewayRuntimeState(params: { log: { info: (msg: string) => void; warn: (msg: string) => void }; logHooks: ReturnType; logPlugins: ReturnType; + logDictation: ReturnType; }): Promise<{ canvasHost: CanvasHostHandler | null; httpServer: HttpServer; @@ -167,6 +169,7 @@ export async function createGatewayRuntimeState(params: { noServer: true, maxPayload: MAX_PAYLOAD_BYTES, }); + const dictationHandler = createDictationUpgradeHandler({ log: params.logDictation }); for (const server of httpServers) { attachGatewayUpgradeHandler({ httpServer: server, @@ -174,6 +177,7 @@ export async function createGatewayRuntimeState(params: { canvasHost, clients, resolvedAuth: params.resolvedAuth, + dictationHandler, }); } diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index d46a38ef3..4b32cb76f 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -84,6 +84,7 @@ ensureOpenClawCliOnPath(); const log = createSubsystemLogger("gateway"); const logCanvas = log.child("canvas"); +const logDictation = log.child("dictation"); const logDiscovery = 
log.child("discovery"); const logTailscale = log.child("tailscale"); const logChannels = log.child("channels"); @@ -350,6 +351,7 @@ export async function startGatewayServer( log, logHooks, logPlugins, + logDictation, }); let bonjourStop: (() => Promise) | null = null; const nodeRegistry = new NodeRegistry(); diff --git a/src/gateway/server/ws-connection/message-handler.ts b/src/gateway/server/ws-connection/message-handler.ts index 89bd9531f..57324f69c 100644 --- a/src/gateway/server/ws-connection/message-handler.ts +++ b/src/gateway/server/ws-connection/message-handler.ts @@ -5,6 +5,7 @@ import type { createSubsystemLogger } from "../../../logging/subsystem.js"; import type { ResolvedGatewayAuth } from "../../auth.js"; import type { GatewayRequestContext, GatewayRequestHandlers } from "../../server-methods/types.js"; import type { GatewayWsClient } from "../ws-types.js"; +import { resolveEnvApiKey } from "../../../agents/model-auth.js"; import { loadConfig } from "../../../config/config.js"; import { deriveDeviceIdFromPublicKey, @@ -849,6 +850,8 @@ export function attachGatewayWsMessageHandler(params: { snapshot.health = cachedHealth; snapshot.stateVersion.health = getHealthVersion(); } + const deepgramKey = resolveEnvApiKey("deepgram"); + const dictationEnabled = Boolean(deepgramKey?.apiKey?.trim()); const helloOk = { type: "hello-ok", protocol: PROTOCOL_VERSION, @@ -858,7 +861,7 @@ export function attachGatewayWsMessageHandler(params: { host: os.hostname(), connId, }, - features: { methods: gatewayMethods, events }, + features: { methods: gatewayMethods, events, dictation: dictationEnabled }, snapshot, canvasHostUrl, auth: deviceToken diff --git a/src/gateway/session-utils.ts b/src/gateway/session-utils.ts index bbbbc575e..7dc182935 100644 --- a/src/gateway/session-utils.ts +++ b/src/gateway/session-utils.ts @@ -290,7 +290,7 @@ export function listAgentsForGateway(cfg: OpenClawConfig): { const scope = cfg.session?.scope ?? 
"per-sender"; const configuredById = new Map< string, - { name?: string; identity?: GatewayAgentRow["identity"] } + { name?: string; agentType?: "text" | "voice"; identity?: GatewayAgentRow["identity"] } >(); for (const entry of cfg.agents?.list ?? []) { if (!entry?.id) { @@ -311,6 +311,7 @@ export function listAgentsForGateway(cfg: OpenClawConfig): { : undefined; configuredById.set(normalizeAgentId(entry.id), { name: typeof entry.name === "string" && entry.name.trim() ? entry.name.trim() : undefined, + agentType: entry.agentType, identity, }); } @@ -328,15 +329,33 @@ export function listAgentsForGateway(cfg: OpenClawConfig): { } const agents = agentIds.map((id) => { const meta = configuredById.get(id); + const agentType = meta?.agentType ?? inferAgentTypeFromIdentity(cfg, id); return { id, name: meta?.name, + ...(agentType ? { agentType } : {}), identity: meta?.identity, }; }); return { defaultId, mainKey, scope, agents }; } +/** + * Infer agentType from IDENTITY.md for agents that predate the agentType config field. + * Returns "voice" if a `Voice:` line is found, otherwise undefined (treated as "text"). + */ +function inferAgentTypeFromIdentity(cfg: OpenClawConfig, agentId: string): "voice" | undefined { + try { + const workspace = resolveAgentWorkspaceDir(cfg, agentId); + const content = fs.readFileSync(path.join(workspace, "IDENTITY.md"), "utf-8"); + return /^\s*-?\s*(?:\*{1,2}|_{1,2})?voice(?:\*{1,2}|_{1,2})?\s*:/im.test(content) + ? 
"voice" + : undefined; + } catch { + return undefined; + } +} + function canonicalizeSessionKeyForAgent(agentId: string, key: string): string { if (key === "global" || key === "unknown") { return key; diff --git a/src/gateway/session-utils.types.ts b/src/gateway/session-utils.types.ts index a7939bd1e..06ce86f30 100644 --- a/src/gateway/session-utils.types.ts +++ b/src/gateway/session-utils.types.ts @@ -46,6 +46,7 @@ export type GatewaySessionRow = { export type GatewayAgentRow = { id: string; name?: string; + agentType?: "text" | "voice"; identity?: { name?: string; theme?: string; diff --git a/ui/index.html b/ui/index.html index dc03f4911..aac635bb0 100644 --- a/ui/index.html +++ b/ui/index.html @@ -2,12 +2,16 @@ - - OpenClaw Control + + DeepClaw Control + + + + diff --git a/ui/public/apple-touch-icon.png b/ui/public/apple-touch-icon.png index 71781843f..9490c0ea2 100644 Binary files a/ui/public/apple-touch-icon.png and b/ui/public/apple-touch-icon.png differ diff --git a/ui/public/favicon-32.png b/ui/public/favicon-32.png index 563c79b0e..38ae998f6 100644 Binary files a/ui/public/favicon-32.png and b/ui/public/favicon-32.png differ diff --git a/ui/public/favicon.ico b/ui/public/favicon.ico index ec5665f56..12626bfa5 100644 Binary files a/ui/public/favicon.ico and b/ui/public/favicon.ico differ diff --git a/ui/public/favicon.svg b/ui/public/favicon.svg index bcbc1e10c..686701f40 100644 --- a/ui/public/favicon.svg +++ b/ui/public/favicon.svg @@ -1,22 +1,3 @@ - - - - - - - - - - - - - - - - - - - - - + + diff --git a/ui/public/fonts/Roobert-Bold.otf b/ui/public/fonts/Roobert-Bold.otf new file mode 100644 index 000000000..0bfb61923 Binary files /dev/null and b/ui/public/fonts/Roobert-Bold.otf differ diff --git a/ui/public/fonts/Roobert-BoldItalic.otf b/ui/public/fonts/Roobert-BoldItalic.otf new file mode 100644 index 000000000..1e6f1746b Binary files /dev/null and b/ui/public/fonts/Roobert-BoldItalic.otf differ diff --git a/ui/public/fonts/Roobert-Heavy.otf 
b/ui/public/fonts/Roobert-Heavy.otf new file mode 100644 index 000000000..802fb2647 Binary files /dev/null and b/ui/public/fonts/Roobert-Heavy.otf differ diff --git a/ui/public/fonts/Roobert-HeavyItalic.otf b/ui/public/fonts/Roobert-HeavyItalic.otf new file mode 100644 index 000000000..4dfb780d3 Binary files /dev/null and b/ui/public/fonts/Roobert-HeavyItalic.otf differ diff --git a/ui/public/fonts/Roobert-Light.otf b/ui/public/fonts/Roobert-Light.otf new file mode 100644 index 000000000..9296ba7b7 Binary files /dev/null and b/ui/public/fonts/Roobert-Light.otf differ diff --git a/ui/public/fonts/Roobert-LightItalic.otf b/ui/public/fonts/Roobert-LightItalic.otf new file mode 100644 index 000000000..ab6b780bb Binary files /dev/null and b/ui/public/fonts/Roobert-LightItalic.otf differ diff --git a/ui/public/fonts/Roobert-Medium.otf b/ui/public/fonts/Roobert-Medium.otf new file mode 100644 index 000000000..6b94852ae Binary files /dev/null and b/ui/public/fonts/Roobert-Medium.otf differ diff --git a/ui/public/fonts/Roobert-MediumItalic.otf b/ui/public/fonts/Roobert-MediumItalic.otf new file mode 100644 index 000000000..00b864342 Binary files /dev/null and b/ui/public/fonts/Roobert-MediumItalic.otf differ diff --git a/ui/public/fonts/Roobert-Regular.otf b/ui/public/fonts/Roobert-Regular.otf new file mode 100644 index 000000000..afdcdeb9f Binary files /dev/null and b/ui/public/fonts/Roobert-Regular.otf differ diff --git a/ui/public/fonts/Roobert-RegularItalic.otf b/ui/public/fonts/Roobert-RegularItalic.otf new file mode 100644 index 000000000..8349e5826 Binary files /dev/null and b/ui/public/fonts/Roobert-RegularItalic.otf differ diff --git a/ui/public/fonts/Roobert-SemiBold.otf b/ui/public/fonts/Roobert-SemiBold.otf new file mode 100644 index 000000000..e7eb49615 Binary files /dev/null and b/ui/public/fonts/Roobert-SemiBold.otf differ diff --git a/ui/public/fonts/Roobert-SemiBoldItalic.otf b/ui/public/fonts/Roobert-SemiBoldItalic.otf new file mode 100644 index 
000000000..d12c3f7af Binary files /dev/null and b/ui/public/fonts/Roobert-SemiBoldItalic.otf differ diff --git a/ui/public/icon-192.png b/ui/public/icon-192.png new file mode 100644 index 000000000..287c4e398 Binary files /dev/null and b/ui/public/icon-192.png differ diff --git a/ui/public/icon-512.png b/ui/public/icon-512.png new file mode 100644 index 000000000..1ac8f2899 Binary files /dev/null and b/ui/public/icon-512.png differ diff --git a/ui/public/manifest.json b/ui/public/manifest.json new file mode 100644 index 000000000..23bce2eb0 --- /dev/null +++ b/ui/public/manifest.json @@ -0,0 +1,15 @@ +{ + "name": "DeepClaw Control", + "short_name": "DeepClaw", + "description": "DeepClaw Gateway Dashboard", + "start_url": ".", + "scope": ".", + "display": "standalone", + "background_color": "#0a0a0f", + "theme_color": "#0a0a0f", + "icons": [ + { "src": "favicon.svg", "sizes": "any", "type": "image/svg+xml" }, + { "src": "icon-192.png", "sizes": "192x192", "type": "image/png", "purpose": "any maskable" }, + { "src": "icon-512.png", "sizes": "512x512", "type": "image/png", "purpose": "any maskable" } + ] +} diff --git a/ui/public/sw.js b/ui/public/sw.js new file mode 100644 index 000000000..40dd139fa --- /dev/null +++ b/ui/public/sw.js @@ -0,0 +1,64 @@ +const CACHE_VERSION = "deepclaw-v1"; +const SHELL_ASSETS = ["./", "./manifest.json", "./favicon.svg", "./icon-192.png", "./icon-512.png"]; + +self.addEventListener("install", (event) => { + event.waitUntil(caches.open(CACHE_VERSION).then((cache) => cache.addAll(SHELL_ASSETS))); + self.skipWaiting(); +}); + +self.addEventListener("activate", (event) => { + event.waitUntil( + caches + .keys() + .then((keys) => + Promise.all(keys.filter((k) => k !== CACHE_VERSION).map((k) => caches.delete(k))), + ), + ); + self.clients.claim(); +}); + +self.addEventListener("fetch", (event) => { + const url = new URL(event.request.url); + + // Skip non-GET, WebSocket, RPC, and API requests + if ( + event.request.method !== "GET" || + 
url.protocol === "ws:" || + url.protocol === "wss:" || + url.pathname.includes("/rpc") + ) { + return; + } + + // Static assets: cache-first + if (/\.(js|css|otf|svg|png|ico|woff2?)$/i.test(url.pathname)) { + event.respondWith( + caches.match(event.request).then( + (cached) => + cached || + fetch(event.request).then((response) => { + if (response.ok) { + const clone = response.clone(); + void caches.open(CACHE_VERSION).then((cache) => cache.put(event.request, clone)); + } + return response; + }), + ), + ); + return; + } + + // Navigation: network-first with cache fallback + if (event.request.mode === "navigate") { + event.respondWith( + fetch(event.request) + .then((response) => { + const clone = response.clone(); + void caches.open(CACHE_VERSION).then((cache) => cache.put(event.request, clone)); + return response; + }) + .catch(() => caches.match(event.request)), + ); + return; + } +}); diff --git a/ui/src/main.ts b/ui/src/main.ts index 9374bb20e..735fe9b8f 100644 --- a/ui/src/main.ts +++ b/ui/src/main.ts @@ -1,2 +1,8 @@ import "./styles.css"; import "./ui/app.ts"; + +if ("serviceWorker" in navigator) { + window.addEventListener("load", () => { + navigator.serviceWorker.register("./sw.js", { scope: "./" }).catch(() => {}); + }); +} diff --git a/ui/src/styles.css b/ui/src/styles.css index 16b327f3a..3e7f190c3 100644 --- a/ui/src/styles.css +++ b/ui/src/styles.css @@ -1,5 +1,6 @@ @import "./styles/base.css"; @import "./styles/layout.css"; @import "./styles/layout.mobile.css"; +@import "./styles/layout.mobile-tab-bar.css"; @import "./styles/components.css"; @import "./styles/config.css"; diff --git a/ui/src/styles/base.css b/ui/src/styles/base.css index b83afd32c..f713c4ee9 100644 --- a/ui/src/styles/base.css +++ b/ui/src/styles/base.css @@ -1,75 +1,165 @@ -@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap"); +/* Roobert font family */ +@font-face { + font-family: "Roobert"; + src: 
url("/fonts/Roobert-Light.otf") format("opentype"); + font-weight: 300; + font-style: normal; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-LightItalic.otf") format("opentype"); + font-weight: 300; + font-style: italic; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-Regular.otf") format("opentype"); + font-weight: 400; + font-style: normal; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-RegularItalic.otf") format("opentype"); + font-weight: 400; + font-style: italic; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-Medium.otf") format("opentype"); + font-weight: 500; + font-style: normal; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-MediumItalic.otf") format("opentype"); + font-weight: 500; + font-style: italic; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-SemiBold.otf") format("opentype"); + font-weight: 600; + font-style: normal; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-SemiBoldItalic.otf") format("opentype"); + font-weight: 600; + font-style: italic; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-Bold.otf") format("opentype"); + font-weight: 700; + font-style: normal; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-BoldItalic.otf") format("opentype"); + font-weight: 700; + font-style: italic; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-Heavy.otf") format("opentype"); + font-weight: 800; + font-style: normal; + font-display: swap; +} +@font-face { + font-family: "Roobert"; + src: url("/fonts/Roobert-HeavyItalic.otf") format("opentype"); + font-weight: 800; + font-style: italic; + font-display: swap; +} + 
+@import url("https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap"); :root { - /* Background - Warmer dark with depth */ - --bg: #12141a; - --bg-accent: #14161d; - --bg-elevated: #1a1d25; - --bg-hover: #262a35; - --bg-muted: #262a35; - - /* Card / Surface - More contrast between levels */ - --card: #181b22; - --card-foreground: #f4f4f5; + /* Background - Deepgram dark grays */ + --bg: #101014; + --bg-accent: #1a1a1f; + --bg-elevated: #232329; + --bg-hover: #2c2c33; + --bg-muted: #2c2c33; + + /* Card / Surface */ + --card: #1a1a1f; + --card-foreground: #fbfbff; --card-highlight: rgba(255, 255, 255, 0.05); - --popover: #181b22; - --popover-foreground: #f4f4f5; + --popover: #1a1a1f; + --popover-foreground: #fbfbff; /* Panel */ - --panel: #12141a; - --panel-strong: #1a1d25; - --panel-hover: #262a35; - --chrome: rgba(18, 20, 26, 0.95); - --chrome-strong: rgba(18, 20, 26, 0.98); - - /* Text - Slightly warmer */ - --text: #e4e4e7; - --text-strong: #fafafa; - --chat-text: #e4e4e7; - --muted: #71717a; - --muted-strong: #52525b; - --muted-foreground: #71717a; - - /* Border - Subtle but defined */ - --border: #27272a; - --border-strong: #3f3f46; - --border-hover: #52525b; - --input: #27272a; - --ring: #ff5c5c; - - /* Accent - Punchy signature red */ - --accent: #ff5c5c; - --accent-hover: #ff7070; - --accent-muted: #ff5c5c; - --accent-subtle: rgba(255, 92, 92, 0.15); - --accent-foreground: #fafafa; - --accent-glow: rgba(255, 92, 92, 0.25); - --primary: #ff5c5c; - --primary-foreground: #ffffff; - - /* Secondary - Teal accent for variety */ - --secondary: #1e2028; - --secondary-foreground: #f4f4f5; - --accent-2: #14b8a6; - --accent-2-muted: rgba(20, 184, 166, 0.7); - --accent-2-subtle: rgba(20, 184, 166, 0.15); - - /* Semantic - More saturated */ - --ok: #22c55e; - --ok-muted: rgba(34, 197, 94, 0.75); - --ok-subtle: rgba(34, 197, 94, 0.12); - --destructive: #ef4444; - --destructive-foreground: #fafafa; - --warn: #f59e0b; - --warn-muted: rgba(245, 
158, 11, 0.75); - --warn-subtle: rgba(245, 158, 11, 0.12); - --danger: #ef4444; - --danger-muted: rgba(239, 68, 68, 0.75); - --danger-subtle: rgba(239, 68, 68, 0.12); - --info: #3b82f6; + --panel: #101014; + --panel-strong: #1a1a1f; + --panel-hover: #2c2c33; + --chrome: rgba(16, 16, 20, 0.95); + --chrome-strong: rgba(16, 16, 20, 0.98); + + /* Text - Deepgram grays */ + --text: #e1e1e5; + --text-strong: #fbfbff; + --chat-text: #e1e1e5; + --muted: #949498; + --muted-strong: #4e4e52; + --muted-foreground: #949498; + + /* Border */ + --border: #2c2c33; + --border-strong: #4e4e52; + --border-hover: #949498; + --input: #2c2c33; + --ring: #13ef93; + + /* Accent - Deepgram Spring Green */ + --accent: #13ef93; + --accent-hover: #a1f9d4; + --accent-muted: #13ef93; + --accent-subtle: rgba(19, 239, 147, 0.15); + --accent-foreground: #0b0b0c; + --accent-glow: rgba(19, 239, 147, 0.25); + --primary: #13ef93; + --primary-foreground: #0b0b0c; + + /* Secondary - Deepgram Blue */ + --secondary: #1a1a1f; + --secondary-foreground: #fbfbff; + --accent-2: #149afb; + --accent-2-muted: rgba(20, 154, 251, 0.7); + --accent-2-subtle: rgba(20, 154, 251, 0.15); + + /* Semantic colors */ + --ok: #12b76a; + --ok-muted: rgba(18, 183, 106, 0.75); + --ok-subtle: rgba(18, 183, 106, 0.12); + --destructive: #f04438; + --destructive-foreground: #fbfbff; + --warn: #fec84b; + --warn-muted: rgba(254, 200, 75, 0.75); + --warn-subtle: rgba(254, 200, 75, 0.12); + --danger: #f04438; + --danger-muted: rgba(240, 68, 56, 0.75); + --danger-subtle: rgba(240, 68, 56, 0.12); + --info: #149afb; + + /* Additional accent colors */ + --pink: #ee028c; + --purple: #ae63f9; /* Focus - With glow */ - --focus: rgba(255, 92, 92, 0.25); + --focus: rgba(19, 239, 147, 0.25); --focus-ring: 0 0 0 2px var(--bg), 0 0 0 4px var(--ring); --focus-glow: 0 0 0 2px var(--bg), 0 0 0 4px var(--ring), 0 0 20px var(--accent-glow); @@ -80,21 +170,20 @@ --theme-switch-x: 50%; --theme-switch-y: 50%; - /* Typography - Space Grotesk for personality 
*/ + /* Typography - Roobert for brand, JetBrains Mono for code */ --mono: "JetBrains Mono", ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace; - --font-body: "Space Grotesk", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; - --font-display: - "Space Grotesk", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + --font-body: "Roobert", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + --font-display: "Roobert", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; - /* Shadows - Richer with subtle color */ + /* Shadows */ --shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.2); --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.25), 0 0 0 1px rgba(255, 255, 255, 0.03); --shadow-lg: 0 12px 28px rgba(0, 0, 0, 0.35), 0 0 0 1px rgba(255, 255, 255, 0.03); --shadow-xl: 0 24px 48px rgba(0, 0, 0, 0.4), 0 0 0 1px rgba(255, 255, 255, 0.03); --shadow-glow: 0 0 30px var(--accent-glow); - /* Radii - Slightly larger for friendlier feel */ + /* Radii */ --radius-sm: 6px; --radius-md: 8px; --radius-lg: 12px; @@ -102,7 +191,7 @@ --radius-full: 9999px; --radius: 8px; - /* Transitions - Snappy but smooth */ + /* Transitions */ --ease-out: cubic-bezier(0.16, 1, 0.3, 1); --ease-in-out: cubic-bezier(0.4, 0, 0.2, 1); --ease-spring: cubic-bezier(0.34, 1.56, 0.64, 1); @@ -113,68 +202,68 @@ color-scheme: dark; } -/* Light theme - Clean with subtle warmth */ +/* Light theme - Deepgram light palette */ :root[data-theme="light"] { - --bg: #fafafa; - --bg-accent: #f5f5f5; - --bg-elevated: #ffffff; - --bg-hover: #f0f0f0; - --bg-muted: #f0f0f0; - --bg-content: #f5f5f5; - - --card: #ffffff; - --card-foreground: #18181b; + --bg: #fbfbff; + --bg-accent: #ededf2; + --bg-elevated: #fbfbff; + --bg-hover: #e1e1e5; + --bg-muted: #e1e1e5; + --bg-content: #ededf2; + + --card: #fbfbff; + --card-foreground: #1a1a1f; --card-highlight: rgba(0, 0, 0, 0.03); - --popover: #ffffff; - --popover-foreground: #18181b; - - --panel: #fafafa; - 
--panel-strong: #f5f5f5; - --panel-hover: #ebebeb; - --chrome: rgba(250, 250, 250, 0.95); - --chrome-strong: rgba(250, 250, 250, 0.98); - - --text: #3f3f46; - --text-strong: #18181b; - --chat-text: #3f3f46; - --muted: #71717a; - --muted-strong: #52525b; - --muted-foreground: #71717a; - - --border: #e4e4e7; - --border-strong: #d4d4d8; - --border-hover: #a1a1aa; - --input: #e4e4e7; - - --accent: #dc2626; - --accent-hover: #ef4444; - --accent-muted: #dc2626; - --accent-subtle: rgba(220, 38, 38, 0.12); - --accent-foreground: #ffffff; - --accent-glow: rgba(220, 38, 38, 0.15); - --primary: #dc2626; - --primary-foreground: #ffffff; - - --secondary: #f4f4f5; - --secondary-foreground: #3f3f46; - --accent-2: #0d9488; - --accent-2-muted: rgba(13, 148, 136, 0.75); - --accent-2-subtle: rgba(13, 148, 136, 0.12); - - --ok: #16a34a; - --ok-muted: rgba(22, 163, 74, 0.75); - --ok-subtle: rgba(22, 163, 74, 0.1); - --destructive: #dc2626; - --destructive-foreground: #fafafa; - --warn: #d97706; - --warn-muted: rgba(217, 119, 6, 0.75); - --warn-subtle: rgba(217, 119, 6, 0.1); - --danger: #dc2626; - --danger-muted: rgba(220, 38, 38, 0.75); - --danger-subtle: rgba(220, 38, 38, 0.1); - --info: #2563eb; - - --focus: rgba(220, 38, 38, 0.2); + --popover: #fbfbff; + --popover-foreground: #1a1a1f; + + --panel: #fbfbff; + --panel-strong: #ededf2; + --panel-hover: #e1e1e5; + --chrome: rgba(251, 251, 255, 0.95); + --chrome-strong: rgba(251, 251, 255, 0.98); + + --text: #4e4e52; + --text-strong: #1a1a1f; + --chat-text: #4e4e52; + --muted: #949498; + --muted-strong: #4e4e52; + --muted-foreground: #949498; + + --border: #e1e1e5; + --border-strong: #bbbbbf; + --border-hover: #949498; + --input: #e1e1e5; + + --accent: #075433; + --accent-hover: #13ef93; + --accent-muted: #075433; + --accent-subtle: rgba(7, 84, 51, 0.12); + --accent-foreground: #fbfbff; + --accent-glow: rgba(7, 84, 51, 0.15); + --primary: #075433; + --primary-foreground: #fbfbff; + + --secondary: #ededf2; + --secondary-foreground: 
#4e4e52; + --accent-2: #149afb; + --accent-2-muted: rgba(20, 154, 251, 0.75); + --accent-2-subtle: rgba(20, 154, 251, 0.12); + + --ok: #12b76a; + --ok-muted: rgba(18, 183, 106, 0.75); + --ok-subtle: rgba(18, 183, 106, 0.1); + --destructive: #f04438; + --destructive-foreground: #fbfbff; + --warn: #fec84b; + --warn-muted: rgba(254, 200, 75, 0.75); + --warn-subtle: rgba(254, 200, 75, 0.1); + --danger: #f04438; + --danger-muted: rgba(240, 68, 56, 0.75); + --danger-subtle: rgba(240, 68, 56, 0.1); + --info: #149afb; + + --focus: rgba(7, 84, 51, 0.2); --focus-glow: 0 0 0 2px var(--bg), 0 0 0 4px var(--ring), 0 0 16px var(--accent-glow); --grid-line: rgba(0, 0, 0, 0.05); @@ -354,7 +443,7 @@ select { @keyframes glow-pulse { 0%, 100% { - box-shadow: 0 0 0 rgba(255, 92, 92, 0); + box-shadow: 0 0 0 rgba(19, 239, 147, 0); } 50% { box-shadow: 0 0 20px var(--accent-glow); @@ -386,3 +475,10 @@ select { outline: none; box-shadow: var(--focus-ring); } + +/* Safe-area insets for notched devices */ +@supports (padding-top: env(safe-area-inset-top)) { + .topbar { + padding-top: max(0px, calc(env(safe-area-inset-top) - 8px)); + } +} diff --git a/ui/src/styles/chat.css b/ui/src/styles/chat.css index 07d3b644a..27e4fcf5e 100644 --- a/ui/src/styles/chat.css +++ b/ui/src/styles/chat.css @@ -3,3 +3,4 @@ @import "./chat/grouped.css"; @import "./chat/tool-cards.css"; @import "./chat/sidebar.css"; +@import "./chat/dictation.css"; diff --git a/ui/src/styles/chat/dictation.css b/ui/src/styles/chat/dictation.css new file mode 100644 index 000000000..a97720ab0 --- /dev/null +++ b/ui/src/styles/chat/dictation.css @@ -0,0 +1,154 @@ +/* ============================================= + DICTATION - Voice input button and permission modal + ============================================= */ + +/* Dictation button states */ +.chat-dictation-btn { + position: relative; + display: flex; + align-items: center; + justify-content: center; + padding: 0.5rem; + border-radius: var(--radius-md); + transition: + 
background-color 0.15s, + color 0.15s, + transform 0.15s; +} + +/* Styled tooltip for keyboard shortcut */ +.chat-dictation-btn::after { + content: attr(data-tooltip); + position: absolute; + bottom: calc(100% + 8px); + left: 50%; + transform: translateX(-50%); + padding: 6px 10px; + background: var(--bg-elevated); + color: var(--fg); + font-size: 12px; + white-space: nowrap; + border-radius: var(--radius-sm); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.2); + opacity: 0; + visibility: hidden; + transition: + opacity 0.15s, + visibility 0.15s; + pointer-events: none; + z-index: 100; +} + +.chat-dictation-btn:hover::after { + opacity: 1; + visibility: visible; +} + +/* Hide tooltip when recording (sound waves would overlap) */ +.chat-dictation-btn--recording::after { + display: none; +} + +.chat-dictation-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.chat-dictation-btn--recording { + background-color: var(--danger); + color: white; + transform: scale(1.1); + box-shadow: 0 0 0 3px var(--danger-muted); +} + +.chat-dictation-btn--recording:hover { + background-color: var(--danger-muted); +} + +/* Sound wave rings around mic button */ +.chat-dictation-btn--recording::before, +.chat-dictation-btn--recording::after { + content: ""; + position: absolute; + inset: -4px; + border-radius: 50%; + border: 2px solid var(--danger); + animation: sound-wave 1.5s ease-out infinite; + pointer-events: none; +} + +.chat-dictation-btn--recording::after { + inset: -10px; + animation-delay: 0.5s; +} + +@keyframes sound-wave { + 0% { + opacity: 0.6; + transform: scale(0.9); + } + 100% { + opacity: 0; + transform: scale(1.3); + } +} + +/* Input field recording state */ +.chat-compose__field--recording textarea { + border-color: var(--danger) !important; + box-shadow: 0 0 0 2px var(--danger-muted); + animation: input-pulse-recording 1.5s ease-in-out infinite; +} + +.chat-compose__field--recording textarea::placeholder { + color: var(--danger); + opacity: 0.8; +} + +@keyframes 
input-pulse-recording { + 0%, + 100% { + box-shadow: 0 0 0 2px var(--danger-muted); + } + 50% { + box-shadow: 0 0 0 3px var(--danger); + } +} + +/* Permission modal */ +.dictation-permission-modal { + max-width: 28rem; +} + +.dictation-permission-modal__content { + display: flex; + flex-direction: column; + gap: 1rem; +} + +.dictation-permission-modal__browser-instructions { + background: var(--bg-muted); + border-radius: var(--radius-md); + padding: 1rem; + font-size: 0.875rem; +} + +.dictation-permission-modal__browser-instructions h4 { + margin: 0 0 0.5rem; + font-weight: 600; +} + +.dictation-permission-modal__browser-instructions ol { + margin: 0; + padding-left: 1.25rem; +} + +.dictation-permission-modal__browser-instructions li { + margin-bottom: 0.25rem; +} + +.dictation-permission-modal__actions { + display: flex; + gap: 0.5rem; + justify-content: flex-end; +} diff --git a/ui/src/styles/components.css b/ui/src/styles/components.css index 0b1d56ef7..13fbfd5b3 100644 --- a/ui/src/styles/components.css +++ b/ui/src/styles/components.css @@ -291,6 +291,7 @@ background: var(--danger); box-shadow: 0 0 8px rgba(239, 68, 68, 0.5); animation: pulse-subtle 2s ease-in-out infinite; + flex-shrink: 0; } .statusDot.ok { @@ -299,6 +300,154 @@ animation: none; } +.statusDot.warn { + background: var(--warn); + box-shadow: 0 0 8px rgba(245, 158, 11, 0.4); + animation: pulse-subtle 2s ease-in-out infinite; +} + +.statusDot.off { + background: var(--muted); + box-shadow: none; + animation: none; + opacity: 0.5; +} + +/* =========================================== + Channel Accordion List + =========================================== */ + +.channel-list { + display: grid; + grid-template-columns: 1fr; + gap: 6px; +} + +.channel-row { + border: 1px solid var(--border); + background: var(--card); + border-radius: var(--radius-md); + box-shadow: var(--shadow-sm); + transition: + border-color var(--duration-fast) var(--ease-out), + box-shadow var(--duration-fast) var(--ease-out); 
+} + +.channel-row:hover { + border-color: var(--border-strong); +} + +.channel-row[open] { + box-shadow: var(--shadow-md); +} + +.channel-row--error { + border-left: 3px solid var(--danger); +} + +.channel-summary { + display: flex; + align-items: center; + gap: 12px; + padding: 12px 16px; + cursor: pointer; + list-style: none; + user-select: none; +} + +.channel-summary::-webkit-details-marker { + display: none; +} + +.channel-summary::marker { + content: ""; +} + +.channel-summary__dot { + flex-shrink: 0; +} + +.channel-summary__name { + font-size: 14px; + font-weight: 600; + letter-spacing: -0.01em; + color: var(--text-strong); + white-space: nowrap; + min-width: 100px; +} + +.channel-summary__chips { + display: flex; + align-items: center; + gap: 6px; + flex: 1; + min-width: 0; +} + +.channel-summary__chips .chip { + font-size: 11px; + padding: 2px 8px; + pointer-events: none; +} + +.channel-summary__chevron { + flex-shrink: 0; + font-size: 12px; + color: var(--muted); + transition: transform var(--duration-fast) ease; +} + +.channel-row[open] .channel-summary__chevron { + transform: rotate(90deg); +} + +.channel-detail { + border-top: 1px solid var(--border); + padding: 16px; +} + +.channel-debug-toggle { + margin-top: 18px; + border: 1px solid var(--border); + border-radius: var(--radius-md); +} + +.channel-debug-toggle > summary { + padding: 10px 14px; + cursor: pointer; + font-size: 13px; + font-weight: 500; + color: var(--muted); + list-style: none; + display: flex; + align-items: center; + gap: 8px; +} + +.channel-debug-toggle > summary::-webkit-details-marker { + display: none; +} + +.channel-debug-toggle > summary::marker { + content: ""; +} + +.channel-debug-toggle > summary::after { + content: "▸"; + font-size: 11px; + transition: transform var(--duration-fast) ease; +} + +.channel-debug-toggle[open] > summary::after { + transform: rotate(90deg); +} + +.channel-debug-toggle .code-block { + margin: 0; + border-top: 1px solid var(--border); + 
border-radius: 0 0 var(--radius-md) var(--radius-md); +} + /* =========================================== Buttons - Tactile with personality =========================================== */ @@ -1800,6 +1949,11 @@ border-color: var(--warn); } +.agent-pill.voice { + color: #a78bfa; + border-color: #a78bfa; +} + .agent-header { display: grid; grid-template-columns: minmax(0, 1fr) auto; diff --git a/ui/src/styles/layout.mobile-tab-bar.css b/ui/src/styles/layout.mobile-tab-bar.css new file mode 100644 index 000000000..04206d600 --- /dev/null +++ b/ui/src/styles/layout.mobile-tab-bar.css @@ -0,0 +1,196 @@ +/* =========================================== + Mobile Bottom Tab Bar + =========================================== */ + +/* Hidden by default (desktop) */ +.mobile-tab-bar { + display: none; +} +.mobile-more-backdrop { + display: none; +} +.mobile-more-sheet { + display: none; +} + +@media (max-width: 768px) { + /* Hide desktop nav */ + .nav { + display: none !important; + } + + /* Show mobile tab bar */ + .mobile-tab-bar { + display: flex; + position: fixed; + bottom: 0; + left: 0; + right: 0; + z-index: 50; + background: var(--bg-card, #141420); + border-top: 1px solid var(--border, #333); + padding: 4px 0; + padding-bottom: max(4px, env(safe-area-inset-bottom)); + justify-content: space-around; + align-items: stretch; + } + + .mobile-tab-bar__tab { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 2px; + flex: 1; + min-height: 48px; + padding: 6px 4px; + background: none; + border: none; + color: var(--text-dim, #888); + cursor: pointer; + position: relative; + -webkit-tap-highlight-color: transparent; + transition: color 150ms ease; + } + + .mobile-tab-bar__tab--active { + color: var(--accent, #13ef93); + } + + .mobile-tab-bar__icon { + width: 22px; + height: 22px; + } + + .mobile-tab-bar__icon svg { + width: 100%; + height: 100%; + fill: none; + stroke: currentColor; + stroke-width: 2; + stroke-linecap: round; + 
stroke-linejoin: round; + } + + .mobile-tab-bar__label { + font-size: 10px; + font-weight: 500; + line-height: 1; + } + + .mobile-tab-bar__dot { + position: absolute; + top: 6px; + right: calc(50% - 14px); + width: 6px; + height: 6px; + border-radius: 50%; + background: var(--accent, #13ef93); + } + + /* Content needs bottom padding to avoid tab bar occlusion */ + .content { + padding-bottom: 72px !important; + } + + /* More sheet backdrop */ + .mobile-more-backdrop { + display: block; + position: fixed; + inset: 0; + z-index: 49; + background: rgba(0, 0, 0, 0.5); + backdrop-filter: blur(2px); + -webkit-backdrop-filter: blur(2px); + } + + /* More sheet */ + .mobile-more-sheet { + display: flex; + flex-direction: column; + position: fixed; + bottom: 0; + left: 0; + right: 0; + z-index: 51; + background: var(--bg-card, #141420); + border-top: 1px solid var(--border, #333); + border-radius: 16px 16px 0 0; + padding: 16px 16px; + padding-bottom: max(16px, calc(env(safe-area-inset-bottom) + 64px)); + max-height: 60vh; + overflow-y: auto; + gap: 12px; + animation: mobile-sheet-up 200ms ease-out; + } + + @keyframes mobile-sheet-up { + from { + transform: translateY(100%); + } + to { + transform: translateY(0); + } + } + + .mobile-more-sheet__group { + display: flex; + flex-direction: column; + gap: 4px; + } + + .mobile-more-sheet__group-label { + font-size: 11px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.05em; + color: var(--text-dim, #888); + padding: 4px 8px; + } + + .mobile-more-sheet__group-items { + display: flex; + flex-direction: column; + gap: 2px; + } + + .mobile-more-sheet__item { + display: flex; + align-items: center; + gap: 10px; + padding: 10px 12px; + background: none; + border: none; + border-radius: var(--radius-md, 8px); + color: var(--text, #eee); + cursor: pointer; + font-size: 14px; + -webkit-tap-highlight-color: transparent; + transition: background 150ms ease; + } + + .mobile-more-sheet__item:active { + background: 
var(--bg-hover, #1a1a2e); + } + + .mobile-more-sheet__item--active { + color: var(--accent, #13ef93); + background: var(--bg-hover, #1a1a2e); + } + + .mobile-more-sheet__item .mobile-tab-bar__icon { + width: 20px; + height: 20px; + } + + /* Hide tab bar in focus modes */ + .shell--chat-focus .mobile-tab-bar, + .shell--onboarding .mobile-tab-bar { + display: none; + } + + .shell--chat-focus .content, + .shell--onboarding .content { + padding-bottom: 16px !important; + } +} diff --git a/ui/src/styles/layout.mobile.css b/ui/src/styles/layout.mobile.css index 450a83608..48c8a0ffc 100644 --- a/ui/src/styles/layout.mobile.css +++ b/ui/src/styles/layout.mobile.css @@ -299,6 +299,16 @@ width: 12px; height: 12px; } + + /* Touch-friendly button targets */ + .btn { + min-height: 44px; + min-width: 44px; + } + + .chat-compose__actions .btn { + min-height: 44px; + } } /* Small mobile */ @@ -372,3 +382,25 @@ height: 11px; } } + +/* Touch devices: always show hover-revealed controls */ +@media (hover: none) { + .chat-msg__actions { + opacity: 1; + } +} + +/* Keyboard-aware layout */ +@media (max-width: 768px) { + .keyboard-open .mobile-tab-bar { + display: none !important; + } + + .keyboard-open .content { + padding-bottom: 16px !important; + } + + .keyboard-open .chat-compose { + padding-bottom: env(safe-area-inset-bottom, 4px); + } +} diff --git a/ui/src/ui/app-defaults.ts b/ui/src/ui/app-defaults.ts index 89bdaf11d..39458f7f1 100644 --- a/ui/src/ui/app-defaults.ts +++ b/ui/src/ui/app-defaults.ts @@ -1,5 +1,5 @@ import type { LogLevel } from "./types.ts"; -import type { CronFormState } from "./ui-types.ts"; +import type { AddAgentFormState, CronFormState } from "./ui-types.ts"; export const DEFAULT_LOG_LEVEL_FILTERS: Record = { trace: true, @@ -30,3 +30,12 @@ export const DEFAULT_CRON_FORM: CronFormState = { deliveryTo: "", timeoutSeconds: "", }; + +export const DEFAULT_ADD_AGENT_FORM: AddAgentFormState = { + name: "", + emoji: "", + workspace: "", + agentType: "text", + voice: 
"aura-2-thalia-en", + greeting: "", +}; diff --git a/ui/src/ui/app-gateway.ts b/ui/src/ui/app-gateway.ts index 4cfa01134..bdd541a49 100644 --- a/ui/src/ui/app-gateway.ts +++ b/ui/src/ui/app-gateway.ts @@ -26,6 +26,7 @@ import { } from "./controllers/exec-approval.ts"; import { loadNodes } from "./controllers/nodes.ts"; import { loadSessions } from "./controllers/sessions.ts"; +import { isDictationSupported } from "./dictation.ts"; import { GatewayBrowserClient } from "./gateway.ts"; type GatewayHost = { @@ -54,6 +55,7 @@ type GatewayHost = { refreshSessionsAfterChat: Set; execApprovalQueue: ExecApprovalRequest[]; execApprovalError: string | null; + dictationEnabled: boolean; }; type SessionDefaultsSnapshot = { @@ -140,6 +142,22 @@ export function connectGateway(host: GatewayHost) { (host as unknown as { chatStream: string | null }).chatStream = null; (host as unknown as { chatStreamStartedAt: number | null }).chatStreamStartedAt = null; resetToolStream(host as unknown as Parameters[0]); + // Stop any active dictation session on reconnect + const dictationHost = host as unknown as { + dictationClient: { stop: () => void } | null; + dictationState: string; + }; + if (dictationHost.dictationClient && dictationHost.dictationState === "recording") { + dictationHost.dictationClient.stop(); + dictationHost.dictationState = "idle"; + } + // Check if dictation feature is available and supported + const features = hello.features as { dictation?: boolean } | undefined; + if (features?.dictation) { + host.dictationEnabled = isDictationSupported(); + } else { + host.dictationEnabled = false; + } void loadAssistantIdentity(host as unknown as OpenClawApp); void loadAgents(host as unknown as OpenClawApp); void loadNodes(host as unknown as OpenClawApp, { quiet: true }); diff --git a/ui/src/ui/app-lifecycle.ts b/ui/src/ui/app-lifecycle.ts index 9a9826103..5570127a3 100644 --- a/ui/src/ui/app-lifecycle.ts +++ b/ui/src/ui/app-lifecycle.ts @@ -17,6 +17,7 @@ import { 
syncTabWithLocation, syncThemeWithSettings, } from "./app-settings.ts"; +import { observeVirtualKeyboard } from "./mobile-keyboard.ts"; type LifecycleHost = { basePath: string; @@ -32,6 +33,7 @@ type LifecycleHost = { logsEntries: unknown[]; popStateHandler: () => void; topbarObserver: ResizeObserver | null; + keyboardCleanup: (() => void) | null; }; export function handleConnected(host: LifecycleHost) { @@ -49,6 +51,9 @@ export function handleConnected(host: LifecycleHost) { if (host.tab === "debug") { startDebugPolling(host as unknown as Parameters[0]); } + host.keyboardCleanup = observeVirtualKeyboard(({ isOpen }) => { + document.documentElement.classList.toggle("keyboard-open", isOpen); + }); } export function handleFirstUpdated(host: LifecycleHost) { @@ -56,6 +61,8 @@ export function handleFirstUpdated(host: LifecycleHost) { } export function handleDisconnected(host: LifecycleHost) { + host.keyboardCleanup?.(); + host.keyboardCleanup = null; window.removeEventListener("popstate", host.popStateHandler); stopNodesPolling(host as unknown as Parameters[0]); stopLogsPolling(host as unknown as Parameters[0]); diff --git a/ui/src/ui/app-render.ts b/ui/src/ui/app-render.ts index 5431627e0..70ccb7df0 100644 --- a/ui/src/ui/app-render.ts +++ b/ui/src/ui/app-render.ts @@ -1,9 +1,12 @@ import { html, nothing } from "lit"; import type { AppViewState } from "./app-view-state.ts"; +import type { OpenClawApp } from "./app.ts"; import type { UsageState } from "./controllers/usage.ts"; import { parseAgentSessionKey } from "../../../src/routing/session-key.js"; import { refreshChatAvatar } from "./app-chat.ts"; +import { DEFAULT_ADD_AGENT_FORM } from "./app-defaults.ts"; import { renderChatControls, renderTab, renderThemeToggle } from "./app-render.helpers.ts"; +import { submitAddAgent } from "./controllers/add-agent.ts"; import { loadAgentFileContent, loadAgentFiles, saveAgentFile } from "./controllers/agent-files.ts"; import { loadAgentIdentities, loadAgentIdentity } from 
"./controllers/agent-identity.ts"; import { loadAgentSkills } from "./controllers/agent-skills.ts"; @@ -40,8 +43,10 @@ import { updateExecApprovalsFormValue, } from "./controllers/exec-approvals.ts"; import { loadLogs } from "./controllers/logs.ts"; +import { loadModelCatalog } from "./controllers/model-catalog.ts"; import { loadNodes } from "./controllers/nodes.ts"; import { loadPresence } from "./controllers/presence.ts"; +import { saveAgentIdentity } from "./controllers/save-identity.ts"; import { deleteSession, loadSessions, patchSession } from "./controllers/sessions.ts"; import { installSkill, @@ -62,6 +67,8 @@ const debouncedLoadUsage = (state: UsageState) => { } usageDateDebounceTimeout = window.setTimeout(() => void loadUsage(state), 400); }; +import { renderMobileTabBar } from "./mobile-tab-bar.ts"; +import { renderAddAgentModal } from "./views/add-agent-modal.ts"; import { renderAgents } from "./views/agents.ts"; import { renderChannels } from "./views/channels.ts"; import { renderChat } from "./views/chat.ts"; @@ -134,10 +141,10 @@ export function renderApp(state: AppViewState) {
-
OPENCLAW
+
DEEPCLAW
Gateway Dashboard
@@ -608,7 +615,10 @@ export function renderApp(state: AppViewState) { ${ state.tab === "agents" - ? renderAgents({ + ? (state.agentModelCatalog.length === 0 && + !state.agentModelCatalogLoading && + void loadModelCatalog(state), + renderAgents({ loading: state.agentsLoading, error: state.agentsError, agentsList: state.agentsList, @@ -636,13 +646,41 @@ export function renderApp(state: AppViewState) { agentIdentityLoading: state.agentIdentityLoading, agentIdentityError: state.agentIdentityError, agentIdentityById: state.agentIdentityById, + identityDraftName: state.identityDraftName, + identityDraftEmoji: state.identityDraftEmoji, + identitySaving: state.identitySaving, + onIdentityNameChange: (value) => { + state.identityDraftName = value; + }, + onIdentityEmojiChange: (value) => { + state.identityDraftEmoji = value; + }, + onIdentitySave: (agentId) => { + void saveAgentIdentity( + state as unknown as Parameters[0], + agentId, + ); + }, + onIdentityReset: () => { + state.identityDraftName = null; + state.identityDraftEmoji = null; + }, agentSkillsLoading: state.agentSkillsLoading, agentSkillsReport: state.agentSkillsReport, agentSkillsError: state.agentSkillsError, agentSkillsAgentId: state.agentSkillsAgentId, skillsFilter: state.skillsFilter, + modelCatalog: state.agentModelCatalog, + modelCatalogLoading: state.agentModelCatalogLoading, + onModelCatalogRefresh: () => loadModelCatalog(state, { refresh: true }), + onAddAgentOpen: () => { + state.addAgentForm = { ...DEFAULT_ADD_AGENT_FORM }; + state.addAgentError = null; + state.addAgentModalOpen = true; + }, onRefresh: async () => { await loadAgents(state); + void loadModelCatalog(state, { refresh: true }); const agentIds = state.agentsList?.agents?.map((entry) => entry.id) ?? 
[]; if (agentIds.length > 0) { void loadAgentIdentities(state, agentIds); @@ -653,6 +691,8 @@ export function renderApp(state: AppViewState) { return; } state.agentsSelectedId = agentId; + state.identityDraftName = null; + state.identityDraftEmoji = null; state.agentFilesList = null; state.agentFilesError = null; state.agentFilesLoading = false; @@ -948,7 +988,32 @@ export function renderApp(state: AppViewState) { : { fallbacks: normalized }; updateConfigFormValue(state, basePath, next); }, - }) + onAgentTypeChange: (agentId, agentType) => { + if (!configValue) { + return; + } + const list = (configValue as { agents?: { list?: unknown[] } }).agents?.list; + if (!Array.isArray(list)) { + return; + } + const index = list.findIndex( + (entry) => + entry && + typeof entry === "object" && + "id" in entry && + (entry as { id?: string }).id === agentId, + ); + if (index < 0) { + return; + } + const basePath = ["agents", "list", index, "agentType"]; + if (agentType === "voice") { + updateConfigFormValue(state, basePath, "voice"); + } else { + removeConfigFormValue(state, basePath); + } + }, + })) : nothing } @@ -1127,6 +1192,16 @@ export function renderApp(state: AppViewState) { onSplitRatioChange: (ratio: number) => state.handleSplitRatioChange(ratio), assistantName: state.assistantName, assistantAvatar: state.assistantAvatar, + // Dictation props + dictationEnabled: (state as unknown as OpenClawApp).dictationEnabled, + dictationState: (state as unknown as OpenClawApp).dictationState, + showMicPermissionModal: (state as unknown as OpenClawApp).showMicPermissionModal, + pendingDictationText: (state as unknown as OpenClawApp).pendingDictationText, + onDictationToggle: () => (state as unknown as OpenClawApp).handleDictationToggle(), + onMicPermissionModalClose: () => + (state as unknown as OpenClawApp).handleMicPermissionModalClose(), + onMicPermissionRetry: () => + (state as unknown as OpenClawApp).handleMicPermissionRetry(), }) : nothing } @@ -1210,13 +1285,37 @@ export 
function renderApp(state: AppViewState) { onToggleAutoFollow: (next) => (state.logsAutoFollow = next), onRefresh: () => loadLogs(state, { reset: true }), onExport: (lines, label) => state.exportLogs(lines, label), + onCopy: (lines) => void navigator.clipboard.writeText(lines.join("\n")), onScroll: (event) => state.handleLogsScroll(event), }) : nothing } + ${renderMobileTabBar(state, state.mobileMoreOpen, () => state.handleToggleMobileMore())} ${renderExecApprovalPrompt(state)} ${renderGatewayUrlConfirmation(state)} + ${renderAddAgentModal({ + open: state.addAgentModalOpen, + form: state.addAgentForm, + busy: state.addAgentBusy, + error: state.addAgentError, + onFormChange: (patch) => { + state.addAgentForm = { ...state.addAgentForm, ...patch }; + }, + onSubmit: async () => { + const agentId = await submitAddAgent(state); + if (agentId) { + await loadAgents(state); + const agentIds = state.agentsList?.agents?.map((entry) => entry.id) ?? []; + if (agentIds.length > 0) { + void loadAgentIdentities(state, agentIds); + } + } + }, + onClose: () => { + state.addAgentModalOpen = false; + }, + })} `; } diff --git a/ui/src/ui/app-settings.ts b/ui/src/ui/app-settings.ts index e0860e4e5..40c7c2b6f 100644 --- a/ui/src/ui/app-settings.ts +++ b/ui/src/ui/app-settings.ts @@ -18,6 +18,7 @@ import { loadDebug } from "./controllers/debug.ts"; import { loadDevices } from "./controllers/devices.ts"; import { loadExecApprovals } from "./controllers/exec-approvals.ts"; import { loadLogs } from "./controllers/logs.ts"; +import { loadModelCatalog } from "./controllers/model-catalog.ts"; import { loadNodes } from "./controllers/nodes.ts"; import { loadPresence } from "./controllers/presence.ts"; import { loadSessions } from "./controllers/sessions.ts"; @@ -203,6 +204,7 @@ export async function refreshActiveTab(host: SettingsHost) { if (host.tab === "agents") { await loadAgents(host as unknown as OpenClawApp); await loadConfig(host as unknown as OpenClawApp); + void loadModelCatalog(host as 
unknown as OpenClawApp); const agentIds = host.agentsList?.agents?.map((entry) => entry.id) ?? []; if (agentIds.length > 0) { void loadAgentIdentities(host as unknown as OpenClawApp, agentIds); diff --git a/ui/src/ui/app-view-state.ts b/ui/src/ui/app-view-state.ts index f01b4f915..02d00d121 100644 --- a/ui/src/ui/app-view-state.ts +++ b/ui/src/ui/app-view-state.ts @@ -22,6 +22,7 @@ import type { HealthSnapshot, LogEntry, LogLevel, + ModelCatalogEntry, NostrProfile, PresenceEntry, SessionsUsageResult, @@ -31,7 +32,12 @@ import type { SkillStatusReport, StatusSummary, } from "./types.ts"; -import type { ChatAttachment, ChatQueueItem, CronFormState } from "./ui-types.ts"; +import type { + AddAgentFormState, + ChatAttachment, + ChatQueueItem, + CronFormState, +} from "./ui-types.ts"; import type { NostrProfileFormState } from "./views/channels.nostr-profile-form.ts"; import type { SessionLogEntry } from "./views/usage.ts"; @@ -72,6 +78,8 @@ export type AppViewState = { sidebarContent: string | null; sidebarError: string | null; splitRatio: number; + mobileMoreOpen: boolean; + handleToggleMobileMore: () => void; scrollToBottom: (opts?: { smooth?: boolean }) => void; devicesLoading: boolean; devicesError: string | null; @@ -142,6 +150,15 @@ export type AppViewState = { agentSkillsError: string | null; agentSkillsReport: SkillStatusReport | null; agentSkillsAgentId: string | null; + agentModelCatalog: ModelCatalogEntry[]; + agentModelCatalogLoading: boolean; + identityDraftName: string | null; + identityDraftEmoji: string | null; + identitySaving: boolean; + addAgentModalOpen: boolean; + addAgentForm: AddAgentFormState; + addAgentBusy: boolean; + addAgentError: string | null; sessionsLoading: boolean; sessionsResult: SessionsListResult | null; sessionsError: string | null; @@ -264,6 +281,10 @@ export type AppViewState = { handleLoadNodes: () => Promise; handleLoadPresence: () => Promise; handleLoadSkills: () => Promise; + handleAddAgentOpen: () => void; + 
handleAddAgentClose: () => void; + handleAddAgentFormChange: (patch: Partial) => void; + handleAddAgentSubmit: () => Promise; handleLoadDebug: () => Promise; handleLoadLogs: () => Promise; handleDebugCall: () => Promise; diff --git a/ui/src/ui/app.ts b/ui/src/ui/app.ts index bb2c9f154..e58a31365 100644 --- a/ui/src/ui/app.ts +++ b/ui/src/ui/app.ts @@ -21,6 +21,7 @@ import type { HealthSnapshot, LogEntry, LogLevel, + ModelCatalogEntry, PresenceEntry, ChannelsStatusSnapshot, SessionsListResult, @@ -47,7 +48,11 @@ import { handleSendChat as handleSendChatInternal, removeQueuedMessage as removeQueuedMessageInternal, } from "./app-chat.ts"; -import { DEFAULT_CRON_FORM, DEFAULT_LOG_LEVEL_FILTERS } from "./app-defaults.ts"; +import { + DEFAULT_ADD_AGENT_FORM, + DEFAULT_CRON_FORM, + DEFAULT_LOG_LEVEL_FILTERS, +} from "./app-defaults.ts"; import { connectGateway as connectGatewayInternal } from "./app-gateway.ts"; import { handleConnected, @@ -78,8 +83,14 @@ import { } from "./app-tool-stream.ts"; import { resolveInjectedAssistantIdentity } from "./assistant-identity.ts"; import { loadAssistantIdentity as loadAssistantIdentityInternal } from "./controllers/assistant-identity.ts"; +import { DictationClient, type DictationState } from "./dictation.ts"; import { loadSettings, type UiSettings } from "./storage.ts"; -import { type ChatAttachment, type ChatQueueItem, type CronFormState } from "./ui-types.ts"; +import { + type AddAgentFormState, + type ChatAttachment, + type ChatQueueItem, + type CronFormState, +} from "./ui-types.ts"; declare global { interface Window { @@ -142,6 +153,14 @@ export class OpenClawApp extends LitElement { @state() sidebarContent: string | null = null; @state() sidebarError: string | null = null; @state() splitRatio = this.settings.splitRatio; + @state() mobileMoreOpen = false; + + // Dictation state + private dictationClient: DictationClient | null = null; + @state() dictationState: DictationState = "idle"; + @state() dictationEnabled = false; + 
@state() showMicPermissionModal = false; + @state() pendingDictationText = ""; @state() nodesLoading = false; @state() nodes: Array> = []; @@ -220,6 +239,18 @@ export class OpenClawApp extends LitElement { @state() agentSkillsReport: SkillStatusReport | null = null; @state() agentSkillsAgentId: string | null = null; + @state() agentModelCatalog: ModelCatalogEntry[] = []; + @state() agentModelCatalogLoading = false; + + @state() identityDraftName: string | null = null; + @state() identityDraftEmoji: string | null = null; + @state() identitySaving = false; + + @state() addAgentModalOpen = false; + @state() addAgentForm: AddAgentFormState = { ...DEFAULT_ADD_AGENT_FORM }; + @state() addAgentBusy = false; + @state() addAgentError: string | null = null; + @state() sessionsLoading = false; @state() sessionsResult: SessionsListResult | null = null; @state() sessionsError: string | null = null; @@ -325,6 +356,7 @@ export class OpenClawApp extends LitElement { @state() logsAtBottom = true; client: GatewayBrowserClient | null = null; + private keyboardCleanup: (() => void) | null = null; private chatScrollFrame: number | null = null; private chatScrollTimeout: number | null = null; private chatHasAutoScrolled = false; @@ -351,6 +383,7 @@ export class OpenClawApp extends LitElement { connectedCallback() { super.connectedCallback(); handleConnected(this as unknown as Parameters[0]); + document.addEventListener("keydown", this.handleGlobalKeydown); } protected firstUpdated() { @@ -359,6 +392,7 @@ export class OpenClawApp extends LitElement { disconnectedCallback() { handleDisconnected(this as unknown as Parameters[0]); + document.removeEventListener("keydown", this.handleGlobalKeydown); super.disconnectedCallback(); } @@ -415,6 +449,11 @@ export class OpenClawApp extends LitElement { setTab(next: Tab) { setTabInternal(this as unknown as Parameters[0], next); + this.mobileMoreOpen = false; + } + + handleToggleMobileMore() { + this.mobileMoreOpen = !this.mobileMoreOpen; } 
setTheme(next: ThemeMode, context?: Parameters[2]) { @@ -495,6 +534,24 @@ export class OpenClawApp extends LitElement { handleNostrProfileToggleAdvancedInternal(this); } + handleAddAgentOpen() { + this.addAgentForm = { ...DEFAULT_ADD_AGENT_FORM }; + this.addAgentError = null; + this.addAgentModalOpen = true; + } + + handleAddAgentClose() { + this.addAgentModalOpen = false; + } + + handleAddAgentFormChange(patch: Partial) { + this.addAgentForm = { ...this.addAgentForm, ...patch }; + } + + async handleAddAgentSubmit() { + // Wired in app-render.ts + } + async handleExecApprovalDecision(decision: "allow-once" | "allow-always" | "deny") { const active = this.execApprovalQueue[0]; if (!active || !this.client || this.execApprovalBusy) { @@ -565,6 +622,71 @@ export class OpenClawApp extends LitElement { this.applySettings({ ...this.settings, splitRatio: newRatio }); } + // Dictation handlers + handleDictationToggle = () => { + if (!this.dictationEnabled) { + return; + } + + if (this.dictationState === "recording") { + this.dictationClient?.stop(); + } else if (this.dictationState === "idle" || this.dictationState === "error") { + this.startDictation(); + } + }; + + private startDictation = () => { + if (!this.dictationClient) { + this.dictationClient = new DictationClient({ + gatewayUrl: this.settings.gatewayUrl, + callbacks: { + onStateChange: (state) => { + this.dictationState = state; + this.requestUpdate(); + }, + onTranscript: ({ text, isFinal }) => { + if (isFinal) { + // Append final text to message + const existing = this.chatMessage.trimEnd(); + const spacer = existing && !existing.endsWith(" ") ? 
" " : ""; + this.chatMessage = existing + spacer + text; + this.pendingDictationText = ""; + } else { + // Show interim text + this.pendingDictationText = text; + } + this.requestUpdate(); + }, + onError: (error) => { + if (error === "permission_denied") { + this.showMicPermissionModal = true; + } + this.requestUpdate(); + }, + }, + }); + } + void this.dictationClient.start(); + }; + + handleMicPermissionModalClose = () => { + this.showMicPermissionModal = false; + this.requestUpdate(); + }; + + handleMicPermissionRetry = () => { + this.showMicPermissionModal = false; + this.startDictation(); + }; + + private handleGlobalKeydown = (e: KeyboardEvent) => { + // Cmd/Ctrl + Shift + D for dictation + if ((e.metaKey || e.ctrlKey) && e.shiftKey && e.key.toLowerCase() === "d") { + e.preventDefault(); + this.handleDictationToggle(); + } + }; + render() { return renderApp(this as unknown as AppViewState); } diff --git a/ui/src/ui/audio-worklet-processor.ts b/ui/src/ui/audio-worklet-processor.ts new file mode 100644 index 000000000..a1e9f2cd7 --- /dev/null +++ b/ui/src/ui/audio-worklet-processor.ts @@ -0,0 +1,80 @@ +/** + * AudioWorklet processor for capturing PCM audio data. + * + * This file runs in the AudioWorklet context, which is a separate thread + * from the main browser thread. It has no access to DOM or most browser APIs. + * Communication with the main thread happens via message passing through `this.port`. + * + * The processor captures audio in 80ms chunks (1280 samples at 16kHz), + * converts Float32 samples to Int16 PCM format (which Deepgram expects), + * and posts the binary data to the main thread. 
+ * + * Usage: + * await audioContext.audioWorklet.addModule('/path/to/audio-worklet-processor.js'); + * const node = new AudioWorkletNode(audioContext, 'pcm-capture-processor'); + * node.port.onmessage = (e) => handlePcmBuffer(e.data); + */ + +// AudioWorklet global types - these are only available in the worklet context +declare class AudioWorkletProcessor { + readonly port: MessagePort; + process( + inputs: Float32Array[][], + outputs: Float32Array[][], + parameters: Record, + ): boolean; +} + +declare function registerProcessor( + name: string, + processorCtor: new () => AudioWorkletProcessor, +): void; + +const BUFFER_SIZE = 1280; // 80ms at 16kHz + +class PcmCaptureProcessor extends AudioWorkletProcessor { + private buffer: Float32Array; + private bufferIndex: number; + + constructor() { + super(); + this.buffer = new Float32Array(BUFFER_SIZE); + this.bufferIndex = 0; + } + + process( + inputs: Float32Array[][], + _outputs: Float32Array[][], + _parameters: Record, + ): boolean { + const input = inputs[0]?.[0]; + if (!input) { + return true; + } + + for (let i = 0; i < input.length; i++) { + this.buffer[this.bufferIndex++] = input[i]; + + if (this.bufferIndex >= BUFFER_SIZE) { + // Convert float32 to int16 PCM + const pcm = new Int16Array(BUFFER_SIZE); + for (let j = 0; j < BUFFER_SIZE; j++) { + const s = Math.max(-1, Math.min(1, this.buffer[j])); + pcm[j] = s < 0 ? 
s * 0x8000 : s * 0x7fff; + } + + // Transfer the buffer to the main thread + // Using transferable objects for efficiency (avoids copying) + this.port.postMessage(pcm.buffer, [pcm.buffer]); + + // Allocate a new buffer for the next chunk + this.buffer = new Float32Array(BUFFER_SIZE); + this.bufferIndex = 0; + } + } + + return true; + } +} + +registerProcessor("pcm-capture-processor", PcmCaptureProcessor); diff --git a/ui/src/ui/components/mic-permission-modal.ts b/ui/src/ui/components/mic-permission-modal.ts new file mode 100644 index 000000000..cf16a8d82 --- /dev/null +++ b/ui/src/ui/components/mic-permission-modal.ts @@ -0,0 +1,155 @@ +import { html, nothing, type TemplateResult } from "lit"; + +export type MicPermissionModalProps = { + open: boolean; + onClose: () => void; + onRetry: () => void; +}; + +type BrowserType = "chrome" | "safari" | "firefox" | "edge" | "other"; + +function detectBrowser(): BrowserType { + const ua = navigator.userAgent.toLowerCase(); + if (ua.includes("edg/")) { + return "edge"; + } + if (ua.includes("chrome")) { + return "chrome"; + } + if (ua.includes("safari") && !ua.includes("chrome")) { + return "safari"; + } + if (ua.includes("firefox")) { + return "firefox"; + } + return "other"; +} + +function getBrowserInstructions(browser: BrowserType): TemplateResult { + switch (browser) { + case "chrome": + return html` +
    +
  1. Click the lock icon (or tune icon) in the address bar
  2. +
  3. Find Microphone in the permissions list
  4. +
  5. Change it to Allow
  6. +
  7. Refresh the page if prompted
  8. +
+ `; + case "safari": + return html` +
    +
  1. Go to Safari menu → Settings
  2. +
  3. Click the Websites tab
  4. +
  5. Select Microphone from the left sidebar
  6. +
  7. Find this website and set it to Allow
  8. +
+ `; + case "firefox": + return html` +
    +
  1. Click the lock icon in the address bar
  2. +
  3. Click Connection secure
  4. +
  5. Click More Information
  6. +
  7. Go to Permissions tab and allow Microphone
  8. +
+ `; + case "edge": + return html` +
    +
  1. Click the lock icon in the address bar
  2. +
  3. Click Permissions for this site
  4. +
  5. Find Microphone and set it to Allow
  6. +
  7. Refresh the page if prompted
  8. +
+ `; + default: + return html` +
    +
  1. Open your browser settings
  2. +
  3. Navigate to site permissions or privacy settings
  4. +
  5. Find microphone permissions for this website
  6. +
  7. Enable microphone access and refresh the page
  8. +
+ `; + } +} + +function getBrowserName(browser: BrowserType): string { + switch (browser) { + case "chrome": + return "Chrome"; + case "safari": + return "Safari"; + case "firefox": + return "Firefox"; + case "edge": + return "Edge"; + default: + return "your browser"; + } +} + +export function renderMicPermissionModal(props: MicPermissionModalProps) { + if (!props.open) { + return nothing; + } + + const browser = detectBrowser(); + const browserName = getBrowserName(browser); + + return html` + + `; +} diff --git a/ui/src/ui/components/resizable-divider.ts b/ui/src/ui/components/resizable-divider.ts index defec19e5..2b53470c7 100644 --- a/ui/src/ui/components/resizable-divider.ts +++ b/ui/src/ui/components/resizable-divider.ts @@ -47,6 +47,7 @@ export class ResizableDivider extends LitElement { connectedCallback() { super.connectedCallback(); this.addEventListener("mousedown", this.handleMouseDown); + this.addEventListener("touchstart", this.handleTouchStart, { passive: false }); } disconnectedCallback() { @@ -54,6 +55,9 @@ export class ResizableDivider extends LitElement { this.removeEventListener("mousedown", this.handleMouseDown); document.removeEventListener("mousemove", this.handleMouseMove); document.removeEventListener("mouseup", this.handleMouseUp); + this.removeEventListener("touchstart", this.handleTouchStart); + document.removeEventListener("touchmove", this.handleTouchMove); + document.removeEventListener("touchend", this.handleTouchEnd); } private handleMouseDown = (e: MouseEvent) => { @@ -101,6 +105,57 @@ export class ResizableDivider extends LitElement { document.removeEventListener("mousemove", this.handleMouseMove); document.removeEventListener("mouseup", this.handleMouseUp); }; + + private handleTouchStart = (e: TouchEvent) => { + if (e.touches.length !== 1) { + return; + } + this.isDragging = true; + this.startX = e.touches[0].clientX; + this.startRatio = this.splitRatio; + this.classList.add("dragging"); + + 
document.addEventListener("touchmove", this.handleTouchMove, { passive: false }); + document.addEventListener("touchend", this.handleTouchEnd); + + e.preventDefault(); + }; + + private handleTouchMove = (e: TouchEvent) => { + if (!this.isDragging || e.touches.length !== 1) { + return; + } + + const container = this.parentElement; + if (!container) { + return; + } + + const containerWidth = container.getBoundingClientRect().width; + const deltaX = e.touches[0].clientX - this.startX; + const deltaRatio = deltaX / containerWidth; + + let newRatio = this.startRatio + deltaRatio; + newRatio = Math.max(this.minRatio, Math.min(this.maxRatio, newRatio)); + + this.dispatchEvent( + new CustomEvent("resize", { + detail: { splitRatio: newRatio }, + bubbles: true, + composed: true, + }), + ); + + e.preventDefault(); + }; + + private handleTouchEnd = () => { + this.isDragging = false; + this.classList.remove("dragging"); + + document.removeEventListener("touchmove", this.handleTouchMove); + document.removeEventListener("touchend", this.handleTouchEnd); + }; } declare global {
diff --git a/ui/src/ui/controllers/add-agent.ts b/ui/src/ui/controllers/add-agent.ts
new file mode 100644
index 000000000..f6a0ca90a
--- /dev/null
+++ b/ui/src/ui/controllers/add-agent.ts
@@ -0,0 +1,116 @@
+import type { GatewayBrowserClient } from "../gateway.ts";
+import type { AddAgentFormState } from "../ui-types.ts";
+
+export type AddAgentState = {
+  client: GatewayBrowserClient | null;
+  connected: boolean;
+  addAgentBusy: boolean;
+  addAgentError: string | null;
+  addAgentForm: AddAgentFormState;
+  addAgentModalOpen: boolean;
+  agentsSelectedId: string | null;
+};
+
+/** Lower-case `name`, turn each run of characters outside [a-z0-9] into "-", and strip one leading/trailing "-". */
+function slugify(name: string): string {
+  return name
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, "-")
+    .replace(/^-|-$/g, "");
+}
+
+/**
+ * Create an agent from the "add agent" form via the `agents.create` RPC.
+ *
+ * For voice agents the form's voice and greeting are appended to the new
+ * agent's IDENTITY.md. On success the new agent becomes the selected agent
+ * and the modal is closed.
+ *
+ * @param state - View state; `addAgentBusy`, `addAgentError`,
+ *   `agentsSelectedId` and `addAgentModalOpen` are mutated in place.
+ * @returns The new agent id, or `null` when there is no connected client,
+ *   a submit is already running, the name is blank, or a request fails
+ *   (the failure text is stored in `addAgentError`).
+ */
+export async function submitAddAgent(state: AddAgentState): Promise<string | null> {
+  if (!state.client || !state.connected) {
+    return null;
+  }
+  if (state.addAgentBusy) {
+    return null;
+  }
+
+  const form = state.addAgentForm;
+  const name = form.name.trim();
+  if (!name) {
+    state.addAgentError = "Agent name is required.";
+    return null;
+  }
+
+  state.addAgentBusy = true;
+  state.addAgentError = null;
+
+  try {
+    const workspace = form.workspace.trim() || `~/.openclaw/workspace/${slugify(name)}`;
+    const params: Record<string, unknown> = { name, workspace };
+    if (form.emoji.trim()) {
+      params.emoji = form.emoji.trim();
+    }
+    if (form.agentType === "voice") {
+      params.agentType = "voice";
+    }
+
+    const res = await state.client.request<{ agentId: string }>("agents.create", params);
+    if (!res?.agentId) {
+      state.addAgentError = "Failed to create agent — no ID returned.";
+      return null;
+    }
+    const agentId = res.agentId;
+
+    if (form.agentType === "voice") {
+      let existing = "";
+      try {
+        // saveAgentIdentity (save-identity.ts) reads this same RPC's payload as
+        // `file.content`. Accept both shapes so an existing IDENTITY.md is
+        // appended to rather than silently replaced.
+        const fileRes = await state.client.request<{
+          content?: string;
+          file?: { content?: string };
+        }>("agents.files.get", {
+          agentId,
+          name: "IDENTITY.md",
+        });
+        existing = fileRes?.file?.content ?? fileRes?.content ?? "";
+      } catch {
+        // File may not exist yet
+      }
+
+      const lines: string[] = [];
+      if (form.voice.trim()) {
+        lines.push(`Voice: ${form.voice.trim()}`);
+      }
+      if (form.greeting.trim()) {
+        lines.push(`Greeting: ${form.greeting.trim()}`);
+      }
+
+      if (lines.length > 0) {
+        const separator = existing.trim() ? "\n\n" : "";
+        const updated = existing.trim() + separator + lines.join("\n") + "\n";
+        await state.client.request("agents.files.set", {
+          agentId,
+          name: "IDENTITY.md",
+          content: updated,
+        });
+      }
+    }
+
+    state.agentsSelectedId = agentId;
+    state.addAgentModalOpen = false;
+    return agentId;
+  } catch (err) {
+    state.addAgentError = String(err);
+    return null;
+  } finally {
+    state.addAgentBusy = false;
+  }
+}
diff --git a/ui/src/ui/controllers/model-catalog.ts b/ui/src/ui/controllers/model-catalog.ts new file mode 100644 index 000000000..f06b4fa8b --- /dev/null +++ b/ui/src/ui/controllers/model-catalog.ts @@ -0,0 +1,28 @@ +import type { ModelCatalogEntry } from "../types.js"; + +type ModelCatalogState = { + agentModelCatalog: ModelCatalogEntry[]; + agentModelCatalogLoading: boolean; + client: { request: (method: string, params: Record) => Promise } | null; + connected: boolean; +}; + +export async function loadModelCatalog( + state: ModelCatalogState, + opts?: { refresh?: boolean }, +): Promise { + if (!state.client || !state.connected) { + return; + } + state.agentModelCatalogLoading = true; + try { + const result = (await state.client.request("models.list", { + refresh: opts?.refresh ?? 
false, + })) as { models: ModelCatalogEntry[] }; + state.agentModelCatalog = result.models; + } catch (err) { + console.warn("[model-catalog] Failed to load model catalog:", err); + } finally { + state.agentModelCatalogLoading = false; + } +}
diff --git a/ui/src/ui/controllers/save-identity.ts b/ui/src/ui/controllers/save-identity.ts
new file mode 100644
index 000000000..a7d55d0a0
--- /dev/null
+++ b/ui/src/ui/controllers/save-identity.ts
@@ -0,0 +1,113 @@
+import type { AgentFilesState } from "./agent-files.ts";
+import type { AgentIdentityState } from "./agent-identity.ts";
+import { loadAgentIdentity } from "./agent-identity.ts";
+
+export type SaveIdentityState = AgentFilesState &
+  AgentIdentityState & {
+    identityDraftName: string | null;
+    identityDraftEmoji: string | null;
+    identitySaving: boolean;
+  };
+
+/**
+ * Replace (or append) a `- Field: value` line in IDENTITY.md content.
+ * Handles markdown bold/italic wrapping on the field name.
+ *
+ * @param content - Current IDENTITY.md text (may be empty).
+ * @param field - Field label to match, case-insensitively; written back as given.
+ * @param newValue - Replacement value, inserted literally.
+ * @returns The updated content. If no matching line exists, `- field: newValue`
+ *   is appended on its own line.
+ */
+export function replaceIdentityField(content: string, field: string, newValue: string): string {
+  const pattern = new RegExp(
+    `^(\\s*-\\s+(?:\\*{1,2}|_{1,2})?)${escapeRegExp(field)}((?:\\*{1,2}|_{1,2})?\\s*:\\s*).*$`,
+    "im",
+  );
+  if (pattern.test(content)) {
+    // Use a replacer function rather than a replacement string: in a string,
+    // "$1", "$&", "$$" etc. inside user-supplied text would be expanded as
+    // substitution patterns and corrupt the saved value.
+    return content.replace(
+      pattern,
+      (_line: string, prefix: string, separator: string) =>
+        `${prefix}${field}${separator}${newValue}`,
+    );
+  }
+  // Append the field if not found
+  const trimmed = content.trimEnd();
+  const separator = trimmed ? "\n" : "";
+  return `${trimmed}${separator}- ${field}: ${newValue}\n`;
+}
+
+/** Escape regular-expression metacharacters in `str` so it matches literally. */
+function escapeRegExp(str: string): string {
+  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+/**
+ * Persist the pending identity drafts (name and/or emoji) into the agent's IDENTITY.md.
+ *
+ * Does nothing when disconnected, when a save is already running, or when
+ * both drafts are null. Failures are stored in `agentIdentityError`.
+ */
+export async function saveAgentIdentity(state: SaveIdentityState, agentId: string) {
+  if (!state.client || !state.connected || state.identitySaving) {
+    return;
+  }
+
+  const draftName = state.identityDraftName;
+  const draftEmoji = state.identityDraftEmoji;
+  if (draftName == null && draftEmoji == null) {
+    return;
+  }
+
+  state.identitySaving = true;
+  try {
+    // Read current IDENTITY.md
+    let content = "";
+    try {
+      const res = await state.client.request<{ file?: { content?: string; missing?: boolean } }>(
+        "agents.files.get",
+        { agentId, name: "IDENTITY.md" },
+      );
+      content = res?.file?.content ?? "";
+    } catch {
+      // File may not exist yet
+    }
+
+    // Apply field replacements
+    if (draftName != null) {
+      content = replaceIdentityField(content, "Name", draftName);
+    }
+    if (draftEmoji != null) {
+      content = replaceIdentityField(content, "Emoji", draftEmoji);
+    }
+
+    // Write back
+    await state.client.request("agents.files.set", {
+      agentId,
+      name: "IDENTITY.md",
+      content,
+    });
+
+    // Clear drafts
+    state.identityDraftName = null;
+    state.identityDraftEmoji = null;
+
+    // Invalidate identity cache so it reloads
+    const { [agentId]: _, ...rest } = state.agentIdentityById;
+    state.agentIdentityById = rest;
+    void loadAgentIdentity(state, agentId);
+
+    // Sync the files tab if IDENTITY.md was previously loaded
+    if (Object.hasOwn(state.agentFileContents, "IDENTITY.md")) {
+      state.agentFileContents = { ...state.agentFileContents, "IDENTITY.md": content };
+      state.agentFileDrafts = { ...state.agentFileDrafts, "IDENTITY.md": content };
+    }
+  } catch (err) {
+    state.agentIdentityError = String(err);
+  } finally {
+    state.identitySaving = false;
+  }
+}
diff --git a/ui/src/ui/dictation.test.ts b/ui/src/ui/dictation.test.ts new file mode 100644 index 000000000..09252850b --- /dev/null +++ 
b/ui/src/ui/dictation.test.ts @@ -0,0 +1,48 @@ +import { describe, expect, it } from "vitest"; +import { isDictationSupported } from "./dictation.ts"; + +describe("dictation", () => { + describe("isDictationSupported", () => { + it("returns true when getUserMedia and AudioWorkletNode are available", () => { + // In a modern browser (Chromium via Playwright), these APIs should be available. + // We're testing in a real browser environment, so we verify the function works + // with the actual browser APIs present. + expect(typeof navigator).toBe("object"); + expect(typeof navigator.mediaDevices).toBe("object"); + expect(typeof navigator.mediaDevices.getUserMedia).toBe("function"); + expect(typeof AudioWorkletNode).toBe("function"); + + // Since all APIs are available in our test browser, this should return true + expect(isDictationSupported()).toBe(true); + }); + + it("returns false when getUserMedia is not available", () => { + // The isDictationSupported function checks for multiple browser APIs. + // In a real browser (Chromium via Playwright), we can verify: + // 1. The function returns a boolean + // 2. 
The logic correctly requires all APIs to be present + + const result = isDictationSupported(); + expect(typeof result).toBe("boolean"); + + // Verify the underlying logic: all conditions must be true for support + // This helper mirrors the logic in isDictationSupported + const checkLogic = ( + hasNavigator: boolean, + hasMediaDevices: boolean, + hasGetUserMedia: boolean, + hasAudioWorklet: boolean, + ): boolean => { + return hasNavigator && hasMediaDevices && hasGetUserMedia && hasAudioWorklet; + }; + + // All true = supported + expect(checkLogic(true, true, true, true)).toBe(true); + // Any false = not supported + expect(checkLogic(false, true, true, true)).toBe(false); + expect(checkLogic(true, false, true, true)).toBe(false); + expect(checkLogic(true, true, false, true)).toBe(false); + expect(checkLogic(true, true, true, false)).toBe(false); + }); + }); +}); diff --git a/ui/src/ui/dictation.ts b/ui/src/ui/dictation.ts new file mode 100644 index 000000000..275beb0eb --- /dev/null +++ b/ui/src/ui/dictation.ts @@ -0,0 +1,481 @@ +/** + * Browser-side dictation client. + * + * Orchestrates: + * - Mic permission request via getUserMedia + * - AudioContext and AudioWorklet setup for PCM capture + * - WebSocket connection to the gateway (which proxies to Deepgram) + * - Processing Deepgram transcript responses + * - State management throughout the dictation lifecycle + */ + +/** + * Inline JavaScript source for the AudioWorklet processor. + * + * This is embedded rather than imported because: + * 1. AudioWorklet processors must be loaded as separate modules via addModule() + * 2. Bundlers don't always handle worklet files correctly + * 3. 
The TypeScript source needs to be compiled to JavaScript for browser use + * + * This must be kept in sync with audio-worklet-processor.ts + */ +const WORKLET_SOURCE = ` +const BUFFER_SIZE = 1280; // 80ms at 16kHz + +class PcmCaptureProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.buffer = new Float32Array(BUFFER_SIZE); + this.bufferIndex = 0; + } + + process(inputs, _outputs, _parameters) { + const input = inputs[0]?.[0]; + if (!input) { + return true; + } + + for (let i = 0; i < input.length; i++) { + this.buffer[this.bufferIndex++] = input[i]; + + if (this.bufferIndex >= BUFFER_SIZE) { + // Convert float32 to int16 PCM + const pcm = new Int16Array(BUFFER_SIZE); + for (let j = 0; j < BUFFER_SIZE; j++) { + const s = Math.max(-1, Math.min(1, this.buffer[j])); + pcm[j] = s < 0 ? s * 0x8000 : s * 0x7fff; + } + + // Transfer the buffer to the main thread + this.port.postMessage(pcm.buffer, [pcm.buffer]); + + // Allocate a new buffer for the next chunk + this.buffer = new Float32Array(BUFFER_SIZE); + this.bufferIndex = 0; + } + } + + return true; + } +} + +registerProcessor("pcm-capture-processor", PcmCaptureProcessor); +`; + +/** + * Possible states of the dictation client. + */ +export type DictationState = + | "idle" + | "requesting-permission" + | "connecting" + | "recording" + | "error"; + +/** + * A transcript event from the speech recognition service. + */ +export type DictationTranscript = { + /** The transcribed text */ + text: string; + /** Whether this is a final (committed) transcript vs interim */ + isFinal: boolean; + /** Whether Flux detected end-of-thought (speech_final) */ + speechFinal?: boolean; +}; + +/** + * Callbacks for dictation events. 
+ */ +export type DictationCallbacks = { + /** Called when the dictation state changes */ + onStateChange: (state: DictationState) => void; + /** Called when a transcript (interim or final) is received */ + onTranscript: (transcript: DictationTranscript) => void; + /** Called when an error occurs. Special value "permission_denied" for mic permission errors */ + onError: (error: string) => void; +}; + +/** + * Options for configuring the DictationClient. + */ +export type DictationClientOptions = { + /** The gateway URL (http/https). Will be converted to ws/wss for WebSocket. */ + gatewayUrl: string; + /** Callbacks for dictation events */ + callbacks: DictationCallbacks; + /** Deepgram model to use (default: flux-general-en) */ + model?: string; + /** Language code (default: en) */ + language?: string; +}; + +/** + * Deepgram transcript result structure. + * See: https://developers.deepgram.com/docs/results + */ +type DeepgramResult = { + type: "Results"; + /** Whether this transcript segment is final (won't be revised) */ + is_final?: boolean; + /** Whether Flux detected end-of-thought (Flux model feature) */ + speech_final?: boolean; + channel?: { + alternatives?: Array<{ + transcript?: string; + confidence?: number; + }>; + }; +}; + +type DeepgramError = { + type: "Error"; + message?: string; +}; + +type DeepgramTurnInfo = { + type: "TurnInfo"; + transcript?: string; + event?: string; +}; + +type DeepgramMessage = DeepgramResult | DeepgramError | DeepgramTurnInfo; + +const MIC_CONSTRAINTS: MediaStreamConstraints = { + audio: { + channelCount: 1, + sampleRate: 16000, + echoCancellation: true, + noiseSuppression: true, + }, +}; + +const WEBSOCKET_CONNECT_TIMEOUT_MS = 10_000; +const SAMPLE_RATE = 16000; + +/** + * Create a Blob URL for the AudioWorklet processor. + * + * AudioWorklet processors must be loaded as separate modules, but bundlers + * don't always handle this well. 
By inlining the source and creating a Blob URL, + * we ensure the worklet loads correctly in all environments. + */ +let workletBlobUrl: string | null = null; + +function getWorkletUrl(): string { + if (!workletBlobUrl) { + const blob = new Blob([WORKLET_SOURCE], { type: "application/javascript" }); + workletBlobUrl = URL.createObjectURL(blob); + } + return workletBlobUrl; +} + +/** + * Client for browser-based voice dictation. + * + * Captures audio from the microphone, streams it to the gateway WebSocket + * (which proxies to Deepgram), and receives real-time transcripts. + * + * Usage: + * ```typescript + * const client = new DictationClient({ + * gatewayUrl: "http://localhost:18789", + * callbacks: { + * onStateChange: (state) => console.log("State:", state), + * onTranscript: ({ text, isFinal }) => console.log("Transcript:", text, isFinal), + * onError: (error) => console.error("Error:", error), + * }, + * }); + * + * await client.start(); + * // ... user speaks ... + * client.stop(); + * ``` + */ +export class DictationClient { + private state: DictationState = "idle"; + private audioContext: AudioContext | null = null; + private mediaStream: MediaStream | null = null; + private workletNode: AudioWorkletNode | null = null; + private sourceNode: MediaStreamAudioSourceNode | null = null; + private ws: WebSocket | null = null; + private callbacks: DictationCallbacks; + private gatewayUrl: string; + private model: string; + private language: string; + + constructor(options: DictationClientOptions) { + this.gatewayUrl = options.gatewayUrl; + this.callbacks = options.callbacks; + this.model = options.model ?? "flux-general-en"; + this.language = options.language ?? "en"; + } + + /** + * Get the current state of the dictation client. + */ + get currentState(): DictationState { + return this.state; + } + + /** + * Start dictation. + * + * This will: + * 1. Request microphone permission + * 2. Connect to the gateway WebSocket + * 3. 
Start capturing and streaming audio + */ + async start(): Promise { + // Only start from idle or error states + if (this.state !== "idle" && this.state !== "error") { + return; + } + + this.setState("requesting-permission"); + + // Step 1: Request microphone permission + try { + this.mediaStream = await navigator.mediaDevices.getUserMedia(MIC_CONSTRAINTS); + } catch (err) { + const error = err as Error; + console.error("[dictation] mic permission error:", error.name, error.message); + if (error.name === "NotAllowedError" || error.name === "PermissionDeniedError") { + this.callbacks.onError("permission_denied"); + } else { + this.callbacks.onError(`Microphone error: ${error.message}`); + } + this.setState("error"); + return; + } + + this.setState("connecting"); + + // Step 2: Connect WebSocket and setup audio capture + try { + await this.connectWebSocket(); + await this.startAudioCapture(); + this.setState("recording"); + } catch (err) { + this.callbacks.onError(`Connection error: ${(err as Error).message}`); + this.cleanup(); + this.setState("error"); + } + } + + /** + * Stop dictation. + * + * Sends a finalize message to get any remaining transcript, + * then cleans up all resources. + */ + stop(): void { + if (this.state !== "recording") { + return; + } + + // Send finalize to flush any remaining audio/transcript + if (this.ws?.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: "Finalize" })); + } + + this.cleanup(); + this.setState("idle"); + } + + /** + * Check if dictation is currently active. 
+ */ + get isRecording(): boolean { + return this.state === "recording"; + } + + private setState(state: DictationState): void { + this.state = state; + this.callbacks.onStateChange(state); + } + + private buildWebSocketUrl(): string { + // Convert http(s):// to ws(s):// + const wsUrl = this.gatewayUrl.replace(/^http/, "ws"); + const url = new URL("/dictation/stream", wsUrl); + url.searchParams.set("model", this.model); + // Note: v2/listen doesn't support language parameter - Flux is English-only + url.searchParams.set("sample_rate", String(SAMPLE_RATE)); + return url.toString(); + }
+
+  /**
+   * Open the gateway WebSocket and resolve once it is open.
+   *
+   * Rejects when the socket reports an error or does not open within
+   * WEBSOCKET_CONNECT_TIMEOUT_MS. Every listener is bound to the socket
+   * created by this call, so events from a socket that cleanup() has already
+   * discarded cannot affect a newer session.
+   */
+  private async connectWebSocket(): Promise<void> {
+    return new Promise<void>((resolve, reject) => {
+      const ws = new WebSocket(this.buildWebSocketUrl());
+      this.ws = ws;
+
+      const timeout = window.setTimeout(() => {
+        reject(new Error("Connection timeout"));
+        ws.close();
+      }, WEBSOCKET_CONNECT_TIMEOUT_MS);
+
+      ws.addEventListener("open", () => {
+        window.clearTimeout(timeout);
+        resolve();
+      });
+
+      ws.addEventListener("error", (e) => {
+        console.error("[dictation] WebSocket error", e);
+        window.clearTimeout(timeout);
+        reject(new Error("WebSocket connection failed"));
+      });
+
+      ws.addEventListener("message", (event) => {
+        // Ignore frames from a socket that is no longer the active one
+        if (this.ws !== ws) {
+          return;
+        }
+        this.handleMessage(event.data as string);
+      });
+
+      ws.addEventListener("close", () => {
+        // cleanup() nulls this.ws and the discarded socket closes asynchronously;
+        // by then a new session may already be "recording". Only a close of the
+        // active socket while recording is an unexpected close.
+        if (this.ws === ws && this.state === "recording") {
+          this.callbacks.onError("Connection closed unexpectedly");
+          this.cleanup();
+          this.setState("error");
+        }
+      });
+    });
+  }
+
+ private async startAudioCapture(): Promise { + // Create AudioContext at 16kHz to match Deepgram's expected sample rate + this.audioContext = new AudioContext({ sampleRate: SAMPLE_RATE }); + + // Resume AudioContext if it was suspended (browser autoplay policy) + if (this.audioContext.state === "suspended") { + await this.audioContext.resume(); + } + + // Load the AudioWorklet processor from an inline Blob URL + await 
this.audioContext.audioWorklet.addModule(getWorkletUrl()); + + // Create source from the mic stream + this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream!); + + // Create the worklet node + this.workletNode = new AudioWorkletNode(this.audioContext, "pcm-capture-processor"); + + // Handle PCM data from the worklet + this.workletNode.port.addEventListener("message", (event: MessageEvent) => { + if (this.ws?.readyState === WebSocket.OPEN) { + this.ws.send(event.data); + } + }); + + // MessagePort requires explicit start() when using addEventListener + this.workletNode.port.start(); + + // Connect the audio graph: mic -> worklet + // We don't connect to destination since we don't want to play back the audio + this.sourceNode.connect(this.workletNode); + } + + private handleMessage(data: string): void { + let msg: DeepgramMessage; + try { + msg = JSON.parse(data) as DeepgramMessage; + } catch { + // Ignore non-JSON messages + return; + } + + if (msg.type === "Error") { + console.error("[dictation] error from server:", msg.message); + this.callbacks.onError(msg.message ?? "Transcription error"); + return; + } + + // v2 API sends TurnInfo messages instead of Results + if (msg.type === "TurnInfo") { + const transcript = msg.transcript ?? ""; + const isEndOfTurn = msg.event === "EndOfTurn"; + + this.callbacks.onTranscript({ + text: transcript, + isFinal: isEndOfTurn, + speechFinal: isEndOfTurn, + }); + + // Auto-stop on EndOfTurn (Flux end-of-thought detection) + if (isEndOfTurn && transcript) { + this.stop(); + } + } + + // v1 API sends Results messages (kept for backwards compatibility) + if (msg.type === "Results") { + const transcript = msg.channel?.alternatives?.[0]?.transcript ?? 
""; + + this.callbacks.onTranscript({ + text: transcript, + isFinal: Boolean(msg.is_final), + speechFinal: msg.speech_final, + }); + + // Auto-stop on speech_final (Flux end-of-thought detection) + if (msg.speech_final) { + this.stop(); + } + } + } + + private cleanup(): void { + // Disconnect audio nodes + if (this.sourceNode) { + this.sourceNode.disconnect(); + this.sourceNode = null; + } + + if (this.workletNode) { + this.workletNode.disconnect(); + this.workletNode = null; + } + + // Close AudioContext + if (this.audioContext) { + // Close can fail if already closed, ignore errors + this.audioContext.close().catch(() => {}); + this.audioContext = null; + } + + // Stop all mic tracks + if (this.mediaStream) { + for (const track of this.mediaStream.getTracks()) { + track.stop(); + } + this.mediaStream = null; + } + + // Close WebSocket + if (this.ws) { + if (this.ws.readyState === WebSocket.OPEN) { + // Send close stream message before closing + this.ws.send(JSON.stringify({ type: "CloseStream" })); + } + this.ws.close(); + this.ws = null; + } + } +} + +/** + * Check if dictation is supported in the current browser. 
+ * + * Requires: + * - navigator.mediaDevices.getUserMedia (for microphone access) + * - AudioWorkletNode (for efficient audio processing) + * + * @returns true if all required APIs are available + */ +export function isDictationSupported(): boolean { + return ( + typeof navigator !== "undefined" && + typeof navigator.mediaDevices !== "undefined" && + typeof navigator.mediaDevices.getUserMedia === "function" && + typeof AudioWorkletNode !== "undefined" + ); +} diff --git a/ui/src/ui/gateway.ts b/ui/src/ui/gateway.ts index 975cca4ab..f1a048979 100644 --- a/ui/src/ui/gateway.ts +++ b/ui/src/ui/gateway.ts @@ -28,7 +28,7 @@ export type GatewayResponseFrame = { export type GatewayHelloOk = { type: "hello-ok"; protocol: number; - features?: { methods?: string[]; events?: string[] }; + features?: { methods?: string[]; events?: string[]; dictation?: boolean }; snapshot?: unknown; auth?: { deviceToken?: string; diff --git a/ui/src/ui/icons.ts b/ui/src/ui/icons.ts index 1682dcfa9..6def7e6ec 100644 --- a/ui/src/ui/icons.ts +++ b/ui/src/ui/icons.ts @@ -124,6 +124,14 @@ export const icons = { `, + grid: html` + + + + + + + `, brain: html` @@ -155,6 +163,24 @@ export const icons = { `, + mic: html` + + + + + + `, + micOff: html` + + + + + + + + + `, + // Tool icons wrench: html` diff --git a/ui/src/ui/mobile-keyboard.ts b/ui/src/ui/mobile-keyboard.ts new file mode 100644 index 000000000..259de8028 --- /dev/null +++ b/ui/src/ui/mobile-keyboard.ts @@ -0,0 +1,35 @@ +export type KeyboardState = { + isOpen: boolean; + height: number; +}; + +type KeyboardCallback = (state: KeyboardState) => void; + +const KEYBOARD_THRESHOLD = 150; + +export function observeVirtualKeyboard(callback: KeyboardCallback): () => void { + const vv = window.visualViewport; + if (!vv) { + return () => {}; + } + + let wasOpen = false; + + const check = () => { + const heightDiff = window.innerHeight - vv.height; + const isOpen = heightDiff > KEYBOARD_THRESHOLD; + + if (isOpen !== wasOpen) { + wasOpen = isOpen; 
+ callback({ isOpen, height: isOpen ? heightDiff : 0 }); + } + }; + + vv.addEventListener("resize", check); + vv.addEventListener("scroll", check); + + return () => { + vv.removeEventListener("resize", check); + vv.removeEventListener("scroll", check); + }; +} diff --git a/ui/src/ui/mobile-tab-bar.ts b/ui/src/ui/mobile-tab-bar.ts new file mode 100644 index 000000000..62772aeea --- /dev/null +++ b/ui/src/ui/mobile-tab-bar.ts @@ -0,0 +1,87 @@ +import { html, nothing } from "lit"; +import type { AppViewState } from "./app-view-state.ts"; +import type { Tab } from "./navigation.ts"; +import { icons } from "./icons.ts"; +import { MOBILE_PRIMARY_TABS, TAB_GROUPS, iconForTab, titleForTab } from "./navigation.ts"; + +export function renderMobileTabBar( + state: AppViewState, + moreOpen: boolean, + onToggleMore: () => void, +) { + const isPrimaryTab = MOBILE_PRIMARY_TABS.includes(state.tab); + + return html` + ${icons[iconForTab(tab)]} + ${titleForTab(tab)} + + `, + )} + + + + ${ + moreOpen + ? html` +
+
+ ${TAB_GROUPS.filter((group) => + group.tabs.some((tab) => !MOBILE_PRIMARY_TABS.includes(tab as Tab)), + ).map( + (group) => html` +
+
${group.label}
+
+ ${group.tabs + .filter((tab) => !MOBILE_PRIMARY_TABS.includes(tab as Tab)) + .map( + (tab) => html` + + `, + )} +
+
+ `, + )} +
+ ` + : nothing + } + `; +} diff --git a/ui/src/ui/navigation.ts b/ui/src/ui/navigation.ts index c4208fb50..1b6419034 100644 --- a/ui/src/ui/navigation.ts +++ b/ui/src/ui/navigation.ts @@ -10,6 +10,8 @@ export const TAB_GROUPS = [ { label: "Settings", tabs: ["config", "debug", "logs"] }, ] as const; +export const MOBILE_PRIMARY_TABS: Tab[] = ["chat", "overview", "channels", "agents"]; + export type Tab = | "agents" | "overview" diff --git a/ui/src/ui/types.ts b/ui/src/ui/types.ts index 1c85b8731..ac53ec538 100644 --- a/ui/src/ui/types.ts +++ b/ui/src/ui/types.ts @@ -326,6 +326,7 @@ export type GatewaySessionsDefaults = { export type GatewayAgentRow = { id: string; name?: string; + agentType?: "text" | "voice"; identity?: { name?: string; theme?: string; @@ -764,6 +765,15 @@ export type HealthSnapshot = Record; export type LogLevel = "trace" | "debug" | "info" | "warn" | "error" | "fatal"; +export type ModelCatalogEntry = { + id: string; + name: string; + provider: string; + contextWindow?: number; + reasoning?: boolean; + input?: Array<"text" | "image">; +}; + export type LogEntry = { raw: string; time?: string | null; diff --git a/ui/src/ui/ui-types.ts b/ui/src/ui/ui-types.ts index 7ce3c7399..4f825fb79 100644 --- a/ui/src/ui/ui-types.ts +++ b/ui/src/ui/ui-types.ts @@ -34,3 +34,12 @@ export type CronFormState = { deliveryTo: string; timeoutSeconds: string; }; + +export type AddAgentFormState = { + name: string; + emoji: string; + workspace: string; + agentType: "text" | "voice"; + voice: string; + greeting: string; +}; diff --git a/ui/src/ui/views/add-agent-modal.ts b/ui/src/ui/views/add-agent-modal.ts new file mode 100644 index 000000000..40b4365cd --- /dev/null +++ b/ui/src/ui/views/add-agent-modal.ts @@ -0,0 +1,168 @@ +import { html, nothing } from "lit"; +import type { AddAgentFormState } from "../ui-types.ts"; +import { renderEmojiPicker } from "./emoji-picker.ts"; + +export type AddAgentModalProps = { + open: boolean; + form: AddAgentFormState; + busy: 
boolean; + error: string | null; + onFormChange: (patch: Partial) => void; + onSubmit: () => void; + onClose: () => void; +}; + +const DEEPGRAM_VOICES = [ + { value: "aura-2-thalia-en", label: "Thalia (English)" }, + { value: "aura-2-andromeda-en", label: "Andromeda (English)" }, + { value: "aura-2-arcas-en", label: "Arcas (English)" }, + { value: "aura-2-atlas-en", label: "Atlas (English)" }, + { value: "aura-2-luna-en", label: "Luna (English)" }, + { value: "aura-2-helios-en", label: "Helios (English)" }, + { value: "aura-2-zeus-en", label: "Zeus (English)" }, + { value: "aura-2-orpheus-en", label: "Orpheus (English)" }, + { value: "aura-2-asteria-en", label: "Asteria (English)" }, + { value: "aura-2-stella-en", label: "Stella (English)" }, + { value: "aura-2-hera-en", label: "Hera (English)" }, + { value: "aura-2-athena-en", label: "Athena (English)" }, +] as const; + +function slugify(name: string): string { + return name + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-|-$/g, ""); +} + +export function renderAddAgentModal(props: AddAgentModalProps) { + if (!props.open) { + return nothing; + } + const { form, busy } = props; + const isVoice = form.agentType === "voice"; + const placeholderWorkspace = form.name.trim() + ? 
`~/.openclaw/workspace/${slugify(form.name)}` + : "~/.openclaw/workspace/"; + + return html` + + `; +} diff --git a/ui/src/ui/views/agents.ts b/ui/src/ui/views/agents.ts index 765daa60e..53dfd6c63 100644 --- a/ui/src/ui/views/agents.ts +++ b/ui/src/ui/views/agents.ts @@ -8,6 +8,7 @@ import type { ChannelsStatusSnapshot, CronJob, CronStatus, + ModelCatalogEntry, SkillStatusEntry, SkillStatusReport, } from "../types.ts"; @@ -23,6 +24,7 @@ import { formatCronState, formatNextRun, } from "../presenter.ts"; +import { renderEmojiPicker } from "./emoji-picker.ts"; export type AgentsPanel = "overview" | "files" | "tools" | "skills" | "channels" | "cron"; @@ -54,11 +56,21 @@ export type AgentsProps = { agentIdentityLoading: boolean; agentIdentityError: string | null; agentIdentityById: Record; + identityDraftName: string | null; + identityDraftEmoji: string | null; + identitySaving: boolean; + onIdentityNameChange: (value: string) => void; + onIdentityEmojiChange: (value: string) => void; + onIdentitySave: (agentId: string) => void; + onIdentityReset: () => void; agentSkillsLoading: boolean; agentSkillsReport: SkillStatusReport | null; agentSkillsError: string | null; agentSkillsAgentId: string | null; skillsFilter: string; + modelCatalog: ModelCatalogEntry[]; + modelCatalogLoading: boolean; + onModelCatalogRefresh: () => void; onRefresh: () => void; onSelectAgent: (agentId: string) => void; onSelectPanel: (panel: AgentsPanel) => void; @@ -73,6 +85,7 @@ export type AgentsProps = { onConfigSave: () => void; onModelChange: (agentId: string, modelId: string | null) => void; onModelFallbacksChange: (agentId: string, fallbacks: string[]) => void; + onAgentTypeChange: (agentId: string, agentType: "text" | "voice") => void; onChannelsRefresh: () => void; onCronRefresh: () => void; onSkillsFilterChange: (next: string) => void; @@ -80,6 +93,7 @@ export type AgentsProps = { onAgentSkillToggle: (agentId: string, skillName: string, enabled: boolean) => void; onAgentSkillsClear: 
(agentId: string) => void; onAgentSkillsDisableAll: (agentId: string) => void; + onAddAgentOpen: () => void; }; const TOOL_SECTIONS = [ @@ -296,6 +310,7 @@ type AgentContext = { identityEmoji: string; skillsLabel: string; isDefault: boolean; + agentType: string; }; function buildAgentContext( @@ -329,6 +344,7 @@ function buildAgentContext( identityEmoji, skillsLabel: skillFilter ? `${skillCount} selected` : "all skills", isDefault: Boolean(defaultId && agent.id === defaultId), + agentType: agent.agentType === "voice" ? "Voice" : "Text", }; } @@ -451,6 +467,96 @@ function buildModelOptions(configForm: Record | null, current?: return options.map((option) => html``); } +function formatModelBadges(entry: ModelCatalogEntry): string { + const badges: string[] = []; + if (entry.reasoning) { + badges.push("[reasoning]"); + } + if (entry.input && entry.input.includes("image")) { + badges.push("[vision]"); + } + if (entry.contextWindow && entry.contextWindow > 0) { + const k = Math.round(entry.contextWindow / 1024); + badges.push(`[${k}k]`); + } + return badges.length > 0 ? ` ${badges.join(" ")}` : ""; +} + +function matchesCatalogEntry(current: string, entry: ModelCatalogEntry): boolean { + if (current === entry.id) { + return true; + } + // Check bare model id (without provider prefix) + const slashIdx = entry.id.indexOf("/"); + if (slashIdx > 0) { + const bareId = entry.id.slice(slashIdx + 1); + if (current === bareId) { + return true; + } + } + return false; +} + +function buildCatalogModelOptions( + catalog: ModelCatalogEntry[], + configForm: Record | null, + current?: string | null, +) { + // Group catalog entries by provider + const groups = new Map(); + for (const entry of catalog) { + const provider = entry.provider || "other"; + let list = groups.get(provider); + if (!list) { + list = []; + groups.set(provider, list); + } + list.push(entry); + } + + // Check if current model is in the catalog + const currentInCatalog = current + ? 
catalog.some((entry) => matchesCatalogEntry(current, entry)) + : false; + + // Include config-based models not in catalog + const configModels = resolveConfiguredModels(configForm); + const extraConfigModels = configModels.filter( + (opt) => !catalog.some((entry) => matchesCatalogEntry(opt.value, entry)), + ); + + const parts = []; + + // If current model is not in catalog or config models, add it + if (current && !currentInCatalog && !extraConfigModels.some((opt) => opt.value === current)) { + parts.push(html``); + } + + // Config-only models not found in catalog + if (extraConfigModels.length > 0) { + parts.push(html` + + ${extraConfigModels.map((opt) => html``)} + + `); + } + + // Catalog models grouped by provider + for (const [provider, entries] of groups) { + parts.push(html` + + ${entries.map((entry) => { + const badges = formatModelBadges(entry); + const label = `${entry.name}${badges}`; + return html``; + })} + + `); + } + + return parts; +} + type CompiledPattern = | { kind: "all" } | { kind: "exact"; value: string } @@ -550,9 +656,14 @@ export function renderAgents(props: AgentsProps) {
Agents
${agents.length} configured.
- +
+ + +
${ props.error @@ -567,6 +678,7 @@ export function renderAgents(props: AgentsProps) { ` : agents.map((agent) => { const badge = agentBadgeText(agent.id, defaultId); + const isVoice = agent.agentType === "voice"; const emoji = resolveAgentEmoji(agent, props.agentIdentityById[agent.id] ?? null); return html` `; }) @@ -614,13 +735,24 @@ export function renderAgents(props: AgentsProps) { agentIdentity: props.agentIdentityById[selectedAgent.id] ?? null, agentIdentityError: props.agentIdentityError, agentIdentityLoading: props.agentIdentityLoading, + identityDraftName: props.identityDraftName, + identityDraftEmoji: props.identityDraftEmoji, + identitySaving: props.identitySaving, + onIdentityNameChange: props.onIdentityNameChange, + onIdentityEmojiChange: props.onIdentityEmojiChange, + onIdentitySave: props.onIdentitySave, + onIdentityReset: props.onIdentityReset, configLoading: props.configLoading, configSaving: props.configSaving, configDirty: props.configDirty, + modelCatalog: props.modelCatalog, + modelCatalogLoading: props.modelCatalogLoading, + onModelCatalogRefresh: props.onModelCatalogRefresh, onConfigReload: props.onConfigReload, onConfigSave: props.onConfigSave, onModelChange: props.onModelChange, onModelFallbacksChange: props.onModelFallbacksChange, + onAgentTypeChange: props.onAgentTypeChange, }) : nothing } @@ -742,6 +874,13 @@ function renderAgentHeader(
${agent.id}
+ ${ + agent.agentType === "voice" + ? html` + voice + ` + : nothing + } ${badge ? html`${badge}` : nothing}
@@ -782,13 +921,24 @@ function renderAgentOverview(params: { agentIdentity: AgentIdentityResult | null; agentIdentityLoading: boolean; agentIdentityError: string | null; + identityDraftName: string | null; + identityDraftEmoji: string | null; + identitySaving: boolean; + onIdentityNameChange: (value: string) => void; + onIdentityEmojiChange: (value: string) => void; + onIdentitySave: (agentId: string) => void; + onIdentityReset: () => void; configLoading: boolean; configSaving: boolean; configDirty: boolean; + modelCatalog: ModelCatalogEntry[]; + modelCatalogLoading: boolean; + onModelCatalogRefresh: () => void; onConfigReload: () => void; onConfigSave: () => void; onModelChange: (agentId: string, modelId: string | null) => void; onModelFallbacksChange: (agentId: string, fallbacks: string[]) => void; + onAgentTypeChange: (agentId: string, agentType: "text" | "voice") => void; }) { const { agent, @@ -797,13 +947,24 @@ function renderAgentOverview(params: { agentIdentity, agentIdentityLoading, agentIdentityError, + identityDraftName, + identityDraftEmoji, + identitySaving, + onIdentityNameChange, + onIdentityEmojiChange, + onIdentitySave, + onIdentityReset, configLoading, configSaving, configDirty, + modelCatalog, + modelCatalogLoading, + onModelCatalogRefresh, onConfigReload, onConfigSave, onModelChange, onModelFallbacksChange, + onAgentTypeChange, } = params; const config = resolveAgentConfig(configForm, agent.id); const workspaceFromFiles = @@ -829,7 +990,6 @@ function renderAgentOverview(params: { config.entry?.name || "-"; const resolvedEmoji = resolveAgentEmoji(agent, agentIdentity); - const identityEmoji = resolvedEmoji || "-"; const skillFilter = Array.isArray(config.entry?.skills) ? config.entry?.skills : null; const skillCount = skillFilter?.length ?? null; const identityStatus = agentIdentityLoading @@ -854,7 +1014,23 @@ function renderAgentOverview(params: {
Identity Name
-
${identityName}
+ onIdentityNameChange((e.target as HTMLInputElement).value)} + placeholder="Agent name" + style=" + width: 100%; + font: inherit; + color: inherit; + background: var(--bg-secondary, rgba(255,255,255,0.06)); + border: 1px solid var(--border, rgba(255,255,255,0.1)); + border-radius: 4px; + padding: 4px 8px; + margin-top: 2px; + " + /> ${identityStatus ? html`
${identityStatus}
` : nothing}
@@ -862,14 +1038,57 @@ function renderAgentOverview(params: {
${isDefault ? "yes" : "no"}
+
Agent Type
+ +
+
Identity Emoji
-
${identityEmoji}
+
+ ${renderEmojiPicker({ + selected: identityDraftEmoji ?? resolvedEmoji, + disabled: identitySaving, + onSelect: (emoji) => onIdentityEmojiChange(emoji), + })} +
Skills Filter
${skillFilter ? `${skillCount} selected` : "all skills"}
+ ${ + identityDraftName != null || identityDraftEmoji != null + ? html` +
+ + +
+ ` + : nothing + }
Model Selection
@@ -893,7 +1112,15 @@ function renderAgentOverview(params: { ` } - ${buildModelOptions(configForm, effectivePrimary ?? undefined)} + ${ + modelCatalog.length > 0 + ? buildCatalogModelOptions( + modelCatalog, + configForm, + effectivePrimary ?? undefined, + ) + : buildModelOptions(configForm, effectivePrimary ?? undefined) + }
+
${context.isDefault ? "yes" : "no"}
+
+
Agent Type
+
${context.agentType}
+
`; diff --git a/ui/src/ui/views/channels.discord.ts b/ui/src/ui/views/channels.discord.ts index bd31af6c9..c07574db2 100644 --- a/ui/src/ui/views/channels.discord.ts +++ b/ui/src/ui/views/channels.discord.ts @@ -7,59 +7,52 @@ import { renderChannelConfigSection } from "./channels.config.ts"; export function renderDiscordCard(params: { props: ChannelsProps; discord?: DiscordStatus | null; - accountCountLabel: unknown; }) { - const { props, discord, accountCountLabel } = params; + const { props, discord } = params; return html` -
-
Discord
-
Bot status and channel configuration.
- ${accountCountLabel} - -
-
- Configured - ${discord?.configured ? "Yes" : "No"} -
-
- Running - ${discord?.running ? "Yes" : "No"} -
-
- Last start - ${discord?.lastStartAt ? formatRelativeTimestamp(discord.lastStartAt) : "n/a"} -
-
- Last probe - ${discord?.lastProbeAt ? formatRelativeTimestamp(discord.lastProbeAt) : "n/a"} -
+
+
+ Configured + ${discord?.configured ? "Yes" : "No"} +
+
+ Running + ${discord?.running ? "Yes" : "No"} +
+
+ Last start + ${discord?.lastStartAt ? formatRelativeTimestamp(discord.lastStartAt) : "n/a"}
+
+ Last probe + ${discord?.lastProbeAt ? formatRelativeTimestamp(discord.lastProbeAt) : "n/a"} +
+
- ${ - discord?.lastError - ? html`
- ${discord.lastError} -
` - : nothing - } + ${ + discord?.lastError + ? html`
+ ${discord.lastError} +
` + : nothing + } - ${ - discord?.probe - ? html`
- Probe ${discord.probe.ok ? "ok" : "failed"} · - ${discord.probe.status ?? ""} ${discord.probe.error ?? ""} -
` - : nothing - } + ${ + discord?.probe + ? html`
+ Probe ${discord.probe.ok ? "ok" : "failed"} · + ${discord.probe.status ?? ""} ${discord.probe.error ?? ""} +
` + : nothing + } - ${renderChannelConfigSection({ channelId: "discord", props })} + ${renderChannelConfigSection({ channelId: "discord", props })} -
- -
+
+
`; } diff --git a/ui/src/ui/views/channels.googlechat.ts b/ui/src/ui/views/channels.googlechat.ts index e0ce1ea10..391c191d9 100644 --- a/ui/src/ui/views/channels.googlechat.ts +++ b/ui/src/ui/views/channels.googlechat.ts @@ -7,73 +7,66 @@ import { renderChannelConfigSection } from "./channels.config.ts"; export function renderGoogleChatCard(params: { props: ChannelsProps; googleChat?: GoogleChatStatus | null; - accountCountLabel: unknown; }) { - const { props, googleChat, accountCountLabel } = params; + const { props, googleChat } = params; return html` -
-
Google Chat
-
Chat API webhook status and channel configuration.
- ${accountCountLabel} - -
-
- Configured - ${googleChat ? (googleChat.configured ? "Yes" : "No") : "n/a"} -
-
- Running - ${googleChat ? (googleChat.running ? "Yes" : "No") : "n/a"} -
-
- Credential - ${googleChat?.credentialSource ?? "n/a"} -
-
- Audience - - ${ - googleChat?.audienceType - ? `${googleChat.audienceType}${googleChat.audience ? ` · ${googleChat.audience}` : ""}` - : "n/a" - } - -
-
- Last start - ${googleChat?.lastStartAt ? formatRelativeTimestamp(googleChat.lastStartAt) : "n/a"} -
-
- Last probe - ${googleChat?.lastProbeAt ? formatRelativeTimestamp(googleChat.lastProbeAt) : "n/a"} -
+
+
+ Configured + ${googleChat ? (googleChat.configured ? "Yes" : "No") : "n/a"} +
+
+ Running + ${googleChat ? (googleChat.running ? "Yes" : "No") : "n/a"} +
+
+ Credential + ${googleChat?.credentialSource ?? "n/a"} +
+
+ Audience + + ${ + googleChat?.audienceType + ? `${googleChat.audienceType}${googleChat.audience ? ` · ${googleChat.audience}` : ""}` + : "n/a" + } +
+
+ Last start + ${googleChat?.lastStartAt ? formatRelativeTimestamp(googleChat.lastStartAt) : "n/a"} +
+
+ Last probe + ${googleChat?.lastProbeAt ? formatRelativeTimestamp(googleChat.lastProbeAt) : "n/a"} +
+
- ${ - googleChat?.lastError - ? html`
- ${googleChat.lastError} -
` - : nothing - } + ${ + googleChat?.lastError + ? html`
+ ${googleChat.lastError} +
` + : nothing + } - ${ - googleChat?.probe - ? html`
- Probe ${googleChat.probe.ok ? "ok" : "failed"} · - ${googleChat.probe.status ?? ""} ${googleChat.probe.error ?? ""} -
` - : nothing - } + ${ + googleChat?.probe + ? html`
+ Probe ${googleChat.probe.ok ? "ok" : "failed"} · + ${googleChat.probe.status ?? ""} ${googleChat.probe.error ?? ""} +
` + : nothing + } - ${renderChannelConfigSection({ channelId: "googlechat", props })} + ${renderChannelConfigSection({ channelId: "googlechat", props })} -
- -
+
+
`; } diff --git a/ui/src/ui/views/channels.imessage.ts b/ui/src/ui/views/channels.imessage.ts index 0010175a1..facf967c6 100644 --- a/ui/src/ui/views/channels.imessage.ts +++ b/ui/src/ui/views/channels.imessage.ts @@ -7,59 +7,52 @@ import { renderChannelConfigSection } from "./channels.config.ts"; export function renderIMessageCard(params: { props: ChannelsProps; imessage?: IMessageStatus | null; - accountCountLabel: unknown; }) { - const { props, imessage, accountCountLabel } = params; + const { props, imessage } = params; return html` -
-
iMessage
-
macOS bridge status and channel configuration.
- ${accountCountLabel} - -
-
- Configured - ${imessage?.configured ? "Yes" : "No"} -
-
- Running - ${imessage?.running ? "Yes" : "No"} -
-
- Last start - ${imessage?.lastStartAt ? formatRelativeTimestamp(imessage.lastStartAt) : "n/a"} -
-
- Last probe - ${imessage?.lastProbeAt ? formatRelativeTimestamp(imessage.lastProbeAt) : "n/a"} -
+
+
+ Configured + ${imessage?.configured ? "Yes" : "No"} +
+
+ Running + ${imessage?.running ? "Yes" : "No"} +
+
+ Last start + ${imessage?.lastStartAt ? formatRelativeTimestamp(imessage.lastStartAt) : "n/a"}
+
+ Last probe + ${imessage?.lastProbeAt ? formatRelativeTimestamp(imessage.lastProbeAt) : "n/a"} +
+
- ${ - imessage?.lastError - ? html`
- ${imessage.lastError} -
` - : nothing - } + ${ + imessage?.lastError + ? html`
+ ${imessage.lastError} +
` + : nothing + } - ${ - imessage?.probe - ? html`
- Probe ${imessage.probe.ok ? "ok" : "failed"} · - ${imessage.probe.error ?? ""} -
` - : nothing - } + ${ + imessage?.probe + ? html`
+ Probe ${imessage.probe.ok ? "ok" : "failed"} · + ${imessage.probe.error ?? ""} +
` + : nothing + } - ${renderChannelConfigSection({ channelId: "imessage", props })} + ${renderChannelConfigSection({ channelId: "imessage", props })} -
- -
+
+
`; } diff --git a/ui/src/ui/views/channels.nostr.ts b/ui/src/ui/views/channels.nostr.ts index d59ecb694..05c1a9121 100644 --- a/ui/src/ui/views/channels.nostr.ts +++ b/ui/src/ui/views/channels.nostr.ts @@ -26,7 +26,6 @@ export function renderNostrCard(params: { props: ChannelsProps; nostr?: NostrStatus | null; nostrAccounts: ChannelAccountSnapshot[]; - accountCountLabel: unknown; /** Profile form state (optional - if provided, shows form) */ profileFormState?: NostrProfileFormState | null; /** Profile form callbacks */ @@ -34,15 +33,8 @@ export function renderNostrCard(params: { /** Called when Edit Profile is clicked */ onEditProfile?: () => void; }) { - const { - props, - nostr, - nostrAccounts, - accountCountLabel, - profileFormState, - profileFormCallbacks, - onEditProfile, - } = params; + const { props, nostr, nostrAccounts, profileFormState, profileFormCallbacks, onEditProfile } = + params; const primaryAccount = nostrAccounts[0]; const summaryConfigured = nostr?.configured ?? primaryAccount?.configured ?? false; const summaryRunning = nostr?.running ?? primaryAccount?.running ?? false; @@ -183,55 +175,49 @@ export function renderNostrCard(params: { }; return html` -
-
Nostr
-
Decentralized DMs via Nostr relays (NIP-04).
- ${accountCountLabel} - - ${ - hasMultipleAccounts - ? html` -