From d03072dd493ffe2abb50dce5e5ea38468445c36f Mon Sep 17 00:00:00 2001
From: clairewangjia <clairewangjia@users.noreply.github.com>
Date: Sun, 12 Apr 2026 15:36:32 +0800
Subject: [PATCH] feat: support receiving voice messages via ASR transcription
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract voice text from incoming WeChat voice messages using the ASR
transcription provided by the iLink API. Voice messages now appear as
"[语音] <transcribed text>" and are processed by Claude like normal text.

- Read the transcription from VoiceItem's `voice_text`, falling back to
  `text` (the field name the API actually returns)
- Add an optional `media` field (`encrypt_query_param`, optional `aes_key`)
  to the VoiceItem type for CDN data compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/wechat/media.ts | 8 +++++++-
 src/wechat/types.ts | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/wechat/media.ts b/src/wechat/media.ts
index fe053b4..807229b 100644
--- a/src/wechat/media.ts
+++ b/src/wechat/media.ts
@@ -73,9 +73,15 @@ export async function downloadImage(item: MessageItem): Promise<string | null> {
 
 /**
  * Extract text content from a message item.
- * Returns text_item.text or empty string.
+ * Handles text items and voice items (using ASR transcription).
  */
 export function extractText(item: MessageItem): string {
+  if (item.type === MessageItemType.VOICE) {
+    const voiceText = item.voice_item?.voice_text || item.voice_item?.text;
+    if (voiceText) {
+      return `[语音] ${voiceText}`;
+    }
+  }
   return item.text_item?.text ?? '';
 }
 
diff --git a/src/wechat/types.ts b/src/wechat/types.ts
index df135fe..46b255e 100644
--- a/src/wechat/types.ts
+++ b/src/wechat/types.ts
@@ -48,7 +48,9 @@ export interface ImageItem {
 
 export interface VoiceItem {
   cdn_media: CDNMedia;
+  media?: { encrypt_query_param: string; aes_key?: string };
   voice_text?: string;
+  text?: string;
 }
 
 export interface FileItem {