Skip to content

Commit dc6c63f

Browse files
committed
fix: session restart state handling improvements
Problem 1: SDK session ID persists after restart
- After a daemon crash/restart, sdkSessionId still points to a stale server-side session
- Resuming causes the SDK to continue processing the old task instead of the new request
- Fix: clear sdkSessionId on startup and back it up to previousSdkSessionId

Problem 2: Permission timeout uses wrong context
- sharedCtx.lastContextToken gets overwritten by concurrent messages
- The timeout message is sent to the wrong user/context
- Fix: store contextToken/fromUserId in the PendingPermission itself

Also: protect code blocks when splitting long messages
1 parent df670b7 commit dc6c63f

4 files changed

Lines changed: 269 additions & 20 deletions

File tree

docs/session-restart-fix.md

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Session Restart State Handling 修复分析
2+
3+
本文档记录了 session restart 相关问题的修复分析。
4+
5+
---
6+
7+
## Problem 1: SDK session ID persists after restart
8+
9+
### 问题场景
10+
11+
```
12+
用户发送消息 → state=processing → sdkSessionId=abc123
13+
14+
服务 crash → systemd 重启 → 读取持久化 session.json
15+
16+
state=processing (卡住) + sdkSessionId=abc123 (仍存在)
17+
18+
用户发新消息 → resume: abc123 → SDK 尝试恢复 server-side 会话
19+
20+
问题:SDK 进程已死,abc123 指向的会话可能:
21+
- 已终止 → 出错
22+
- 还存在 → 继续处理旧任务(用户实际遇到的情况)
23+
```
24+
25+
### 实际影响
26+
27+
用户发送新消息后,收到的是旧任务的回复(请求和响应不匹配)。
28+
29+
### 修复方案
30+
31+
```typescript
32+
// src/main.ts - runDaemon()
33+
if (session.sdkSessionId) {
34+
logger.info('Clearing SDK session ID on restart', { accountId: account.accountId, sessionId: session.sdkSessionId });
35+
session.previousSdkSessionId = session.sdkSessionId; // 备份,便于手动恢复
36+
session.sdkSessionId = undefined; // 清除,避免 resume 继续旧任务
37+
}
38+
```
39+
40+
### 必要性评估
41+
42+
| 等级 | 说明 |
43+
|-----|-----|
44+
| **高** | 用户实际遇到的问题,严重影响用户体验 |
45+
46+
---
47+
48+
## Problem 2: Permission timeout uses wrong context
49+
50+
### 问题场景
51+
52+
```
53+
消息1 → 创建权限请求 → sharedCtx.lastContextToken = token1
54+
55+
等待用户 y/n(最多120秒)
56+
57+
消息2 到达 → sharedCtx.lastContextToken = token2 ← 覆盖了!
58+
59+
120秒超时 → 用 sharedCtx.lastContextToken = token2 发送超时消息
60+
61+
问题:超时消息发到了消息2的用户(token2),而不是消息1(token1)
62+
```
63+
64+
### 修复方案
65+
66+
修改前:依赖共享状态 `sharedCtx.lastContextToken`
67+
68+
```typescript
69+
// 旧代码
70+
const permissionBroker = createPermissionBroker(async () => {
71+
  await sender.sendText(account.userId ?? '', sharedCtx.lastContextToken, '⏰ 权限请求超时,已自动拒绝。');
72+
});
73+
```
74+
75+
修改后:将 context 存储在 PendingPermission 本身
76+
77+
```typescript
78+
// src/session.ts
79+
export interface PendingPermission {
80+
toolName: string;
81+
toolInput: string;
82+
contextToken: string; // Store context token for timeout message
83+
fromUserId: string; // Store user ID for timeout message
84+
resolve: (allowed: boolean) => void;
85+
timer: NodeJS.Timeout;
86+
}
87+
88+
// src/permission.ts
89+
function createPending(
90+
accountId: string,
91+
toolName: string,
92+
toolInput: string,
93+
contextToken: string, // 直接传入
94+
fromUserId: string, // 直接传入
95+
): Promise<boolean> {
96+
// ...
97+
pending.set(accountId, { toolName, toolInput, contextToken, fromUserId, resolve, timer });
98+
// timeout 时使用存储的 contextToken 和 fromUserId
99+
}
100+
```
101+
102+
### 必要性评估
103+
104+
| 等级 | 说明 |
105+
|-----|-----|
106+
| **中** | 边缘场景,但架构上不应依赖共享状态,多账号支持更健壮 |
107+
108+
---
109+
110+
## 修复涉及的文件
111+
112+
| 文件 | 改动 |
113+
|-----|-----|
114+
| `src/session.ts` | PendingPermission 添加 contextToken/fromUserId 字段 |
115+
| `src/permission.ts` | createPending 接收这两个参数,timeout 时使用 |
116+
| `src/main.ts` | sdkSessionId 清除 + permissionBroker callback 传递参数 + splitMessage 代码块保护 |
117+

src/main.ts

Lines changed: 140 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,102 @@ import { MessageType, type WeixinMessage } from './wechat/types.js';
2525

2626
const MAX_MESSAGE_LENGTH = 2048;
2727

28+
/**
 * Split a long message into chunks of at most `maxLen` characters.
 *
 * - Prefers cutting at the point chosen by `findBestSplitPoint`
 *   (end of a complete code block, paragraph break, or line break).
 * - If a cut lands inside a fenced code block, the fence is closed at the end
 *   of the chunk and re-opened at the start of the next one, with continuation
 *   markers on both sides. The re-opened fence carries no language tag.
 * - If no acceptable cut point exists, the text is cut hard near the limit and
 *   continuation markers are added.
 * - Every returned chunk, including the markers added here, is at most
 *   `maxLen` characters long.
 *
 * @param text   Message to split.
 * @param maxLen Maximum length of each chunk (defaults to MAX_MESSAGE_LENGTH).
 * @returns The chunks in order; `[text]` when the message already fits.
 */
function splitMessage(text: string, maxLen: number = MAX_MESSAGE_LENGTH): string[] {
  if (text.length <= maxLen) return [text];

  // Decorations added around a cut. The strings are user-visible output.
  const fenceClose = '\n```\n... (续)';
  const fenceReopen = '```\n(续) ...\n';
  const cutSuffix = '\n... (续)';
  const cutPrefix = '(续) ...\n';
  // A forced cut keeps this many characters free for the suffix that follows.
  const forcedMargin = 15;

  // With a very small limit the decorations would consume the whole chunk and
  // the loop below could never make progress, so fall back to plain slicing.
  if (maxLen < forcedMargin + fenceReopen.length + 1) {
    const step = Math.max(1, Math.floor(maxLen));
    const plain: string[] = [];
    for (let offset = 0; offset < text.length; offset += step) {
      plain.push(text.slice(offset, offset + step));
    }
    return plain;
  }

  const chunks: string[] = [];
  let remaining = text;
  // Length of the continuation prefix currently sitting at the head of
  // `remaining`; a cut must land beyond it to consume any real content.
  let carried = 0;

  while (remaining.length > 0) {
    if (remaining.length <= maxLen) {
      chunks.push(remaining);
      break;
    }

    // Defensive clamp: a cut point past the limit would yield an oversized chunk.
    let splitIdx = Math.min(findBestSplitPoint(remaining, maxLen), maxLen);

    // Closing an open fence appends `fenceClose`; if that would overflow the
    // limit, look for a cut point again with room reserved for it.
    if (
      splitIdx > carried &&
      splitIdx + fenceClose.length > maxLen &&
      isInCodeBlock(remaining.slice(0, splitIdx))
    ) {
      const budget = maxLen - fenceClose.length;
      splitIdx = Math.min(findBestSplitPoint(remaining, budget), budget);
    }

    // No usable cut point (or one that would only consume the carried prefix):
    // cut hard, leaving room for whichever suffix is appended below.
    const forced = splitIdx <= carried;
    if (forced) {
      splitIdx = maxLen - forcedMargin;
    }

    const chunk = remaining.slice(0, splitIdx);
    const rest = remaining.slice(splitIdx);

    if (isInCodeBlock(chunk)) {
      // Close the fence here and re-open it in the next chunk.
      chunks.push(chunk + fenceClose);
      remaining = fenceReopen + rest.replace(/^\n+/, '');
      carried = fenceReopen.length;
    } else if (forced) {
      chunks.push(chunk + cutSuffix);
      remaining = cutPrefix + rest;
      carried = cutPrefix.length;
    } else {
      chunks.push(chunk);
      remaining = rest.replace(/^\n+/, '');
      carried = 0;
    }
  }

  return chunks;
}
4772

73+
/**
 * Find the best index at which to cut `text` so that the leading chunk is at
 * most `maxLen` characters long, preferring cut points outside fenced code
 * blocks.
 *
 * Candidates, in order of preference:
 *  1. If position `maxLen` falls inside an open fence, the end of the last
 *     fence that closed within the first `maxLen` characters.
 *  2. The last paragraph break ("\n\n") at or before `maxLen`.
 *  3. The last line break ("\n") at or before `maxLen`.
 *
 * A candidate is accepted only when it lies beyond 30% of `maxLen`, so that a
 * chunk is never cut unreasonably short. Candidates 2 and 3 may still fall
 * inside a code block; the caller is expected to detect and handle that.
 *
 * @param text   Text that is longer than `maxLen`.
 * @param maxLen Maximum allowed length of the leading chunk.
 * @returns A cut index no greater than `maxLen`, or -1 when no acceptable cut
 *          point exists.
 */
function findBestSplitPoint(text: string, maxLen: number): number {
  const minUseful = maxLen * 0.3;

  // Only fences inside the first `maxLen` characters matter: scanning further
  // would report the fence state at a later position and could produce a cut
  // index beyond the limit.
  const head = text.slice(0, maxLen);
  const codeBlockPattern = /```/g;
  let inBlock = false;
  let lastCodeBlockEnd = -1;
  let match: RegExpExecArray | null;

  while ((match = codeBlockPattern.exec(head)) !== null) {
    inBlock = !inBlock;
    if (!inBlock) {
      // Index just past the closing fence of a complete block.
      lastCodeBlockEnd = match.index + 3;
    }
  }

  // The limit falls inside an open block: cut after the last complete block.
  if (inBlock && lastCodeBlockEnd > minUseful) {
    return lastCodeBlockEnd;
  }

  // Try to split at a paragraph boundary (double newline).
  const paragraphIdx = text.lastIndexOf('\n\n', maxLen);
  if (paragraphIdx > minUseful) {
    return paragraphIdx;
  }

  // Try to split at a single newline.
  const newlineIdx = text.lastIndexOf('\n', maxLen);
  if (newlineIdx > minUseful) {
    return newlineIdx;
  }

  // No good split point.
  return -1;
}
110+
111+
/**
 * Report whether `text` ends inside an unclosed fenced code block.
 *
 * Every triple-backtick sequence toggles the "inside a block" state, so the
 * text ends inside a block exactly when it contains an odd number of them.
 */
function isInCodeBlock(text: string): boolean {
  const fences = text.match(/```/g) ?? [];
  const endsInsideBlock = fences.length % 2 === 1;
  return endsInsideBlock;
}
123+
48124
function promptUser(question: string, defaultValue?: string): Promise<string> {
49125
return new Promise((resolve) => {
50126
const rl = createInterface({ input: process.stdin, output: process.stdout });
@@ -174,19 +250,31 @@ async function runDaemon(): Promise<void> {
174250
sessionStore.save(account.accountId, session);
175251
}
176252

177-
// Fix: reset stale non-idle state on startup (e.g. after crash)
253+
// Fix: reset stuck session state after restart (processing/waiting_permission should not persist)
178254
if (session.state !== 'idle') {
179-
logger.warn('Resetting stale session state on startup', { state: session.state });
255+
logger.warn('Session state was stuck after restart, resetting to idle', { accountId: account.accountId, previousState: session.state });
180256
session.state = 'idle';
181-
sessionStore.save(account.accountId, session);
182257
}
183258

259+
// Always clear sdkSessionId on restart since the SDK process is gone
260+
// The SDK session may still exist server-side but local state is inconsistent
261+
// Move to previousSdkSessionId for potential manual recovery
262+
if (session.sdkSessionId) {
263+
logger.info('Clearing SDK session ID on restart', { accountId: account.accountId, sessionId: session.sdkSessionId });
264+
session.previousSdkSessionId = session.sdkSessionId;
265+
session.sdkSessionId = undefined;
266+
}
267+
268+
sessionStore.save(account.accountId, session);
269+
184270
const sender = createSender(api, account.accountId);
271+
// Note: sharedCtx is kept for backward compatibility but permission timeout now uses stored context
185272
const sharedCtx = { lastContextToken: '' };
186273
const activeControllers = new Map<string, AbortController>();
187-
const permissionBroker = createPermissionBroker(async () => {
274+
// Permission broker callback now receives contextToken and fromUserId directly (fixes concurrency issue)
275+
const permissionBroker = createPermissionBroker(async (contextToken: string, fromUserId: string) => {
188276
try {
189-
await sender.sendText(account.userId ?? '', sharedCtx.lastContextToken, '⏰ 权限请求超时,已自动拒绝。');
277+
await sender.sendText(fromUserId, contextToken, '⏰ 权限请求超时,已自动拒绝。');
190278
} catch {
191279
logger.warn('Failed to send permission timeout message');
192280
}
@@ -244,6 +332,7 @@ async function handleMessage(
244332

245333
const contextToken = msg.context_token ?? '';
246334
const fromUserId = msg.from_user_id;
335+
// Update sharedCtx for backward compatibility (though permission timeout now uses stored context)
247336
sharedCtx.lastContextToken = contextToken;
248337

249338
// Extract text from items
@@ -411,6 +500,7 @@ async function sendToClaude(
411500
try {
412501
// Download image if present
413502
let images: QueryOptions['images'];
503+
let imageDownloadError: string | undefined;
414504
if (imageItem) {
415505
const base64DataUri = await downloadImage(imageItem);
416506
if (base64DataUri) {
@@ -427,10 +517,21 @@ async function sendToClaude(
427517
},
428518
},
429519
];
520+
} else {
521+
imageDownloadError = '图片格式解析失败';
522+
logger.error('Failed to parse image data URI format');
430523
}
524+
} else {
525+
imageDownloadError = '图片下载失败';
526+
logger.error('Failed to download image', { imageItem });
431527
}
432528
}
433529

530+
// Notify user if image processing failed
531+
if (imageDownloadError && !images) {
532+
await sender.sendText(fromUserId, contextToken, `⚠️ ${imageDownloadError},将以纯文字模式处理。`);
533+
}
534+
434535
const effectivePermissionMode = session.permissionMode ?? config.permissionMode;
435536
const isAutoPermission = effectivePermissionMode === 'auto';
436537

@@ -482,18 +583,36 @@ async function sendToClaude(
482583
session.state = 'waiting_permission';
483584
sessionStore.save(account.accountId, session);
484585

485-
// Create pending permission
586+
// Create pending permission (includes context for timeout message - fixes concurrency)
486587
const permissionPromise = permissionBroker.createPending(
487588
account.accountId,
488589
toolName,
489590
toolInput,
591+
contextToken,
592+
fromUserId,
490593
);
491594

492595
// Send permission message to WeChat
493596
const perm = permissionBroker.getPending(account.accountId);
494597
if (perm) {
495-
const permMsg = permissionBroker.formatPendingMessage(perm);
496-
await sender.sendText(fromUserId, contextToken, permMsg);
598+
try {
599+
const permMsg = permissionBroker.formatPendingMessage(perm);
600+
await sender.sendText(fromUserId, contextToken, permMsg);
601+
} catch (sendErr) {
602+
// If we can't send the permission request, we must fail the permission
603+
// otherwise the SDK will hang indefinitely
604+
logger.error('Failed to send permission request to WeChat', { error: sendErr instanceof Error ? sendErr.message : String(sendErr) });
605+
permissionBroker.resolvePermission(account.accountId, false);
606+
session.state = 'processing';
607+
sessionStore.save(account.accountId, session);
608+
return false;
609+
}
610+
} else {
611+
// Should not happen: pending permission not found after creation
612+
logger.error('Pending permission not found after creation');
613+
session.state = 'processing';
614+
sessionStore.save(account.accountId, session);
615+
return false;
497616
}
498617

499618
const allowed = await permissionPromise;
@@ -536,7 +655,12 @@ async function sendToClaude(
536655
}
537656
} else if (result.error) {
538657
logger.error('Claude query error', { error: result.error });
539-
await sender.sendText(fromUserId, contextToken, '⚠️ Claude 处理请求时出错,请稍后重试。');
658+
// Check if it's a resume-related error that might be recoverable
659+
const isResumeError = result.error.includes('session') || result.error.includes('resume');
660+
const userMsg = isResumeError
661+
? '⚠️ 会话状态异常,已自动重置。请重新发送你的请求。'
662+
: '⚠️ Claude 处理请求时出错,请稍后重试。';
663+
await sender.sendText(fromUserId, contextToken, userMsg);
540664
} else if (!anySent) {
541665
await sender.sendText(fromUserId, contextToken, 'ℹ️ Claude 无返回内容(可能因权限被拒而终止)');
542666
}
@@ -584,4 +708,4 @@ if (command === 'setup') {
584708
console.error('启动失败:', err);
585709
process.exit(1);
586710
});
587-
}
711+
}

src/permission.ts

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@ import type { PendingPermission } from './session.js';
44
const PERMISSION_TIMEOUT = 120_000;
55
const GRACE_PERIOD = 15_000;
66

7-
export type OnPermissionTimeout = () => void;
7+
export type OnPermissionTimeout = (contextToken: string, fromUserId: string) => void;
88

99
export function createPermissionBroker(onTimeout?: OnPermissionTimeout) {
1010
const pending = new Map<string, PendingPermission>();
1111
const timedOut = new Map<string, number>(); // accountId → timestamp
1212

13-
function createPending(accountId: string, toolName: string, toolInput: string): Promise<boolean> {
13+
function createPending(
14+
accountId: string,
15+
toolName: string,
16+
toolInput: string,
17+
contextToken: string,
18+
fromUserId: string,
19+
): Promise<boolean> {
1420
// Clear any existing pending permission for this account to prevent timer leak
1521
const existing = pending.get(accountId);
1622
if (existing) {
@@ -29,10 +35,10 @@ export function createPermissionBroker(onTimeout?: OnPermissionTimeout) {
2935
// Clean up grace period entry after GRACE_PERIOD
3036
setTimeout(() => timedOut.delete(accountId), GRACE_PERIOD);
3137
resolve(false);
32-
onTimeout?.();
38+
onTimeout?.(contextToken, fromUserId);
3339
}, PERMISSION_TIMEOUT);
3440

35-
pending.set(accountId, { toolName, toolInput, resolve, timer });
41+
pending.set(accountId, { toolName, toolInput, contextToken, fromUserId, resolve, timer });
3642
});
3743
}
3844

0 commit comments

Comments
 (0)