Skip to content

Commit 306c204

Browse files
committed
fix: stabilize dynamic reasoning effort transitions and tokens use
1 parent fc1223b commit 306c204

3 files changed

Lines changed: 400 additions & 19 deletions

File tree

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import type { ReasoningEffort } from "../settings";
2+
import type { ToolCall, ToolExecutionResult } from "../tools/executor";
3+
4+
// ── Public types ──────────────────────────────────────────────
5+
6+
/**
 * Everything the effort manager needs to know about one assistant turn:
 * the tool calls the assistant requested and the outcome of running them.
 */
export type TurnInput = {
  /** Raw tool calls from the assistant response (before execution). */
  toolCalls: ToolCall[];
  /** Execution results after tool calls completed (same order as toolCalls). */
  toolExecutions: ToolExecutionResult[];
};
12+
13+
// ── Internal state ───────────────────────────────────────────
14+
15+
/** Mutable bookkeeping kept by RuntimeReasoningEffortManager between turns. */
type ManagerState = {
  /** Effort level currently in force. */
  currentEffort: ReasoningEffort;
  /** Run length of turns that were not all-OK, counted while at "high". */
  consecutiveFailures: number;
  /** Run length of turns sharing the same fingerprint, counted while at "high". */
  consecutiveIdenticalCalls: number;
  /** Fingerprint recorded for the most recent turn, or null before any turn. */
  lastFingerprint: string | null;
  /** Turns evaluated since the last effort transition (reset to 0 on each transition). */
  turnsAtCurrentEffort: number;
  /** All-OK turns with a fingerprint different from the previous turn, counted while not at "high". */
  cleanTurnStreak: number;
  /** While > 0, downgrade evaluation is skipped; set when an escalation fires. */
  downgradeCooldownRemaining: number;
  /** Clean-turn streak required before downgrading; raised after each downgrade. */
  downgradeThreshold: number;
  /** While > 0, escalation evaluation is skipped; set when a downgrade fires. */
  escalateCooldownRemaining: number;
};
26+
27+
// ── Constants ─────────────────────────────────────────────────
28+
29+
/** Number of consecutive non-OK turns at "high" that triggers escalation. */
const FAILURE_ESCALATION_THRESHOLD = 2;
/**
 * Number of consecutive identical tool calls required to trigger escalation.
 * Per spec: "≥3 consecutive tool calls with identical (name, arguments) pairs".
 */
const REPETITION_ESCALATION_THRESHOLD = 3;
/** Initial clean-turn streak required before the effort is downgraded. */
const DEFAULT_DOWNGRADE_THRESHOLD = 5;
/** Turns after an escalation during which downgrade evaluation is skipped. */
const DOWNGRADE_COOLDOWN_TURNS = 3;
/** Turns after a downgrade during which escalation evaluation is skipped. */
const ESCALATE_COOLDOWN_TURNS = 2;
38+
39+
// ── Manager ───────────────────────────────────────────────────
40+
41+
/**
 * Decides, turn by turn, whether the reasoning effort should move between
 * "high" and "max" based on tool-call outcomes.
 *
 * - While at "high", repeated failures or repeated identical tool calls
 *   escalate to "max".
 * - While at any other effort, a long enough streak of clean, non-repeating
 *   turns downgrades back to "high".
 * - Each transition starts a cooldown that suppresses the opposite
 *   transition for a few turns.
 */
export class RuntimeReasoningEffortManager {
  private state: ManagerState;

  constructor() {
    this.state = RuntimeReasoningEffortManager.createInitialState();
  }

  /**
   * Builds a stable string key for a turn's tool calls so that two turns can
   * be compared for "same calls, same arguments".
   *
   * Whitespace between JSON tokens in `arguments` is ignored, but whitespace
   * inside string literals is preserved: `{"q":"a b"}` and `{"q":"ab"}` are
   * different calls and must not share a fingerprint.
   *
   * @param toolCalls Tool calls of a single turn, in order.
   * @returns JSON text of the normalized `(name, args)` list.
   */
  static computeFingerprint(toolCalls: ToolCall[]): string {
    const normalized = toolCalls.map((tc) => ({
      name: tc.function.name,
      args: RuntimeReasoningEffortManager.stripInsignificantWhitespace(tc.function.arguments),
    }));
    return JSON.stringify(normalized);
  }

  /**
   * Feeds one completed turn into the manager.
   *
   * A turn is "all OK" only when it has at least one tool execution and every
   * execution reports `ok`; a turn with no executions is therefore treated
   * like a failing turn.
   *
   * @param input Tool calls and their execution results for the turn.
   * @returns The new effort when this turn caused a transition, otherwise null.
   */
  evaluate(input: TurnInput): ReasoningEffort | null {
    const fingerprint = RuntimeReasoningEffortManager.computeFingerprint(input.toolCalls);
    const allOk = input.toolExecutions.length > 0 && input.toolExecutions.every((e) => e.ok);

    this.state.turnsAtCurrentEffort += 1;

    const result =
      this.state.currentEffort === "high"
        ? this.evaluateEscalation(fingerprint, allOk)
        : this.evaluateDowngrade(allOk, fingerprint);

    // Only decrement cooldowns when no state change occurred.
    // If escalate()/downgrade() just fired, the new cooldown was set
    // and should NOT be decremented in the same turn.
    if (result === null) {
      this.state.downgradeCooldownRemaining = Math.max(0, this.state.downgradeCooldownRemaining - 1);
      this.state.escalateCooldownRemaining = Math.max(0, this.state.escalateCooldownRemaining - 1);
    }

    return result;
  }

  /** @returns The effort level currently in force. */
  getCurrentEffort(): ReasoningEffort {
    return this.state.currentEffort;
  }

  /** Restores the manager to its freshly-constructed state. */
  reset(): void {
    this.state = RuntimeReasoningEffortManager.createInitialState();
  }

  /** @returns A shallow copy of the internal state; mutating it does not affect the manager. */
  getState(): Readonly<ManagerState> {
    return { ...this.state };
  }

  // ── Private helpers ─────────────────────────────────────────

  /** Single source of truth for the initial state used by the constructor and reset(). */
  private static createInitialState(): ManagerState {
    return {
      currentEffort: "high",
      consecutiveFailures: 0,
      consecutiveIdenticalCalls: 0,
      lastFingerprint: null,
      turnsAtCurrentEffort: 0,
      cleanTurnStreak: 0,
      downgradeCooldownRemaining: 0,
      downgradeThreshold: DEFAULT_DOWNGRADE_THRESHOLD,
      escalateCooldownRemaining: 0,
    };
  }

  /**
   * Removes whitespace that appears outside double-quoted string literals.
   * Backslash escapes are honoured so an escaped quote does not end a string.
   * Input that is not well-formed JSON is handled best-effort: an unterminated
   * string simply keeps the rest of the text verbatim.
   */
  private static stripInsignificantWhitespace(raw: string): string {
    const whitespace = /\s/;
    let out = "";
    let inString = false;
    let escaped = false;

    for (const ch of raw) {
      if (inString) {
        out += ch;
        if (escaped) {
          escaped = false;
        } else if (ch === "\\") {
          escaped = true;
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }

      if (ch === '"') {
        inString = true;
        out += ch;
      } else if (!whitespace.test(ch)) {
        out += ch;
      }
    }

    return out;
  }

  /**
   * Escalation check, used while the effort is "high".
   * Escalates on FAILURE_ESCALATION_THRESHOLD consecutive non-OK turns or on
   * REPETITION_ESCALATION_THRESHOLD consecutive turns with the same fingerprint.
   */
  private evaluateEscalation(fingerprint: string, allOk: boolean): ReasoningEffort | null {
    // Still cooling down from a recent downgrade: do not track anything.
    if (this.state.escalateCooldownRemaining > 0) {
      return null;
    }

    if (!allOk) {
      this.state.consecutiveFailures += 1;
      // A failure breaks the "identical success" streak.
      this.state.consecutiveIdenticalCalls = 0;
      if (this.state.consecutiveFailures >= FAILURE_ESCALATION_THRESHOLD) {
        return this.escalate();
      }
    } else {
      this.state.consecutiveFailures = 0;
    }

    // `fingerprint` is always a string, so equality already implies that
    // lastFingerprint is non-null.
    if (fingerprint === this.state.lastFingerprint) {
      this.state.consecutiveIdenticalCalls += 1;
      if (this.state.consecutiveIdenticalCalls >= REPETITION_ESCALATION_THRESHOLD) {
        return this.escalate();
      }
    } else {
      // First occurrence of this fingerprint — start the streak at 1.
      // (Per spec: escalation triggers on ≥3 identical calls; the 3rd triggers.)
      this.state.consecutiveIdenticalCalls = 1;
    }

    this.state.lastFingerprint = fingerprint;
    return null;
  }

  /**
   * Downgrade check, used while the effort is not "high".
   * A turn extends the clean streak only when it is all-OK and its fingerprint
   * differs from the previous turn's; a non-OK turn resets the streak; an
   * all-OK repeat leaves the streak unchanged.
   */
  private evaluateDowngrade(allOk: boolean, fingerprint: string): ReasoningEffort | null {
    // Still cooling down from a recent escalation: only remember the fingerprint.
    if (this.state.downgradeCooldownRemaining > 0) {
      this.state.lastFingerprint = fingerprint;
      return null;
    }

    if (allOk && fingerprint !== this.state.lastFingerprint) {
      this.state.cleanTurnStreak += 1;
      if (this.state.cleanTurnStreak >= this.state.downgradeThreshold) {
        return this.downgrade();
      }
    } else if (!allOk) {
      this.state.cleanTurnStreak = 0;
    }

    this.state.lastFingerprint = fingerprint;
    return null;
  }

  /** Switches to "max", clears the streak counters and starts the downgrade cooldown. */
  private escalate(): ReasoningEffort {
    this.state.currentEffort = "max";
    this.state.consecutiveFailures = 0;
    this.state.consecutiveIdenticalCalls = 0;
    this.state.cleanTurnStreak = 0;
    this.state.downgradeCooldownRemaining = DOWNGRADE_COOLDOWN_TURNS;
    this.state.turnsAtCurrentEffort = 0;
    return "max";
  }

  /**
   * Switches back to "high", clears the streak counters, starts the escalate
   * cooldown and raises the bar for the next downgrade.
   */
  private downgrade(): ReasoningEffort {
    this.state.currentEffort = "high";
    this.state.cleanTurnStreak = 0;
    this.state.escalateCooldownRemaining = ESCALATE_COOLDOWN_TURNS;
    this.state.consecutiveFailures = 0;
    this.state.consecutiveIdenticalCalls = 0;
    this.state.turnsAtCurrentEffort = 0;
    // Back-off: the first downgrade doubles the default threshold, every later
    // one pins it at four times the default.
    if (this.state.downgradeThreshold === DEFAULT_DOWNGRADE_THRESHOLD) {
      this.state.downgradeThreshold = DEFAULT_DOWNGRADE_THRESHOLD * 2;
    } else {
      this.state.downgradeThreshold = DEFAULT_DOWNGRADE_THRESHOLD * 4;
    }
    return "high";
  }
}

0 commit comments

Comments
 (0)