diff --git a/scripts/benchmark-glm-thinking.ts b/scripts/benchmark-glm-thinking.ts
new file mode 100644
index 00000000000..4688a7e27cc
--- /dev/null
+++ b/scripts/benchmark-glm-thinking.ts
@@ -0,0 +1,251 @@
+#!/usr/bin/env bun
+
+import { execSync } from "node:child_process"
+
+const DEFAULT_MODEL = "z-ai/glm-5.1"
+const DEFAULT_ITERATIONS = 3
+const FACTORY_TEST_COMMAND =
+  "bun test src/agents/glm-thinking-benchmark.test.ts src/agents/types.test.ts 2>&1"
+const BENCHMARK_PROMPT = `Analyze this function and explain its time complexity, then suggest an optimization:
+
+function findPairs(arr: number[], target: number): [number, number][] {
+  const pairs: [number, number][] = []
+  for (let i = 0; i < arr.length; i++) {
+    for (let j = i + 1; j < arr.length; j++) {
+      if (arr[i] + arr[j] === target) {
+        pairs.push([arr[i], arr[j]])
+      }
+    }
+  }
+  return pairs
+}`
+
+/** Outcome of one model call; `error` carries the reason when nothing was measured. */
+interface BenchmarkResult {
+  model: string
+  thinkingEnabled: boolean
+  iteration: number
+  timeToFirstTokenMs: number | null
+  totalTimeMs: number
+  thinkingTokens: number | null
+  responseTokens: number | null
+  error: string | null
+}
+
+/** Pass/fail counts from the factory test run; `failed` is -1 when no summary could be read. */
+interface FactoryTestResults {
+  totalTests: number
+  passed: number
+  failed: number
+}
+
+/** JSON document written to stdout at the end of a run. */
+interface BenchmarkSummary {
+  model: string
+  timestamp: string
+  gitBranch: string
+  gitCommit: string
+  thinkingOn: {
+    avgTotalTimeMs: number
+    avgTTFTMs: number | null
+    avgThinkingTokens: number | null
+    results: BenchmarkResult[]
+  }
+  thinkingOff: {
+    avgTotalTimeMs: number
+    avgTTFTMs: number | null
+    results: BenchmarkResult[]
+  }
+  factoryTestResults: FactoryTestResults
+}
+
+/**
+ * Reads `--model <id>` and `--iterations <n>` from the command line.
+ *
+ * A flag with no value after it is ignored. `--iterations` must be a positive
+ * integer; anything else keeps the default, since a NaN or non-positive count
+ * would make the benchmark loop run zero times without any warning.
+ *
+ * @param argv Arguments to parse; defaults to the process arguments.
+ * @returns The model id and iteration count to benchmark with.
+ */
+function parseArgs(argv: string[] = process.argv.slice(2)): { model: string; iterations: number } {
+  let model = DEFAULT_MODEL
+  let iterations = DEFAULT_ITERATIONS
+
+  for (let i = 0; i < argv.length; i++) {
+    const flag = argv[i]
+    const value = argv[i + 1]
+    if (!value) continue
+
+    if (flag === "--model") {
+      model = value
+      i++
+    } else if (flag === "--iterations") {
+      const parsed = Number(value)
+      if (Number.isInteger(parsed) && parsed > 0) {
+        iterations = parsed
+      } else {
+        console.error(`Ignoring invalid --iterations "${value}"; using ${DEFAULT_ITERATIONS}`)
+      }
+      i++
+    }
+  }
+
+  return { model, iterations }
+}
+
+/** Arithmetic mean of `values`; 0 for an empty list rather than NaN. */
+function average(values: number[]): number {
+  if (values.length === 0) return 0
+  return values.reduce((a, b) => a + b, 0) / values.length
+}
+
+/** Mean of the non-null samples, or null when there are none. */
+function averageOrNull(values: (number | null)[]): number | null {
+  const present = values.filter((value): value is number => value !== null)
+  return present.length > 0 ? average(present) : null
+}
+
+/** Current branch and short commit hash, or "unknown" for both when git fails. */
+async function getGitInfo(): Promise<{ branch: string; commit: string }> {
+  try {
+    const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim()
+    const commit = execSync("git rev-parse --short HEAD", { encoding: "utf-8" }).trim()
+    return { branch, commit }
+  } catch {
+    return { branch: "unknown", commit: "unknown" }
+  }
+}
+
+/**
+ * Pulls the pass/fail counts out of test-runner output.
+ *
+ * Each count is matched on its own, so the summary is found whether "N pass"
+ * and "N fail" share a line or sit on separate lines.
+ *
+ * @returns The counts, or null when either one is missing.
+ */
+function parseTestCounts(output: string): FactoryTestResults | null {
+  const passMatch = output.match(/(\d+) pass/)
+  const failMatch = output.match(/(\d+) fail/)
+  if (!passMatch || !failMatch) return null
+
+  const passed = Number(passMatch[1])
+  const failed = Number(failMatch[1])
+  return { totalTests: passed + failed, passed, failed }
+}
+
+/** Returns the stdout captured on an error thrown by `execSync`, if any. */
+function capturedStdout(error: unknown): string | null {
+  if (typeof error !== "object" || error === null || !("stdout" in error)) return null
+  return typeof error.stdout === "string" ? error.stdout : null
+}
+
+/**
+ * Runs the factory config tests and reports how many passed and failed.
+ *
+ * A non-zero exit makes `execSync` throw; the counts are then recovered from
+ * the output attached to the error. `failed: -1` is returned only when the
+ * run left no readable summary.
+ */
+async function runFactoryBenchmark(): Promise<FactoryTestResults> {
+  try {
+    const output = execSync(FACTORY_TEST_COMMAND, { encoding: "utf-8" })
+    return parseTestCounts(output) ?? { totalTests: 0, passed: 0, failed: 0 }
+  } catch (error: unknown) {
+    const output = capturedStdout(error)
+    const counts = output === null ? null : parseTestCounts(output)
+    return counts ?? { totalTests: 0, passed: 0, failed: -1 }
+  }
+}
+
+/**
+ * Placeholder for a real model call: no request is made, and every result
+ * carries an explanatory `error` with zeroed timings.
+ */
+async function callModelDirect(_model: string, _prompt: string, _thinking: boolean): Promise<BenchmarkResult> {
+  return {
+    model: _model,
+    thinkingEnabled: _thinking,
+    iteration: 0,
+    timeToFirstTokenMs: null,
+    totalTimeMs: 0,
+    thinkingTokens: null,
+    responseTokens: null,
+    error: "Direct API calls require OpenCode runtime. Use factory benchmark results for config verification.",
+  }
+}
+
+/** Runs both benchmark phases; progress goes to stderr, the JSON summary to stdout. */
+async function main(): Promise<void> {
+  const { model, iterations } = parseArgs()
+  const git = await getGitInfo()
+
+  console.error(`\n=== GLM Thinking Benchmark ===`)
+  console.error(`Model: ${model}`)
+  console.error(`Iterations: ${iterations}`)
+  console.error(`Branch: ${git.branch} (${git.commit})`)
+  console.error()
+
+  console.error("Phase 1: Factory config correctness benchmark...")
+  const factoryResults = await runFactoryBenchmark()
+  console.error(`  Factory tests: ${factoryResults.passed}/${factoryResults.totalTests} passed`)
+  if (factoryResults.failed < 0) {
+    console.error("  Factory test run produced no readable summary")
+  }
+
+  console.error("\nPhase 2: Runtime benchmark (requires OpenCode runtime)...")
+  const thinkingOnResults: BenchmarkResult[] = []
+  const thinkingOffResults: BenchmarkResult[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    const onResult = await callModelDirect(model, BENCHMARK_PROMPT, true)
+    onResult.iteration = i + 1
+    thinkingOnResults.push(onResult)
+
+    const offResult = await callModelDirect(model, BENCHMARK_PROMPT, false)
+    offResult.iteration = i + 1
+    thinkingOffResults.push(offResult)
+  }
+
+  const summary: BenchmarkSummary = {
+    model,
+    timestamp: new Date().toISOString(),
+    gitBranch: git.branch,
+    gitCommit: git.commit,
+    thinkingOn: {
+      avgTotalTimeMs: average(thinkingOnResults.map(r => r.totalTimeMs)),
+      avgTTFTMs: averageOrNull(thinkingOnResults.map(r => r.timeToFirstTokenMs)),
+      avgThinkingTokens: averageOrNull(thinkingOnResults.map(r => r.thinkingTokens)),
+      results: thinkingOnResults,
+    },
+    thinkingOff: {
+      avgTotalTimeMs: average(thinkingOffResults.map(r => r.totalTimeMs)),
+      avgTTFTMs: averageOrNull(thinkingOffResults.map(r => r.timeToFirstTokenMs)),
+      results: thinkingOffResults,
+    },
+    factoryTestResults: factoryResults,
+  }
+
+  console.log(JSON.stringify(summary, null, 2))
+
+  console.error("\n=== Summary ===")
+  console.error(`Factory benchmark: ${factoryResults.passed}/${factoryResults.totalTests} tests passed`)
+  console.error(`  - GLM-5+ text models: thinking enabled, NO budgetTokens`)
+  console.error(`  - Claude models: thinking enabled with budgetTokens`)
+  console.error(`  - GPT models: reasoningEffort, no thinking`)
+  console.error(`  - GLM VLM models: default path (budgetTokens)`)
+  console.error()
+  console.error("Runtime benchmark: skipped (requires OpenCode runtime)")
+  console.error("  To run runtime benchmark manually:")
+  console.error(`    opencode --model ${model} --prompt 'Explain time complexity of this function: ...'`)
+  console.error()
+  console.error("Full results written to stdout (JSON)")
+}
+
+// Report an unexpected failure and make the process exit non-zero.
+main().catch((error: unknown) => {
+  console.error(error)
+  process.exitCode = 1
+})
diff --git a/src/agents/glm-prompt-quality.test.ts b/src/agents/glm-prompt-quality.test.ts
new file mode 100644
index 00000000000..addd2e20373
--- /dev/null
+++ b/src/agents/glm-prompt-quality.test.ts
@@ -0,0 +1,268 @@
+/**
+ * GLM Sisyphus Prompt Quality Benchmark
+ *
+ * Measures prompt quality across 3 dimensions:
+ * 1. Instruction Compliance (ability to carry out task instructions)
+ * 2. Speed — delegation-first, concise thinking, parallel dispatch
+ * 3. Accuracy — verification tiers, error recovery, evidence requirements
+ *
+ * Compares the GLM-specific prompt builder output against minimum quality thresholds.
+ */ +import { describe, test, expect } from "bun:test" +import { createSisyphusAgent } from "./sisyphus" +import { createSisyphusJuniorAgentWithOverrides as createSJ } from "./sisyphus-junior/agent" + +const GLM_MODEL = "zai/glm-5.1" +const GLM_MODELS = [ + "zai/glm-5", + "zai/glm-5.1", + "zai/glm-5-turbo", + "opencode-go/glm5-turbo", +] as const + +function getGlmPrompt(model: string): string { + return createSisyphusAgent(model).prompt ?? "" +} + +function getSjPrompt(model: string): string { + return createSJ({ model }).prompt ?? "" +} + +function estimateTokenCount(text: string): number { + return Math.ceil(text.length / 3.5) +} + +describe("GLM Prompt Quality: Instruction Compliance", () => { + test("#given GLM Sisyphus prompt #then contains DISPATCH→DELEGATE→COLLECT→SYNTHESIZE execution loop", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("DISPATCH") + expect(prompt).toContain("DELEGATE") + expect(prompt).toContain("COLLECT") + expect(prompt).toContain("SYNTHESIZE") + expect(prompt).toContain("DONE") + }) + + test("#given GLM Sisyphus prompt #then mandates delegation before self-implementation", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/delegate.*before|dispatch.*first|delegation.*default/i) + expect(prompt).toMatch(/self.?implement.*only.*trivial|trivially simple/i) + }) + + test("#given GLM Sisyphus prompt #then includes re-entry rule to avoid re-verbalization", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("re_entry_rule") + expect(prompt).toMatch(/confirmation turn.*do not.*preamble|skip the preamble/i) + expect(prompt).toMatch(/already.*context.*return it|already.*context.*do not.*search/i) + }) + + test("#given GLM Sisyphus prompt #then includes working memory slice system", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("Small_Context_Working_Memory") + expect(prompt).toContain(".sisyphus/state/") + 
expect(prompt).toContain("goal.md") + expect(prompt).toContain("decisions.md") + expect(prompt).toContain("verification.md") + }) + + test("#given GLM Sisyphus prompt #then includes vision constraint for text-only models", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/GLM.*text.?only|text.?only.*models/i) + expect(prompt).toMatch(/multimodal.?looker|delegate.*visual/i) + }) + + test("#given GLM Sisyphus prompt #then includes tiered verification system", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("verification_tiers") + expect(prompt).toMatch(/V1.*trivial/i) + expect(prompt).toMatch(/V2.*moderate/i) + expect(prompt).toMatch(/V3.*full.*rigor/i) + }) + + test("#given GLM Sisyphus prompt #then includes Hephaestus delegation for heavy work", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Hephaestus/i) + expect(prompt).toMatch(/deep.?thinking.*worker|autonomous.*worker/i) + expect(prompt).toMatch(/more than 3 sequential self-edits.*delegate|delegate.*Hephaestus instead/i) + }) + + for (const model of GLM_MODELS) { + test(`#given ${model} #then GLM-specific prompt used (not default overlay)`, () => { + const prompt = getGlmPrompt(model) + + expect(prompt).toContain("DISPATCH") + expect(prompt).not.toContain("Phase 2B - Implementation") + }) + } +}) + +describe("GLM Prompt Quality: Speed", () => { + test("#given GLM Sisyphus prompt #then contains concise thinking mandate", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/think briefly|concise.*thinking|thinking.*concise/i) + expect(prompt).toMatch(/delegate before deep.?div/i) + }) + + test("#given GLM Sisyphus prompt #then contains exploration budget with hard stops", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("exploration_budget") + expect(prompt).toMatch(/hard stop/i) + expect(prompt).toMatch(/two.*parallel wave|at most two/i) + }) + + test("#given GLM Sisyphus 
prompt #then contains parallel dispatch mandate", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("parallel_dispatch") + expect(prompt).toMatch(/Fire ALL.*background.*FIRST/i) + expect(prompt).toMatch(/One wave.*sequential/i) + }) + + test("#given GLM Sisyphus prompt #then contains token economy for unconstrained reasoning", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("token_economy") + expect(prompt).toMatch(/budgetTokens.*unsupported|unconstrained/i) + expect(prompt).toMatch(/restraint.*behavior|prompt.*level.*restraint/i) + }) + + test("#given GLM Sisyphus prompt #then intent classification is one-line, not full analysis", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/one.?line intent|classify intent in one line/i) + expect(prompt).toContain("Intent routes:") + }) + + test("#given GLM SJ prompt #then contains brief thinking mandate", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Think concisely|Brief Thinking/i) + expect(prompt).toMatch(/Execute immediately|No deliberation/i) + }) + + test("#given GLM SJ prompt #then exploration is capped at 2 iterations", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/2 search iteration|capped at 2/i) + }) + + test("#given GLM SJ prompt #then contains tiered verification for speed", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/V1.*trivial|V1.*lsp_diagnostics/i) + expect(prompt).toMatch(/V2.*moderate/i) + expect(prompt).toMatch(/V3.*broad/i) + expect(prompt).toMatch(/Stop after the first successful verification/i) + }) + + test("#given GLM prompt #then prompt length is within reasonable bounds", () => { + const prompt = getGlmPrompt(GLM_MODEL) + const tokens = estimateTokenCount(prompt) + + expect(tokens).toBeGreaterThan(2000) + expect(tokens).toBeLessThan(15000) + }) +}) + +describe("GLM Prompt Quality: Accuracy", () => { + test("#given GLM 
Sisyphus prompt #then requires evidence-based verification", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/verification.*mandatory|mandatory.*verification/i) + expect(prompt).toMatch(/Diagnostics clean.*only after tool output|verification evidence.*concrete/i) + }) + + test("#given GLM Sisyphus prompt #then requires reading subagent output before trusting", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Read enough.*verify|do not trust subagent summaries blindly/i) + }) + + test("#given GLM Sisyphus prompt #then includes failure recovery protocol", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/failure recovery|Fix root causes/i) + expect(prompt).toMatch(/One retry.*V1|up to two.*V2/i) + expect(prompt).toMatch(/consult Oracle/i) + }) + + test("#given GLM Sisyphus prompt #then scope discipline prevents over-implementation", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Self-implement only trivial|trivial local work/i) + }) + + test("#given GLM Sisyphus prompt #then includes hard blocks and anti-patterns", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/as any|@ts-ignore|@ts-expect-error/i) + expect(prompt).toMatch(/Never leave.*broken/i) + }) + + test("#given GLM Sisyphus prompt #then asks clarification only when materially necessary", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/ask.*only when missing.*materially|materially change.*outcome/i) + }) + + test("#given GLM SJ prompt #then contains scope discipline", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/EXACTLY.*ONLY.*delegated|No extra features.*no scope creep/i) + }) + + test("#given GLM SJ prompt #then contains re-entry rule for context reuse", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/re.?entry.*rule|Re-entry Rule/i) + expect(prompt).toMatch(/already.*context.*use 
it|Do not re.?search.*re.?derive/i) + }) +}) + +describe("GLM Prompt Quality: Cross-Agent Consistency", () => { + test("#given GLM Sisyphus + SJ #then both have vision constraint", () => { + const sisyphusPrompt = getGlmPrompt(GLM_MODEL) + const sjPrompt = getSjPrompt(GLM_MODEL) + + expect(sisyphusPrompt).toMatch(/text.?only/i) + expect(sjPrompt).toMatch(/text.?only|GLM.*CANNOT.*images/i) + }) + + test("#given GLM Sisyphus + SJ #then both have working memory reference", () => { + const sisyphusPrompt = getGlmPrompt(GLM_MODEL) + const sjPrompt = getSjPrompt(GLM_MODEL) + + expect(sisyphusPrompt).toContain("Small_Context_Working_Memory") + expect(sjPrompt).toContain("Small_Context_Working_Memory") + }) + + test("#given GLM Sisyphus + SJ #then both have tiered verification", () => { + const sisyphusPrompt = getGlmPrompt(GLM_MODEL) + const sjPrompt = getSjPrompt(GLM_MODEL) + + expect(sisyphusPrompt).toMatch(/V1.*trivial|V1.*trivial.*local/i) + expect(sisyphusPrompt).toMatch(/V2.*moderate/i) + expect(sisyphusPrompt).toMatch(/V3.*full.*rigor|V3.*broad/i) + expect(sjPrompt).toMatch(/V1 trivial change|V1.*lsp_diagnostics.*changed file/i) + expect(sjPrompt).toMatch(/V2 moderate change/i) + expect(sjPrompt).toMatch(/V3 broad.*risky change/i) + }) + + test("#given GLM factory config #then thinking is enabled without budgetTokens for all models", () => { + for (const model of GLM_MODELS) { + const sisyphus = createSisyphusAgent(model) + const sj = createSJ({ model }) + + expect(sisyphus.thinking).toEqual({ type: "enabled" }) + expect(sj.thinking).toEqual({ type: "enabled" }) + } + }) +}) diff --git a/src/agents/glm-thinking-benchmark.test.ts b/src/agents/glm-thinking-benchmark.test.ts new file mode 100644 index 00000000000..7fa0c6c22a4 --- /dev/null +++ b/src/agents/glm-thinking-benchmark.test.ts @@ -0,0 +1,214 @@ +/** + * GLM Thinking Configuration Benchmark + * + * Verifies thinking config across all agents for every GLM model variant. 
+ * This is the factory-level correctness benchmark for the tune/glm-performance branch. + * + * Categories tested: + * 1. GLM-5+ text models → thinking: { type: "enabled" } (no budgetTokens) + * 2. GLM VLM models → no thinking override (default path) + * 3. Claude models → thinking: { type: "enabled", budgetTokens: 32000 } + * 4. GPT models → reasoningEffort (no thinking) + */ +import { describe, test, expect } from "bun:test" +import { createOracleAgent } from "./oracle" +import { createMetisAgent } from "./metis" +import { createMomusAgent } from "./momus" +import { createSisyphusAgent } from "./sisyphus" +import { createSisyphusJuniorAgentWithOverrides as createSisyphusJuniorAgent } from "./sisyphus-junior/agent" + +const GLM_TEXT_MODELS = [ + "z-ai/glm-5", + "opencode/glm-5", + "opencode-go/glm-5", + "zai-coding-plan/glm-5", + "z-ai/glm-5.1", + "z-ai/glm-5-turbo", + "vercel/zai/glm-5", +] as const + +const GLM_VLM_MODELS = [ + "opencode/glm-4.6v", + "zai-coding-plan/glm-4.6v", + "opencode/glm-5v-turbo", + "opencode-go/glm5v-turbo", +] as const + +const CLAUDE_MODELS = [ + "anthropic/claude-opus-4-7", + "anthropic/claude-sonnet-4-6", +] as const + +const GPT_MODELS = [ + "openai/gpt-5.4", + "openai/gpt-5.5", +] as const + +function hasBudgetTokens(thinking: unknown): boolean { + return typeof thinking === "object" + && thinking !== null + && "budgetTokens" in (thinking as Record) +} + +describe("GLM Thinking Benchmark: Sisyphus", () => { + for (const model of GLM_TEXT_MODELS) { + test(`#given ${model} #then thinking enabled without budgetTokens`, () => { + const config = createSisyphusAgent(model) + expect(config.thinking).toEqual({ type: "enabled" }) + expect(hasBudgetTokens(config.thinking)).toBe(false) + }) + } + + for (const model of CLAUDE_MODELS) { + test(`#given ${model} #then thinking enabled with budgetTokens`, () => { + const config = createSisyphusAgent(model) + expect(config.thinking).toBeDefined() + expect((config.thinking as 
Record).type).toBe("enabled") + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of GPT_MODELS) { + test(`#given ${model} #then uses reasoningEffort not thinking`, () => { + const config = createSisyphusAgent(model) + expect(config.reasoningEffort).toBeDefined() + }) + } +}) + +describe("GLM Thinking Benchmark: Sisyphus-Junior", () => { + for (const model of GLM_TEXT_MODELS) { + test(`#given ${model} #then thinking enabled without budgetTokens`, () => { + const config = createSisyphusJuniorAgent({ model }) + expect(config.thinking).toEqual({ type: "enabled" }) + expect(hasBudgetTokens(config.thinking)).toBe(false) + }) + } + + for (const model of CLAUDE_MODELS) { + test(`#given ${model} #then thinking enabled with budgetTokens`, () => { + const config = createSisyphusJuniorAgent({ model }) + expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of GPT_MODELS) { + test(`#given ${model} #then uses reasoningEffort not thinking`, () => { + const config = createSisyphusJuniorAgent({ model }) + expect(config.reasoningEffort).toBeDefined() + }) + } +}) + +describe("GLM Thinking Benchmark: Oracle", () => { + for (const model of GLM_TEXT_MODELS) { + test(`#given ${model} #then thinking enabled without budgetTokens`, () => { + const config = createOracleAgent(model) + expect(config.thinking).toEqual({ type: "enabled" }) + expect(hasBudgetTokens(config.thinking)).toBe(false) + }) + } + + for (const model of GLM_VLM_MODELS) { + test(`#given ${model} (VLM) #then falls to default thinking with budgetTokens`, () => { + const config = createOracleAgent(model) + // VLM models are NOT thinking models, so they get the default path + expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of CLAUDE_MODELS) { + test(`#given ${model} #then thinking enabled with budgetTokens`, () => { + const config = 
createOracleAgent(model) + expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of GPT_MODELS) { + test(`#given ${model} #then uses reasoningEffort not thinking`, () => { + const config = createOracleAgent(model) + expect(config.reasoningEffort).toBeDefined() + }) + } +}) + +describe("GLM Thinking Benchmark: Metis", () => { + for (const model of GLM_TEXT_MODELS) { + test(`#given ${model} #then thinking enabled without budgetTokens`, () => { + const config = createMetisAgent(model) + expect(config.thinking).toEqual({ type: "enabled" }) + expect(hasBudgetTokens(config.thinking)).toBe(false) + }) + } + + for (const model of GLM_VLM_MODELS) { + test(`#given ${model} (VLM) #then falls to default thinking with budgetTokens`, () => { + const config = createMetisAgent(model) + expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of CLAUDE_MODELS) { + test(`#given ${model} #then thinking enabled with budgetTokens`, () => { + const config = createMetisAgent(model) + expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } +}) + +describe("GLM Thinking Benchmark: Momus", () => { + for (const model of GLM_TEXT_MODELS) { + test(`#given ${model} #then thinking enabled without budgetTokens`, () => { + const config = createMomusAgent(model) + expect(config.thinking).toEqual({ type: "enabled" }) + expect(hasBudgetTokens(config.thinking)).toBe(false) + }) + } + + for (const model of GLM_VLM_MODELS) { + test(`#given ${model} (VLM) #then falls to default thinking with budgetTokens`, () => { + const config = createMomusAgent(model) + expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of CLAUDE_MODELS) { + test(`#given ${model} #then thinking enabled with budgetTokens`, () => { + const config = createMomusAgent(model) + 
expect(config.thinking).toBeDefined() + expect(hasBudgetTokens(config.thinking)).toBe(true) + }) + } + + for (const model of GPT_MODELS) { + test(`#given ${model} #then uses reasoningEffort not thinking`, () => { + const config = createMomusAgent(model) + expect(config.reasoningEffort).toBeDefined() + }) + } +}) + +describe("GLM Thinking Benchmark: Cross-agent budgetTokens guard", () => { + const agentFactories = [ + { name: "Sisyphus", fn: (m: string) => createSisyphusAgent(m) }, + { name: "Sisyphus-Junior", fn: (m: string) => createSisyphusJuniorAgent({ model: m }) }, + { name: "Oracle", fn: (m: string) => createOracleAgent(m) }, + { name: "Metis", fn: (m: string) => createMetisAgent(m) }, + { name: "Momus", fn: (m: string) => createMomusAgent(m) }, + ] + + for (const agent of agentFactories) { + for (const model of GLM_TEXT_MODELS) { + test(`#given ${agent.name} + ${model} #then NEVER receives budgetTokens`, () => { + const config = agent.fn(model) + expect(hasBudgetTokens(config.thinking)).toBe(false) + }) + } + } +}) diff --git a/src/agents/metis.ts b/src/agents/metis.ts index 4285696ed03..40a18c527f4 100644 --- a/src/agents/metis.ts +++ b/src/agents/metis.ts @@ -1,24 +1,12 @@ import type { AgentConfig } from "@opencode-ai/sdk" import type { AgentMode, AgentPromptMetadata } from "./types" +import { isGlmThinkingModel } from "./types" +import { buildGlmSubagentVisionBlock } from "./sisyphus/glm" import { buildAntiDuplicationSection } from "./dynamic-agent-prompt-builder" import { createAgentToolRestrictions } from "../shared/permission-compat" const MODE: AgentMode = "subagent" -/** - * Metis - Plan Consultant Agent - * - * Named after the Greek goddess of wisdom, prudence, and deep counsel. - * Metis analyzes user requests BEFORE planning to prevent AI failures. 
- * - * Core responsibilities: - * - Identify hidden intentions and unstated requirements - * - Detect ambiguities that could derail implementation - * - Flag potential AI-slop patterns (over-engineering, scope creep) - * - Generate clarifying questions for the user - * - Prepare directives for the planner agent - */ - export const METIS_SYSTEM_PROMPT = `# Metis - Pre-Planning Consultant ## CONSTRAINTS @@ -299,7 +287,7 @@ const metisRestrictions = createAgentToolRestrictions([ ]) export function createMetisAgent(model: string): AgentConfig { - return { + const base = { description: "Pre-planning consultant that analyzes requests to identify hidden intentions, ambiguities, and AI failure points. (Metis - OhMyOpenCode)", mode: MODE, @@ -307,8 +295,13 @@ export function createMetisAgent(model: string): AgentConfig { temperature: 0.3, ...metisRestrictions, prompt: METIS_SYSTEM_PROMPT, - thinking: { type: "enabled", budgetTokens: 32000 }, } as AgentConfig + + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" }, prompt: base.prompt + buildGlmSubagentVisionBlock() } as AgentConfig + } + + return { ...base, thinking: { type: "enabled", budgetTokens: 32000 } } as AgentConfig } createMetisAgent.mode = MODE diff --git a/src/agents/momus.ts b/src/agents/momus.ts index 76eb592e5a8..6000e103ac7 100644 --- a/src/agents/momus.ts +++ b/src/agents/momus.ts @@ -1,6 +1,6 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; -import { isGptModel } from "./types"; +import { isGlmThinkingModel, isGptModel } from "./types"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -307,6 +307,10 @@ export function createMomusAgent(model: string): AgentConfig { } as AgentConfig; } + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" } } as AgentConfig; + } + return { ...base, thinking: { type: "enabled", 
budgetTokens: 32000 }, diff --git a/src/agents/oracle.ts b/src/agents/oracle.ts index 1779a0b16f1..c38c1179102 100644 --- a/src/agents/oracle.ts +++ b/src/agents/oracle.ts @@ -1,6 +1,7 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; -import { isGpt5_5Model, isGptModel } from "./types"; +import { isGlmThinkingModel, isGpt5_5Model, isGptModel } from "./types"; +import { buildGlmSubagentVisionBlock } from "./sisyphus/glm"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -37,10 +38,6 @@ export const ORACLE_PROMPT_METADATA: AgentPromptMetadata = { ], }; -/** - * Default Oracle prompt - used for Claude and other non-GPT models. - * XML-tagged structure with extended thinking support. - */ const ORACLE_DEFAULT_PROMPT = `You are a strategic technical advisor with deep reasoning capabilities, operating as a specialized consultant within an AI-assisted development environment. @@ -152,16 +149,6 @@ Before finalizing answers on architecture, security, or performance: Your response goes directly to the user with no intermediate processing. Make your final message self-contained: a clear recommendation they can act on immediately, covering both what to do and why. `; -/** - * GPT-5.4 Optimized Oracle System Prompt - * - * Tuned for GPT-5.4 system prompt design principles: - * - Expert advisor framing with approach-first mentality - * - Prose-first output (favor conciseness, avoid bullet defaults) - * - Explicit opener blacklist - * - Deterministic decision criteria - * - XML-tagged structure for clear instruction parsing - */ const ORACLE_GPT_PROMPT = `You are a strategic technical advisor operating as an expert consultant within an AI-assisted development environment. You approach each consultation by first understanding the full technical landscape, then reasoning through the trade-offs before recommending a path. 
@@ -443,6 +430,10 @@ export function createOracleAgent(model: string): AgentConfig { } as AgentConfig; } + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" }, prompt: base.prompt + buildGlmSubagentVisionBlock() } as AgentConfig; + } + return { ...base, thinking: { type: "enabled", budgetTokens: 32000 }, diff --git a/src/agents/sisyphus-junior/agent.ts b/src/agents/sisyphus-junior/agent.ts index 5f01b2914b0..e79b64a82b5 100644 --- a/src/agents/sisyphus-junior/agent.ts +++ b/src/agents/sisyphus-junior/agent.ts @@ -1,18 +1,12 @@ -/** - * Sisyphus-Junior - Focused Task Executor - * - * Executes delegated tasks directly without spawning other agents. - * Category-spawned executor with domain-specific configurations. - * - * Routing: - * 1. GPT models (openai/*, github-copilot/gpt-*) -> gpt.ts (GPT-5.4 optimized) - * 2. Gemini models (google/*, google-vertex/*) -> gemini.ts (Gemini-optimized) - * 3. Default (Claude, etc.) -> default.ts (Claude-optimized) - */ - import type { AgentConfig } from "@opencode-ai/sdk" import type { AgentMode } from "../types" -import { isGlmModel, isGpt5_5Model, isGptModel, isGeminiModel, isKimiK2Model } from "../types" +import { + isGlmSisyphusHarnessModel, + isGpt5_5Model, + isGptModel, + isGeminiModel, + isKimiK2Model, +} from "../types" import type { AgentOverrideConfig } from "../../config/schema" import { createAgentToolRestrictions, @@ -21,6 +15,7 @@ import { import { getGptApplyPatchPermission } from "../gpt-apply-patch-guard" import { buildDefaultSisyphusJuniorPrompt } from "./default" +import { buildGlmSisyphusJuniorPrompt } from "./glm" import { buildKimiK26SisyphusJuniorPrompt } from "./kimi-k2-6" import { buildGptSisyphusJuniorPrompt } from "./gpt" import { buildGpt54SisyphusJuniorPrompt } from "./gpt-5-4" @@ -30,8 +25,6 @@ import { buildGeminiSisyphusJuniorPrompt } from "./gemini" const MODE: AgentMode = "subagent" -// Core tools that Sisyphus-Junior must NEVER have access to -// Note: 
call_omo_agent is ALLOWED so subagents can spawn explore/librarian const BLOCKED_TOOLS = ["task"] const GPT_BLOCKED_TOOLS = ["task", "apply_patch"] @@ -43,6 +36,7 @@ export const SISYPHUS_JUNIOR_DEFAULTS = { export type SisyphusJuniorPromptSource = | "default" | "kimi-k2" + | "glm" | "gpt" | "gpt-5-5" | "gpt-5-4" @@ -51,6 +45,7 @@ export type SisyphusJuniorPromptSource = export function getSisyphusJuniorPromptSource(model?: string): SisyphusJuniorPromptSource { if (model && isKimiK2Model(model)) return "kimi-k2" + if (model && isGlmSisyphusHarnessModel(model)) return "glm" if (model && isGptModel(model)) { if (isGpt5_5Model(model)) return "gpt-5-5" const lower = model.toLowerCase() @@ -64,9 +59,6 @@ export function getSisyphusJuniorPromptSource(model?: string): SisyphusJuniorPro return "default" } -/** - * Builds the appropriate Sisyphus-Junior prompt based on model. - */ export function buildSisyphusJuniorPrompt( model: string | undefined, useTaskSystem: boolean, @@ -77,6 +69,8 @@ export function buildSisyphusJuniorPrompt( switch (source) { case "kimi-k2": return buildKimiK26SisyphusJuniorPrompt(useTaskSystem, promptAppend) + case "glm": + return buildGlmSisyphusJuniorPrompt(useTaskSystem, promptAppend) case "gpt-5-5": return buildGpt55SisyphusJuniorPrompt(useTaskSystem, promptAppend) case "gpt-5-4": @@ -145,8 +139,8 @@ export function createSisyphusJuniorAgentWithOverrides( return { ...base, reasoningEffort: "medium" } as AgentConfig } - if (isGlmModel(model)) { - return base as AgentConfig + if (isGlmSisyphusHarnessModel(model)) { + return { ...base, thinking: { type: "enabled" } } as AgentConfig } return { diff --git a/src/agents/sisyphus-junior/glm.test.ts b/src/agents/sisyphus-junior/glm.test.ts new file mode 100644 index 00000000000..198cb6ead3b --- /dev/null +++ b/src/agents/sisyphus-junior/glm.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, test } from "bun:test" +import { buildDefaultSisyphusJuniorPrompt } from "./default" +import { 
buildGlmSisyphusJuniorPrompt } from "./glm" + +function countOccurrences(text: string, needle: string): number { + return text.split(needle).length - 1 +} + +describe("buildGlmSisyphusJuniorPrompt", () => { + test("#given no append #then keeps the default Junior prompt as its base", () => { + // given + const basePrompt = buildDefaultSisyphusJuniorPrompt(false) + + // when + const prompt = buildGlmSisyphusJuniorPrompt(false) + + // then + expect(prompt.startsWith(basePrompt)).toBe(true) + }) + + test("#given no append #then adds exactly one GLM context block", () => { + // given / when + const prompt = buildGlmSisyphusJuniorPrompt(false) + + // then + expect(countOccurrences(prompt, "")).toBe(1) + expect(countOccurrences(prompt, "")).toBe(1) + }) + + test("#given no append #then stays lightweight and avoids full ledger instructions", () => { + // given / when + const prompt = buildGlmSisyphusJuniorPrompt(false) + + // then + expect(prompt).toContain("Read only the slice named in the task prompt") + expect(prompt).not.toContain("Toggle RL") + expect(prompt).not.toContain("goal.md") + expect(prompt).not.toContain("decisions.md") + expect(prompt).not.toContain("files.md") + expect(prompt).not.toContain("blockers.md") + expect(prompt).not.toContain("verification.md") + }) + + test("#given promptAppend #then appends it exactly once", () => { + // given + const promptAppend = "Extra instructions here" + + // when + const prompt = buildGlmSisyphusJuniorPrompt(false, promptAppend) + + // then + expect(countOccurrences(prompt, promptAppend)).toBe(1) + }) +}) diff --git a/src/agents/sisyphus-junior/glm.ts b/src/agents/sisyphus-junior/glm.ts new file mode 100644 index 00000000000..98d4557215d --- /dev/null +++ b/src/agents/sisyphus-junior/glm.ts @@ -0,0 +1,24 @@ +import { resolvePromptAppend } from "../builtin-agents/resolve-file-uri" +import { buildDefaultSisyphusJuniorPrompt } from "./default" + +export function buildGlmSisyphusJuniorPrompt( + useTaskSystem: boolean, + 
promptAppend?: string +): string { + const prompt = `${buildDefaultSisyphusJuniorPrompt(useTaskSystem)} + + +## GLM context priorities +- Keep the working set tiny: start from the current task prompt, the current file, and the latest verification output. +- Read only the slice named in the task prompt, or the file/output directly needed for the current step. +- Do not expand into a full ledger or read unrelated state files. + +## Vision Constraint (GLM text-only) +- GLM models (GLM-5, GLM-5.1, GLM-5-turbo) CANNOT render or analyze images, screenshots, or visual content. +- When a task involves viewing images or visual content, delegate to the multimodal-looker agent instead of attempting it yourself. +- NEVER call look_at, read (on image files), or screenshot tools. They WILL FAIL. +- ALWAYS delegate to multimodal-looker agent. If zai-mcp-server tools appear in your tool list, you may use them as secondary option. +`; + if (!promptAppend) return prompt + return prompt + "\n\n" + resolvePromptAppend(promptAppend) +} diff --git a/src/agents/sisyphus-junior/index.test.ts b/src/agents/sisyphus-junior/index.test.ts index 7da727f30d9..4b87aa26e5a 100644 --- a/src/agents/sisyphus-junior/index.test.ts +++ b/src/agents/sisyphus-junior/index.test.ts @@ -2,6 +2,7 @@ import { describe, expect, test } from "bun:test" import { createSisyphusJuniorAgentWithOverrides, SISYPHUS_JUNIOR_DEFAULTS, + buildGlmSisyphusJuniorPrompt, getSisyphusJuniorPromptSource, buildSisyphusJuniorPrompt, } from "./index" @@ -112,7 +113,6 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { // when const result = createSisyphusJuniorAgentWithOverrides(override) - // then - defaults should be used, not the overrides expect(result.model).toBe(SISYPHUS_JUNIOR_DEFAULTS.model) expect(result.temperature).toBe(SISYPHUS_JUNIOR_DEFAULTS.temperature) }) @@ -168,7 +168,7 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { expect(result.thinking).toEqual({ type: "enabled", budgetTokens: 32000 }) 
}) - test("#given GLM reasoning model #when agent is created #then skips injected thinking", () => { + test("#given GLM reasoning model #when agent is created #then uses GLM-native thinking", () => { // given const override = { model: "z-ai/glm-5" } @@ -177,7 +177,20 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { // then expect(result.reasoningEffort).toBeUndefined() - expect(result.thinking).toBeUndefined() + expect(result.thinking).toEqual({ type: "enabled" }) + }) + }) + + describe("barrel exports", () => { + test("exposes buildGlmSisyphusJuniorPrompt", () => { + // given + const model = false + + // when + const prompt = buildGlmSisyphusJuniorPrompt(model) + + // then + expect(prompt).toContain("GLM context priorities") }) }) @@ -200,13 +213,11 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { const permission = result.permission as Record | undefined if (tools) { expect(tools.task).toBe(false) - // call_omo_agent is NOW ALLOWED for subagents to spawn explore/librarian expect(tools.call_omo_agent).toBe(true) expect(tools.read).toBe(true) } if (permission) { expect(permission.task).toBe("deny") - // call_omo_agent is NOW ALLOWED for subagents to spawn explore/librarian expect(permission.call_omo_agent).toBe("allow") } }) @@ -224,7 +235,6 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { // when const result = createSisyphusJuniorAgentWithOverrides(override as Parameters[0]) - // then - task blocked, but call_omo_agent allowed for explore/librarian spawning const tools = result.tools as Record | undefined const permission = result.permission as Record | undefined if (tools) { @@ -464,6 +474,116 @@ describe("getSisyphusJuniorPromptSource", () => { expect(source).toBe("gpt-5-4") }) + test("returns 'glm' for z-ai/glm-5", () => { + // given + const model = "z-ai/glm-5" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for z-ai/glm-5.1", () => { 
+ // given + const model = "z-ai/glm-5.1" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for zai-org/glm-5.1:thinking", () => { + // given + const model = "zai-org/glm-5.1:thinking" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for opencode-go/glm-5-turbo", () => { + // given + const model = "opencode-go/glm-5-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for opencode-go/glm5-turbo", () => { + // given + const model = "opencode-go/glm5-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for z-ai/glm-5v-turbo", () => { + // given + const model = "z-ai/glm-5v-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for z-ai/glm5v-turbo", () => { + // given + const model = "z-ai/glm5v-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'default' for z-ai/glm-4.6v", () => { + // given + const model = "z-ai/glm-4.6v" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("default") + }) + + test("returns 'default' for z-ai/glm-5.1-preview", () => { + // given + const model = "z-ai/glm-5.1-preview" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("default") + }) + + test("returns 'default' for big-pickle/glm", () => { + // given + const model = "big-pickle/glm" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("default") + }) + test("returns 'gpt-5-4' for GitHub Copilot GPT 5.4", () => { // given 
const model = "github-copilot/gpt-5.4" @@ -557,6 +677,18 @@ describe("buildSisyphusJuniorPrompt", () => { expect(prompt).toContain("Do not use `apply_patch`") }) + test("GLM harness model uses GLM prompt", () => { + // given + const model = "z-ai/glm-5" + + // when + const prompt = buildSisyphusJuniorPrompt(model, false) + + // then + expect(prompt).toContain("GLM context priorities") + expect(prompt).toContain("") + }) + test("GPT 5.3 Codex model uses GPT-5.3-codex prompt", () => { // given const model = "openai/gpt-5.3-codex" diff --git a/src/agents/sisyphus-junior/index.ts b/src/agents/sisyphus-junior/index.ts index 5232b23fdd1..f866bdf7cb1 100644 --- a/src/agents/sisyphus-junior/index.ts +++ b/src/agents/sisyphus-junior/index.ts @@ -1,4 +1,5 @@ export { buildDefaultSisyphusJuniorPrompt } from "./default" +export { buildGlmSisyphusJuniorPrompt } from "./glm" export { buildKimiK26SisyphusJuniorPrompt } from "./kimi-k2-6" export { buildGptSisyphusJuniorPrompt } from "./gpt" export { buildGpt54SisyphusJuniorPrompt } from "./gpt-5-4" diff --git a/src/agents/sisyphus.glm-routing.test.ts b/src/agents/sisyphus.glm-routing.test.ts new file mode 100644 index 00000000000..e8371ec7122 --- /dev/null +++ b/src/agents/sisyphus.glm-routing.test.ts @@ -0,0 +1,103 @@ +/// + +import { describe, expect, test } from "bun:test"; +import { createSisyphusAgent } from "./sisyphus"; + +function getPrompt(model: string, availableCategories?: Parameters[4]): string { + const agent = createSisyphusAgent(model, undefined, undefined, undefined, availableCategories); + return agent.prompt ?? 
""; +} + +describe("createSisyphusAgent - GLM routing", () => { + test("#given GLM harness models #then routes to the GLM prompt builder", () => { + const models = [ + "opencode-go/glm-5", + "opencode-go/glm-5.1", + "opencode-go/glm-5-1", + "opencode-go/glm-5-turbo", + "opencode-go/glm5-turbo", + "zai/glm-5.1:thinking", + "zai/glm-5v-turbo", + "zai/glm5v-turbo", + ]; + + for (const model of models) { + const prompt = getPrompt(model); + + expect(prompt).toContain(".sisyphus/state/"); + expect(prompt).toContain(""); + expect(prompt).toContain("goal.md"); + } + }); + + test("#given excluded GLM-like models #then does not route to the GLM prompt builder", () => { + const models = ["zai-coding-plan/glm-4.6v", "zai/glm-5.1-preview", "big-pickle/glm"]; + + for (const model of models) { + const prompt = getPrompt(model); + + expect(prompt).not.toContain(".sisyphus/state/"); + expect(prompt).not.toContain(""); + expect(prompt).not.toContain("goal.md"); + expect(prompt).not.toContain("verification.md"); + } + }); + + test("#given Kimi model #then keeps Kimi markers and avoids GLM ledger markers", () => { + const prompt = getPrompt("moonshotai/kimi-k2.6"); + + expect(prompt).toContain("Toggle RL"); + expect(prompt).toContain("K2.x post-training context"); + expect(prompt).not.toContain("DIRECT HEPHAESTUS DELEGATION"); + expect(prompt).not.toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); + expect(prompt).not.toContain(".sisyphus/state/"); + expect(prompt).not.toContain(""); + expect(prompt).not.toContain("goal.md"); + expect(prompt).not.toContain("verification.md"); + }); + + test("#given GLM harness model #then returns config with thinking enabled (no budgetTokens)", () => { + const agent = createSisyphusAgent("zai/glm-5.1"); + + expect(agent.thinking).toEqual({ type: "enabled" }); + expect((agent as Record).reasoningEffort).toBeUndefined(); + }); + + test("#given GLM harness model #then strongly biases implementation toward category delegation", () => { + 
const prompt = getPrompt("zai/glm-5.1"); + + expect(prompt).toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); + expect(prompt).toContain("NEVER implement directly"); + expect(prompt).toContain("DEEP DELEGATION - YOUR IMPLEMENTATION PATH"); + expect(prompt).toContain('task(category="deep"'); + expect(prompt).toContain("delegate to Hephaestus"); + }); + + test("#given GLM harness model #when building prompt #then preserves direct delegation markers", () => { + const prompt = getPrompt("zai/glm-5.1"); + + expect(prompt).toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); + expect(prompt).toContain("DEEP DELEGATION"); + }); + + test("#given GLM harness model #when building prompt #then places delegation before verification and style sections", () => { + const prompt = getPrompt("zai/glm-5.1"); + const delegationIndex = prompt.indexOf("DEEP DELEGATION"); + const verificationIndex = prompt.indexOf(""); + const styleIndex = prompt.indexOf("` +} + +export function buildGlmSisyphusPrompt( + model: string, + availableAgents: AvailableAgent[], + availableTools: AvailableTool[] = [], + availableSkills: AvailableSkill[] = [], + availableCategories: AvailableCategory[] = [], + useTaskSystem = false, +): string { + const keyTriggers = buildKeyTriggersSection(availableAgents, availableSkills) + const toolSelection = buildToolSelectionTable( + availableAgents, + availableTools, + availableSkills, + ) + const exploreSection = buildExploreSection(availableAgents) + const librarianSection = buildLibrarianSection(availableAgents) + const categorySkillsGuide = buildCategorySkillsDelegationGuide( + availableCategories, + availableSkills, + ) + const delegationTable = buildDelegationTable(availableAgents) + const oracleSection = buildOracleSection(availableAgents) + const hardBlocks = buildHardBlocksSection() + const antiPatterns = buildAntiPatternsSection() + const nonClaudePlannerSection = buildNonClaudePlannerSection(model) + const 
parallelDelegationSection = `### DEEP DELEGATION - YOUR IMPLEMENTATION PATH + +For long or complex implementation, delegate to Hephaestus via \`task(category="deep", load_skills=[], run_in_background=true, ...)\`. Hephaestus is the autonomous implementation worker; GLM is the orchestrator. + +Use domain-specific categories when they are more precise than \`deep\`. + +${buildParallelDelegationSection(model, availableCategories) || + `### DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER + +**YOUR FAILURE MODE: You attempt to do work yourself instead of decomposing and delegating.** When an implementation task is not V1 trivial, specialized subagents are faster and more reliable than GLM solo execution. + +**MANDATORY - for ANY non-trivial implementation task:** + +1. **ALWAYS decompose** the task into independent work units. +2. **ALWAYS delegate** each unit to available category workers, preferably \`deep\` or \`unspecified-high\`, in parallel. +3. **NEVER implement directly** when delegation is possible. You write prompts, collect results, verify, and synthesize. + +**Your value is orchestration, decomposition, and quality control. Delegating with crystal-clear prompts IS your work.**`}` + + const todoHookNote = useTaskSystem + ? 
"YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" + : "YOUR TODO CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TODO CONTINUATION])" + + const identityBlock = buildIdentityBlock(todoHookNote) + const constraintsBlock = buildConstraintsBlock(hardBlocks, antiPatterns) + const intentBlock = buildIntentBlock(keyTriggers) + const exploreBlock = buildExploreBlock(toolSelection, exploreSection, librarianSection) + const executionLoopBlock = buildExecutionLoopBlock() + const delegationBlock = buildDelegationBlock( + categorySkillsGuide, + nonClaudePlannerSection, + parallelDelegationSection, + delegationTable, + oracleSection, + ) + const styleBlock = buildStyleBlock() + const tasksSection = buildGlmTasksSection(useTaskSystem) + + return `${identityBlock} + +${constraintsBlock} + +${intentBlock} + +${exploreBlock} + +${executionLoopBlock} + +${delegationBlock} + +${tasksSection} + +${styleBlock}` +} + +export { categorizeTools } diff --git a/src/agents/sisyphus/glm.test.ts b/src/agents/sisyphus/glm.test.ts new file mode 100644 index 00000000000..c6cd0337d0d --- /dev/null +++ b/src/agents/sisyphus/glm.test.ts @@ -0,0 +1,86 @@ +/// + +import { describe, test, expect } from "bun:test"; +import { buildGlmWorkingMemory, buildGlmVisionConstraint } from "./glm"; + +describe("buildGlmWorkingMemory", () => { + test("#given call #then contains exactly one Small_Context_Working_Memory block", () => { + const block = buildGlmWorkingMemory(); + + const openCount = block.split("").length - 1; + const closeCount = + block.split("").length - 1; + expect(openCount).toBe(1); + expect(closeCount).toBe(1); + }); + + test("#given call #then anchors slices under .sisyphus/state/{plan-or-session}/", () => { + // given + const block = buildGlmWorkingMemory(); + + // then + expect(block).toContain(".sisyphus/state/{plan-or-session}/"); + }); + + test("#given call #then declares all five state slice file names", () => { + // given + const block = 
buildGlmWorkingMemory(); + + // then + expect(block).toContain("goal.md"); + expect(block).toContain("decisions.md"); + expect(block).toContain("files.md"); + expect(block).toContain("blockers.md"); + expect(block).toContain("verification.md"); + }); + + test("#given call #then encodes the 500-token soft slice target", () => { + // given + const block = buildGlmWorkingMemory(); + + // then + expect(block).toContain("500"); + }); + + test("#given call #then caps slice reads at 4 per turn", () => { + // given + const block = buildGlmWorkingMemory(); + + // then + expect(block).toMatch(/AT MOST 4 slices/); + }); + + test("#given call #then requires relevant-slice-only reads", () => { + // given + const block = buildGlmWorkingMemory(); + + // then + expect(block.toLowerCase()).toContain("relevant-slice-only"); + }); + + test("#given call #then explains that missing slices mean first run", () => { + // given + const block = buildGlmWorkingMemory(); + + // then + expect(block.toLowerCase()).toContain("first run"); + }); +}); + +describe("buildGlmVisionConstraint", () => { + test("#given call #then contains GLM_VISION_CONSTRAINT tag", () => { + const block = buildGlmVisionConstraint(); + + expect(block).toContain(""); + expect(block).toContain(""); + }); + + test("#given call #then mentions text-only limitation", () => { + // given + const block = buildGlmVisionConstraint(); + + // then + expect(block).toContain("text-only"); + expect(block).toContain("multimodal-looker"); + }); +}); diff --git a/src/agents/sisyphus/glm.ts b/src/agents/sisyphus/glm.ts new file mode 100644 index 00000000000..8d7c4064cfc --- /dev/null +++ b/src/agents/sisyphus/glm.ts @@ -0,0 +1,66 @@ +export function buildGlmWorkingMemory(): string { + return ` +## Working Memory via Small Context Slices + +GLM keeps a lightweight working memory under \`.sisyphus/state/{plan-or-session}/\` so continuity across turns does not require re-reading the full plan file or scrolling old messages. 
The directory key is the active plan name when one is present (\`.sisyphus/plans/{plan-name}.md\`), otherwise the current session label. + +### State slice files (created by you, only when needed) + +- \`goal.md\` - the active user goal in plain language: what is being built and what success looks like. +- \`decisions.md\` - architectural and routing choices already made, with one-line rationale. +- \`files.md\` - paths you have edited or that are part of the current working set. +- \`blockers.md\` - open questions, unresolved errors, or items waiting on user or specialist. +- \`verification.md\` - lsp/test/build evidence captured during this session. + +### Slice budget and read protocol + +- Treat every slice as a small context with a soft target of about 500 tokens. Keep entries terse and append-only. +- Read AT MOST 4 slices per turn. Pick only the slices that are directly relevant to what you are about to do; never load the full set "to be safe". +- Relevant-slice-only: if the current move does not depend on a slice, do not read it. +- Missing files means this is the first run for the current plan/session. Proceed without them and create slices only when you have something concrete to record. +- Slice reads substitute for re-reading the plan file or prior turns. They never substitute for actual code reads or tool output. + +### Slice write protocol + +- Append the new line(s) needed; do not rewrite the whole file. +- Update \`goal.md\` when the goal or scope changes; \`decisions.md\` when you pick a routing or architectural option; \`files.md\` when the working set shifts; \`blockers.md\` when something blocks you; \`verification.md\` when you run lsp/tests/build. +- Never create the \`.sisyphus/state\` directory speculatively. Only when a real state update is required. +`; +} + +export function buildGlmVisionConstraint(): string { + return ` +**Vision/Image Constraint (GLM text-only models):** +- GLM-5, GLM-5.1, GLM-5-turbo are text-only models. 
They CANNOT render or analyze images, screenshots, PDFs, or visual content. +- When a task involves viewing/analyzing images or visual content, ALWAYS delegate to the \`multimodal-looker\` agent. NEVER attempt to use \`look_at\`, \`read\`, or screenshot tools on image files yourself. +- For browser visual testing (screenshot verification, UI diff), delegate to \`multimodal-looker\` or use \`visual-engineering\` category with \`playwright\` skill. +`; +} + +export function buildGlmVisionHardBlock(): string { + return `## GLM Vision Constraint (HARD BLOCK) + +You are a TEXT-ONLY model. You CANNOT see images. + +NEVER call these tools yourself for images/screenshots/PDFs: +- \`look_at\` +- \`read\` (on image/PDF/binary files) +- \`brave-devtools_take_screenshot\` +- \`playwright_browser_take_screenshot\` +- \`figma_get_screenshot\` + +When user shares an image, screenshot, or asks to analyze visual content: +1. Delegate to \`multimodal-looker\` agent. +2. If \`zai-mcp-server_*\` tools are available, you may use them as a secondary option. + +Do not inspect visual content directly.`; +} + +export function buildGlmSubagentVisionBlock(): string { + return ` + +## GLM Vision Constraint (HARD BLOCK) +You are a TEXT-ONLY model. You CANNOT see images. +Never call look_at, read (on image files), or screenshot tools. Delegate to multimodal-looker; if zai-mcp-server tools are available, they may be used as a secondary option. 
+`; +} diff --git a/src/agents/types.test.ts b/src/agents/types.test.ts index 13cb0bf932d..1d23054a724 100644 --- a/src/agents/types.test.ts +++ b/src/agents/types.test.ts @@ -3,6 +3,9 @@ import { isGptModel, isGeminiModel, isGlmModel, + isGlmSisyphusHarnessModel, + isGlmThinkingModel, + isGlmVisionModel, isGptNativeSisyphusModel, isMiniMaxModel, } from "./types"; @@ -141,6 +144,87 @@ describe("isGlmModel", () => { }); }); +describe("isGlmVisionModel", () => { + test("#given GLM VLM variants #then returns true", () => { + expect(isGlmVisionModel("opencode/glm-4.6v")).toBe(true); + expect(isGlmVisionModel("opencode/glm-5v")).toBe(true); + expect(isGlmVisionModel("opencode/glm-5v-turbo")).toBe(true); + expect(isGlmVisionModel("z-ai/glm-5v-turbo")).toBe(true); + expect(isGlmVisionModel("opencode-go/glm5v-turbo")).toBe(true); + }); + + test("#given GLM text models #then returns false", () => { + expect(isGlmVisionModel("opencode/glm-5")).toBe(false); + expect(isGlmVisionModel("z-ai/glm-5.1")).toBe(false); + expect(isGlmVisionModel("opencode/glm-5-turbo")).toBe(false); + expect(isGlmVisionModel("opencode-go/glm5-turbo")).toBe(false); + }); + + test("#given non-GLM models #then returns false", () => { + expect(isGlmVisionModel("openai/gpt-5.4")).toBe(false); + expect(isGlmVisionModel("anthropic/claude-opus-4-7")).toBe(false); + }); +}); + +describe("isGlmThinkingModel", () => { + test("#given GLM-5+ text models #then returns true", () => { + expect(isGlmThinkingModel("opencode/glm-5")).toBe(true); + expect(isGlmThinkingModel("z-ai/glm-5.1")).toBe(true); + expect(isGlmThinkingModel("opencode/glm-5-turbo")).toBe(true); + expect(isGlmThinkingModel("opencode-go/glm5-turbo")).toBe(true); + expect(isGlmThinkingModel("zai-coding-plan/glm-5")).toBe(true); + }); + + test("#given GLM VLM models #then returns false", () => { + expect(isGlmThinkingModel("opencode/glm-4.6v")).toBe(false); + expect(isGlmThinkingModel("opencode/glm-5v")).toBe(false); + 
expect(isGlmThinkingModel("opencode/glm-5v-turbo")).toBe(false); + }); + + test("#given non-GLM models #then returns false", () => { + expect(isGlmThinkingModel("openai/gpt-5.4")).toBe(false); + expect(isGlmThinkingModel("anthropic/claude-opus-4-7")).toBe(false); + expect(isGlmThinkingModel("google/gemini-3.1-pro")).toBe(false); + }); +}); + +describe("isGlmSisyphusHarnessModel", () => { + test("#given exact GLM Sisyphus harness families #then returns true", () => { + expect(isGlmSisyphusHarnessModel("z-ai/glm-5")).toBe(true); + expect(isGlmSisyphusHarnessModel("vercel/zai/glm-5")).toBe(true); + expect(isGlmSisyphusHarnessModel("zai-coding-plan/glm-5")).toBe(true); + expect(isGlmSisyphusHarnessModel("z-ai/glm-5.1")).toBe(true); + expect(isGlmSisyphusHarnessModel("zai-org/glm-5.1:thinking")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm-5-1")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm5.1")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode/glm-5-turbo")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm5-turbo")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode/glm-5v-turbo")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm5v-turbo")).toBe(true); + }); + + test("#given compact plain glm5 #then returns false", () => { + expect(isGlmSisyphusHarnessModel("z-ai/glm5")).toBe(false); + expect(isGlmSisyphusHarnessModel("glm5")).toBe(false); + expect(isGlmSisyphusHarnessModel("z-ai/glm5:thinking")).toBe(false); + }); + + test("#given non-target GLM variants #then returns false", () => { + expect(isGlmSisyphusHarnessModel("opencode/glm-4.6v")).toBe(false); + expect(isGlmSisyphusHarnessModel("opencode/go/glm-4-6v")).toBe(false); + expect(isGlmSisyphusHarnessModel("z-ai/glm-5.1-preview")).toBe(false); + expect(isGlmSisyphusHarnessModel("accounts/fireworks/models/glm-5p1")).toBe(false); + expect(isGlmSisyphusHarnessModel("opencode/big-pickle")).toBe(false); + }); + + test("#given other 
providers and families #then returns false", () => { + expect(isGlmSisyphusHarnessModel("openai/gpt-5.4")).toBe(false); + expect(isGlmSisyphusHarnessModel("anthropic/claude-opus-4-7")).toBe(false); + expect(isGlmSisyphusHarnessModel("google/gemini-3.1-pro")).toBe(false); + expect(isGlmSisyphusHarnessModel("moonshotai/kimi-k2.5")).toBe(false); + }); +}); + describe("isGeminiModel", () => { test("#given google provider models #then returns true", () => { expect(isGeminiModel("google/gemini-3.1-pro")).toBe(true); diff --git a/src/agents/types.ts b/src/agents/types.ts index 79fcec7f870..eb82c4b71cb 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -1,72 +1,32 @@ import type { AgentConfig } from "@opencode-ai/sdk"; -/** - * Agent mode determines UI model selection behavior: - * - "primary": Respects user's UI-selected model (sisyphus, atlas) - * - "subagent": Uses own fallback chain, ignores UI selection (oracle, explore, etc.) - * - "all": Available in both contexts (OpenCode compatibility) - */ export type AgentMode = "primary" | "subagent" | "all"; -/** - * Agent factory function with static mode property. - * Mode is exposed as static property for pre-instantiation access. 
- */ export type AgentFactory = ((model: string) => AgentConfig) & { mode: AgentMode; }; -/** - * Agent category for grouping in Sisyphus prompt sections - */ export type AgentCategory = | "exploration" | "specialist" | "advisor" | "utility"; -/** - * Cost classification for Tool Selection table - */ export type AgentCost = "FREE" | "CHEAP" | "EXPENSIVE"; -/** - * Delegation trigger for Sisyphus prompt's Delegation Table - */ export interface DelegationTrigger { - /** Domain of work (e.g., "Frontend UI/UX") */ domain: string; - /** When to delegate (e.g., "Visual changes only...") */ trigger: string; } -/** - * Metadata for generating Sisyphus prompt sections dynamically - * This allows adding/removing agents without manually updating the Sisyphus prompt - */ export interface AgentPromptMetadata { - /** Category for grouping in prompt sections */ category: AgentCategory; - - /** Cost classification for Tool Selection table */ cost: AgentCost; - - /** Domain triggers for Delegation Table */ triggers: DelegationTrigger[]; - - /** When to use this agent (for detailed sections) */ useWhen?: string[]; - - /** When NOT to use this agent */ avoidWhen?: string[]; - - /** Optional dedicated prompt section (markdown) - for agents like Oracle that have special sections */ dedicatedSection?: string; - - /** Nickname/alias used in prompt (e.g., "Oracle" instead of "oracle") */ promptAlias?: string; - - /** Key triggers that should appear in Phase 0 (e.g., "External library mentioned → fire librarian") */ keyTrigger?: string; } @@ -130,6 +90,42 @@ export function isGlmModel(model: string): boolean { return modelName.includes("glm"); } +/** Matches GLM VLM variants (e.g., glm-4.6v, glm-5v, glm-5v-turbo). 
*/ +const GLM_VISION_MODEL_RE = /glm[\d.-]+v/ +export function isGlmVisionModel(model: string): boolean { + const modelName = extractModelName(model).toLowerCase(); + return modelName.includes("glm") && GLM_VISION_MODEL_RE.test(modelName); +} + +/** Matches GLM-5+ text-only models that support extended thinking. + * Excludes VLM variants (glm-5v-turbo, glm-4.6v) which may not support thinking. + */ +export function isGlmThinkingModel(model: string): boolean { + const modelName = extractModelName(model).toLowerCase(); + return isGlmModel(model) && !isGlmVisionModel(model) && /^glm[-]?5/.test(modelName); +} + +const GLM_SISYPHUS_HARNESS_MODELS: ReadonlySet = new Set([ + "glm-5", + "glm-5.1", + "glm-5-1", + "glm-5.1:thinking", + "glm-5-1:thinking", + "glm5.1", + "glm5-1", + "glm5.1:thinking", + "glm5-1:thinking", + "glm-5-turbo", + "glm5-turbo", + "glm-5v-turbo", + "glm5v-turbo", +]) + +export function isGlmSisyphusHarnessModel(model: string): boolean { + const modelName = extractModelName(model).toLowerCase(); + return GLM_SISYPHUS_HARNESS_MODELS.has(modelName); +} + export function isGeminiModel(model: string): boolean { if (GEMINI_PROVIDERS.some((prefix) => model.startsWith(prefix))) return true; diff --git a/src/hooks/ralph-loop/ralph-loop-event-handler.ts b/src/hooks/ralph-loop/ralph-loop-event-handler.ts index fcffb9db492..cbf313873d2 100644 --- a/src/hooks/ralph-loop/ralph-loop-event-handler.ts +++ b/src/hooks/ralph-loop/ralph-loop-event-handler.ts @@ -58,6 +58,8 @@ export function createRalphLoopEventHandler( ) { const inFlightSessions = new Set() const runtimeErrorRetriedSessions = new Map() + const MAX_RUNTIME_ERROR_RETRIES = 3 + let runtimeErrorRetryCount = 0 return async ({ event }: { event: { type: string; properties?: unknown } }): Promise => { const props = event.properties as Record | undefined @@ -153,6 +155,7 @@ export function createRalphLoopEventHandler( if (completionViaTranscript || completionViaApi) { 
runtimeErrorRetriedSessions.delete(sessionID) + runtimeErrorRetryCount = 0 log(`[${HOOK_NAME}] Completion detected!`, { sessionID, iteration: state.iteration, @@ -221,6 +224,7 @@ export function createRalphLoopEventHandler( log(`[${HOOK_NAME}] Failed to increment iteration`, { sessionID }) return } + runtimeErrorRetryCount = 0 log(`[${HOOK_NAME}] Continuing loop`, { sessionID, @@ -305,6 +309,18 @@ export function createRalphLoopEventHandler( return } + if (runtimeErrorRetryCount >= MAX_RUNTIME_ERROR_RETRIES) { + log(`[${HOOK_NAME}] Runtime error retry cap reached`, { + sessionID, + iteration: state.iteration, + retries: runtimeErrorRetryCount, + cap: MAX_RUNTIME_ERROR_RETRIES, + }) + options.loopState.clear() + await showMaxIterationsToast(ctx, state) + return + } + if ( typeof state.max_iterations === "number" && state.iteration >= state.max_iterations @@ -334,6 +350,7 @@ export function createRalphLoopEventHandler( loopState: options.loopState, }) runtimeErrorRetriedSessions.set(sessionID, newState.iteration) + runtimeErrorRetryCount++ } catch (err) { log(`[${HOOK_NAME}] Failed to retry after runtime error`, { sessionID, diff --git a/src/plugin-handlers/tool-config-handler.test.ts b/src/plugin-handlers/tool-config-handler.test.ts index 7344b48e223..87ee1f42ebf 100644 --- a/src/plugin-handlers/tool-config-handler.test.ts +++ b/src/plugin-handlers/tool-config-handler.test.ts @@ -217,6 +217,53 @@ describe("applyToolConfig", () => { }) }) + describe("#given Sisyphus uses task delegation instead of call_omo_agent", () => { + it("#then should deny call_omo_agent for Sisyphus", () => { + const params = createParams({ agents: ["sisyphus"] }) + + applyToolConfig(params) + + const agent = params.agentResult.sisyphus as { + permission: Record + } + expect(agent.permission.call_omo_agent).toBe("deny") + expect(agent.permission.task).toBe("allow") + }) + + it("#then should keep call_omo_agent denied for Atlas", () => { + const params = createParams({ agents: ["atlas"] }) + + 
applyToolConfig(params) + + const agent = params.agentResult.atlas as { + permission: Record + } + expect(agent.permission.call_omo_agent).toBe("deny") + }) + + it("#then should keep call_omo_agent denied for Prometheus", () => { + const params = createParams({ agents: ["prometheus"] }) + + applyToolConfig(params) + + const agent = params.agentResult.prometheus as { + permission: Record + } + expect(agent.permission.call_omo_agent).toBe("deny") + }) + + it("#then should keep call_omo_agent denied for Hephaestus", () => { + const params = createParams({ agents: ["hephaestus"] }) + + applyToolConfig(params) + + const agent = params.agentResult.hephaestus as { + permission: Record + } + expect(agent.permission.call_omo_agent).toBe("deny") + }) + }) + describe("#given task_system is undefined", () => { describe("#when applying tool config", () => { it("#then should not disable todo tools globally by default", () => {