From 3dfbc0947bd4d3095a0266106f45ab88c3f9fed2 Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 30 Apr 2026 18:31:49 +0900 Subject: [PATCH 01/25] feat(agents): add GLM-5 harness routing and context optimizations - Add isGlmSisyphusHarnessModel for GLM-5/5.1/5-turbo detection - Route GLM harness models to specialized prompts in Sisyphus agents - Add Small Context Working Memory with state slices for GLM context optimization - Add GLM-specific context priorities and vision constraints for Sisyphus Junior - Add comprehensive tests for GLM prompt validation and routing --- src/agents/sisyphus-junior/agent.ts | 23 +- src/agents/sisyphus-junior/glm.test.ts | 55 ++++ src/agents/sisyphus-junior/glm.ts | 24 ++ src/agents/sisyphus-junior/index.test.ts | 136 ++++++++ src/agents/sisyphus-junior/index.ts | 1 + src/agents/sisyphus.glm-routing.test.ts | 56 ++++ src/agents/sisyphus.ts | 28 ++ src/agents/sisyphus/glm.test.ts | 105 ++++++ src/agents/sisyphus/glm.ts | 387 +++++++++++++++++++++++ src/agents/types.test.ts | 38 +++ src/agents/types.ts | 8 + 11 files changed, 856 insertions(+), 5 deletions(-) create mode 100644 src/agents/sisyphus-junior/glm.test.ts create mode 100644 src/agents/sisyphus-junior/glm.ts create mode 100644 src/agents/sisyphus.glm-routing.test.ts create mode 100644 src/agents/sisyphus/glm.test.ts create mode 100644 src/agents/sisyphus/glm.ts diff --git a/src/agents/sisyphus-junior/agent.ts b/src/agents/sisyphus-junior/agent.ts index 5f01b2914b0..520ec9e4e69 100644 --- a/src/agents/sisyphus-junior/agent.ts +++ b/src/agents/sisyphus-junior/agent.ts @@ -5,14 +5,22 @@ * Category-spawned executor with domain-specific configurations. * * Routing: - * 1. GPT models (openai/*, github-copilot/gpt-*) -> gpt.ts (GPT-5.4 optimized) - * 2. Gemini models (google/*, google-vertex/*) -> gemini.ts (Gemini-optimized) - * 3. Default (Claude, etc.) -> default.ts (Claude-optimized) + * 1. Kimi models -> kimi-k2-6.ts (Kimi-optimized) + * 2. 
GLM harness models -> glm.ts (GLM-optimized) + * 3. GPT models (openai/*, github-copilot/gpt-*) -> gpt.ts (GPT-5.4 optimized) + * 4. Gemini models (google/*, google-vertex/*) -> gemini.ts (Gemini-optimized) + * 5. Default (Claude, etc.) -> default.ts (Claude-optimized) */ import type { AgentConfig } from "@opencode-ai/sdk" import type { AgentMode } from "../types" -import { isGlmModel, isGpt5_5Model, isGptModel, isGeminiModel, isKimiK2Model } from "../types" +import { + isGlmSisyphusHarnessModel, + isGpt5_5Model, + isGptModel, + isGeminiModel, + isKimiK2Model, +} from "../types" import type { AgentOverrideConfig } from "../../config/schema" import { createAgentToolRestrictions, @@ -21,6 +29,7 @@ import { import { getGptApplyPatchPermission } from "../gpt-apply-patch-guard" import { buildDefaultSisyphusJuniorPrompt } from "./default" +import { buildGlmSisyphusJuniorPrompt } from "./glm" import { buildKimiK26SisyphusJuniorPrompt } from "./kimi-k2-6" import { buildGptSisyphusJuniorPrompt } from "./gpt" import { buildGpt54SisyphusJuniorPrompt } from "./gpt-5-4" @@ -43,6 +52,7 @@ export const SISYPHUS_JUNIOR_DEFAULTS = { export type SisyphusJuniorPromptSource = | "default" | "kimi-k2" + | "glm" | "gpt" | "gpt-5-5" | "gpt-5-4" @@ -51,6 +61,7 @@ export type SisyphusJuniorPromptSource = export function getSisyphusJuniorPromptSource(model?: string): SisyphusJuniorPromptSource { if (model && isKimiK2Model(model)) return "kimi-k2" + if (model && isGlmSisyphusHarnessModel(model)) return "glm" if (model && isGptModel(model)) { if (isGpt5_5Model(model)) return "gpt-5-5" const lower = model.toLowerCase() @@ -77,6 +88,8 @@ export function buildSisyphusJuniorPrompt( switch (source) { case "kimi-k2": return buildKimiK26SisyphusJuniorPrompt(useTaskSystem, promptAppend) + case "glm": + return buildGlmSisyphusJuniorPrompt(useTaskSystem, promptAppend) case "gpt-5-5": return buildGpt55SisyphusJuniorPrompt(useTaskSystem, promptAppend) case "gpt-5-4": @@ -145,7 +158,7 @@ export function 
createSisyphusJuniorAgentWithOverrides( return { ...base, reasoningEffort: "medium" } as AgentConfig } - if (isGlmModel(model)) { + if (isGlmSisyphusHarnessModel(model)) { return base as AgentConfig } diff --git a/src/agents/sisyphus-junior/glm.test.ts b/src/agents/sisyphus-junior/glm.test.ts new file mode 100644 index 00000000000..d86eeaab54b --- /dev/null +++ b/src/agents/sisyphus-junior/glm.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, test } from "bun:test" +import { buildDefaultSisyphusJuniorPrompt } from "./default" +import { buildGlmSisyphusJuniorPrompt } from "./glm" + +function countOccurrences(text: string, needle: string): number { + return text.split(needle).length - 1 +} + +describe("buildGlmSisyphusJuniorPrompt", () => { + test("#given no append #then keeps the default Junior prompt as its base", () => { + // given + const basePrompt = buildDefaultSisyphusJuniorPrompt(false) + + // when + const prompt = buildGlmSisyphusJuniorPrompt(false) + + // then + expect(prompt.startsWith(basePrompt)).toBe(true) + }) + + test("#given no append #then adds exactly one GLM context block", () => { + // given / when + const prompt = buildGlmSisyphusJuniorPrompt(false) + + // then + expect(countOccurrences(prompt, "")).toBe(1) + expect(countOccurrences(prompt, "")).toBe(1) + }) + + test("#given no append #then stays lightweight and avoids full ledger instructions", () => { + // given / when + const prompt = buildGlmSisyphusJuniorPrompt(false) + + // then + expect(prompt).toContain(".sisyphus/state/{plan-or-session}/") + expect(prompt).toContain("Read only the slice named in the task prompt") + expect(prompt).not.toContain("Toggle RL") + expect(prompt).not.toContain("goal.md") + expect(prompt).not.toContain("decisions.md") + expect(prompt).not.toContain("files.md") + expect(prompt).not.toContain("blockers.md") + expect(prompt).not.toContain("verification.md") + }) + + test("#given promptAppend #then appends it exactly once", () => { + // given + const 
promptAppend = "Extra instructions here" + + // when + const prompt = buildGlmSisyphusJuniorPrompt(false, promptAppend) + + // then + expect(countOccurrences(prompt, promptAppend)).toBe(1) + }) +}) diff --git a/src/agents/sisyphus-junior/glm.ts b/src/agents/sisyphus-junior/glm.ts new file mode 100644 index 00000000000..85c456ff400 --- /dev/null +++ b/src/agents/sisyphus-junior/glm.ts @@ -0,0 +1,24 @@ +import { resolvePromptAppend } from "../builtin-agents/resolve-file-uri" +import { buildDefaultSisyphusJuniorPrompt } from "./default" + +export function buildGlmSisyphusJuniorPrompt( + useTaskSystem: boolean, + promptAppend?: string +): string { + const prompt = `${buildDefaultSisyphusJuniorPrompt(useTaskSystem)} + + +## GLM context priorities +- Keep the working set tiny: start from the current task prompt, the current file, and the latest verification output. +- Treat .sisyphus/state/{plan-or-session}/ as optional Sisyphus handoff context only. +- Read only the slice named in the task prompt, or the file/output directly needed for the current step. +- Do not expand into a full ledger or read unrelated state files. + +## Vision Constraint (GLM text-only) +- GLM models (GLM-5, GLM-5.1, GLM-5-turbo) CANNOT render or analyze images, screenshots, or visual content. +- When a task involves viewing images or visual content, delegate to the multimodal-looker agent instead of attempting it yourself. 
+` + + if (!promptAppend) return prompt + return prompt + "\n\n" + resolvePromptAppend(promptAppend) +} diff --git a/src/agents/sisyphus-junior/index.test.ts b/src/agents/sisyphus-junior/index.test.ts index 7da727f30d9..1edacfbdfb5 100644 --- a/src/agents/sisyphus-junior/index.test.ts +++ b/src/agents/sisyphus-junior/index.test.ts @@ -2,6 +2,7 @@ import { describe, expect, test } from "bun:test" import { createSisyphusJuniorAgentWithOverrides, SISYPHUS_JUNIOR_DEFAULTS, + buildGlmSisyphusJuniorPrompt, getSisyphusJuniorPromptSource, buildSisyphusJuniorPrompt, } from "./index" @@ -181,6 +182,19 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { }) }) + describe("barrel exports", () => { + test("exposes buildGlmSisyphusJuniorPrompt", () => { + // given + const model = false + + // when + const prompt = buildGlmSisyphusJuniorPrompt(model) + + // then + expect(prompt).toContain("GLM context priorities") + }) + }) + describe("tool safety (task blocked, call_omo_agent allowed)", () => { test("task remains blocked, call_omo_agent is allowed via tools format", () => { // given @@ -464,6 +478,116 @@ describe("getSisyphusJuniorPromptSource", () => { expect(source).toBe("gpt-5-4") }) + test("returns 'glm' for z-ai/glm-5", () => { + // given + const model = "z-ai/glm-5" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for z-ai/glm-5.1", () => { + // given + const model = "z-ai/glm-5.1" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for zai-org/glm-5.1:thinking", () => { + // given + const model = "zai-org/glm-5.1:thinking" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for opencode-go/glm-5-turbo", () => { + // given + const model = "opencode-go/glm-5-turbo" + + // when + const source = 
getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for opencode-go/glm5-turbo", () => { + // given + const model = "opencode-go/glm5-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for z-ai/glm-5v-turbo", () => { + // given + const model = "z-ai/glm-5v-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'glm' for z-ai/glm5v-turbo", () => { + // given + const model = "z-ai/glm5v-turbo" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("glm") + }) + + test("returns 'default' for z-ai/glm-4.6v", () => { + // given + const model = "z-ai/glm-4.6v" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("default") + }) + + test("returns 'default' for z-ai/glm-5.1-preview", () => { + // given + const model = "z-ai/glm-5.1-preview" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("default") + }) + + test("returns 'default' for big-pickle/glm", () => { + // given + const model = "big-pickle/glm" + + // when + const source = getSisyphusJuniorPromptSource(model) + + // then + expect(source).toBe("default") + }) + test("returns 'gpt-5-4' for GitHub Copilot GPT 5.4", () => { // given const model = "github-copilot/gpt-5.4" @@ -557,6 +681,18 @@ describe("buildSisyphusJuniorPrompt", () => { expect(prompt).toContain("Do not use `apply_patch`") }) + test("GLM harness model uses GLM prompt", () => { + // given + const model = "z-ai/glm-5" + + // when + const prompt = buildSisyphusJuniorPrompt(model, false) + + // then + expect(prompt).toContain("GLM context priorities") + expect(prompt).toContain("") + }) + test("GPT 5.3 Codex model uses GPT-5.3-codex prompt", () => { // given const model = 
"openai/gpt-5.3-codex" diff --git a/src/agents/sisyphus-junior/index.ts b/src/agents/sisyphus-junior/index.ts index 5232b23fdd1..f866bdf7cb1 100644 --- a/src/agents/sisyphus-junior/index.ts +++ b/src/agents/sisyphus-junior/index.ts @@ -1,4 +1,5 @@ export { buildDefaultSisyphusJuniorPrompt } from "./default" +export { buildGlmSisyphusJuniorPrompt } from "./glm" export { buildKimiK26SisyphusJuniorPrompt } from "./kimi-k2-6" export { buildGptSisyphusJuniorPrompt } from "./gpt" export { buildGpt54SisyphusJuniorPrompt } from "./gpt-5-4" diff --git a/src/agents/sisyphus.glm-routing.test.ts b/src/agents/sisyphus.glm-routing.test.ts new file mode 100644 index 00000000000..f8de73c1c36 --- /dev/null +++ b/src/agents/sisyphus.glm-routing.test.ts @@ -0,0 +1,56 @@ +/// + +import { describe, expect, test } from "bun:test"; +import { createSisyphusAgent } from "./sisyphus"; + +function getPrompt(model: string): string { + const agent = createSisyphusAgent(model); + return agent.prompt ?? ""; +} + +describe("createSisyphusAgent - GLM routing", () => { + test("#given GLM harness models #then routes to the GLM prompt builder", () => { + const models = [ + "opencode-go/glm-5", + "opencode-go/glm-5.1", + "opencode-go/glm-5-1", + "opencode-go/glm-5-turbo", + "opencode-go/glm5-turbo", + "zai/glm-5.1:thinking", + "zai/glm-5v-turbo", + "zai/glm5v-turbo", + ]; + + for (const model of models) { + const prompt = getPrompt(model); + + expect(prompt).toContain(".sisyphus/state/"); + expect(prompt).toContain(""); + expect(prompt).toContain("goal.md"); + } + }); + + test("#given excluded GLM-like models #then does not route to the GLM prompt builder", () => { + const models = ["zai-coding-plan/glm-4.6v", "zai/glm-5.1-preview", "big-pickle/glm"]; + + for (const model of models) { + const prompt = getPrompt(model); + + expect(prompt).not.toContain(".sisyphus/state/"); + expect(prompt).not.toContain(""); + expect(prompt).not.toContain("goal.md"); + expect(prompt).not.toContain("verification.md"); + 
} + }); + + test("#given Kimi model #then keeps Kimi markers and avoids GLM ledger markers", () => { + const prompt = getPrompt("moonshotai/kimi-k2.6"); + + expect(prompt).toContain("Toggle RL"); + expect(prompt).toContain("K2.x post-training context"); + expect(prompt).not.toContain(".sisyphus/state/"); + expect(prompt).not.toContain(""); + expect(prompt).not.toContain("goal.md"); + expect(prompt).not.toContain("verification.md"); + }); +}); diff --git a/src/agents/sisyphus.ts b/src/agents/sisyphus.ts index 2ecda49707c..04f6d183a45 100644 --- a/src/agents/sisyphus.ts +++ b/src/agents/sisyphus.ts @@ -7,6 +7,7 @@ import { isGptNativeSisyphusModel, isClaudeOpus47Model, isKimiK2Model, + isGlmSisyphusHarnessModel, } from "./types"; import { buildGeminiToolMandate, @@ -17,6 +18,7 @@ import { buildGeminiToolCallExamples, } from "./sisyphus/gemini"; import { buildClaudeOpus47SisyphusPrompt } from "./sisyphus/claude-opus-4-7"; +import { buildGlmSisyphusPrompt } from "./sisyphus/glm"; import { buildGpt54SisyphusPrompt } from "./sisyphus/gpt-5-4"; import { buildGpt55SisyphusPrompt } from "./sisyphus/gpt-5-5"; import { buildKimiK26SisyphusPrompt } from "./sisyphus/kimi-k2-6"; @@ -518,6 +520,32 @@ export function createSisyphusAgent( }; } + if (isGlmSisyphusHarnessModel(model)) { + const prompt = buildGlmSisyphusPrompt( + model, + agents, + tools, + skills, + categories, + useTaskSystem, + ); + return { + description: + "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. 
(Sisyphus - OhMyOpenCode)", + mode: MODE, + model, + maxTokens: 64000, + prompt, + color: "#00CED1", + permission: { + question: "allow", + call_omo_agent: "deny", + ...getFrontierToolSchemaPermission(model), + ...getGptApplyPatchPermission(model), + } as AgentConfig["permission"], + }; + } + if (isGpt5_5Model(model)) { const prompt = buildGpt55SisyphusPrompt( model, diff --git a/src/agents/sisyphus/glm.test.ts b/src/agents/sisyphus/glm.test.ts new file mode 100644 index 00000000000..6c6e9441ec8 --- /dev/null +++ b/src/agents/sisyphus/glm.test.ts @@ -0,0 +1,105 @@ +/// + +import { describe, test, expect } from "bun:test"; +import { buildGlmSisyphusPrompt } from "./glm"; +import { buildKimiK26SisyphusPrompt } from "./kimi-k2-6"; + +const MODEL = "z-ai/glm-5"; + +function buildEmptyGlmPrompt(useTaskSystem = false): string { + return buildGlmSisyphusPrompt(MODEL, [], [], [], [], useTaskSystem); +} + +describe("buildGlmSisyphusPrompt - Small_Context_Working_Memory block", () => { + test("#given empty inputs #then prompt contains exactly one Small_Context_Working_Memory block", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + const openCount = prompt.split("").length - 1; + const closeCount = + prompt.split("").length - 1; + expect(openCount).toBe(1); + expect(closeCount).toBe(1); + }); + + test("#given empty inputs #then prompt anchors slices under .sisyphus/state/{plan-or-session}/", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt).toContain(".sisyphus/state/{plan-or-session}/"); + }); + + test("#given empty inputs #then prompt declares all five state slice file names", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt).toContain("goal.md"); + expect(prompt).toContain("decisions.md"); + expect(prompt).toContain("files.md"); + expect(prompt).toContain("blockers.md"); + expect(prompt).toContain("verification.md"); + }); + + test("#given empty inputs 
#then prompt encodes the 500-token soft slice target", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt).toContain("500"); + }); + + test("#given empty inputs #then prompt caps slice reads at 4 per turn", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt).toMatch(/AT MOST 4 slices/); + }); + + test("#given empty inputs #then prompt requires relevant-slice-only reads", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt.toLowerCase()).toContain("relevant-slice-only"); + }); + + test("#given empty inputs #then prompt explains that missing slices mean first run", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt.toLowerCase()).toContain("first run"); + }); +}); + +describe("buildGlmSisyphusPrompt - divergence from Kimi prompt", () => { + test("#given empty inputs #then GLM prompt does not mention Toggle RL", () => { + // given / when + const prompt = buildEmptyGlmPrompt(); + + // then + expect(prompt).not.toContain("Toggle RL"); + }); + + test("#given identical empty inputs #then GLM prompt is shorter than the Kimi prompt", () => { + // given + const glmPrompt = buildGlmSisyphusPrompt(MODEL, [], [], [], [], false); + const kimiPrompt = buildKimiK26SisyphusPrompt(MODEL, [], [], [], [], false); + + // then + expect(glmPrompt.length).toBeLessThan(kimiPrompt.length); + }); + + test("#given empty inputs with task system enabled #then GLM prompt is still shorter than Kimi prompt", () => { + // given + const glmPrompt = buildGlmSisyphusPrompt(MODEL, [], [], [], [], true); + const kimiPrompt = buildKimiK26SisyphusPrompt(MODEL, [], [], [], [], true); + + // then + expect(glmPrompt.length).toBeLessThan(kimiPrompt.length); + }); +}); diff --git a/src/agents/sisyphus/glm.ts b/src/agents/sisyphus/glm.ts new file mode 100644 index 00000000000..416edd37ba6 --- /dev/null +++ b/src/agents/sisyphus/glm.ts @@ -0,0 
+1,387 @@ +/** + * GLM-tuned Sisyphus prompt builder. + * + * Design goals: + * - Base structure mirrors `default.ts` (Phase 0/1/2A/2B/2C/3 + helper sections), + * keeping GLM behavior close to Claude/default rather than Kimi-style 8-block prompts. + * - Diverges from the default in exactly one place: a `` + * block that points GLM at the lightweight `.sisyphus/state/{plan-or-session}/` slice + * convention introduced for GLM tuning. + * - State slices are read-on-demand context, not new persistence infrastructure. + * The harness does not create or read these files; the agent does, only when it has + * something concrete to record. + */ + +import type { + AvailableAgent, + AvailableTool, + AvailableSkill, + AvailableCategory, +} from "../dynamic-agent-prompt-builder"; +import { + buildKeyTriggersSection, + buildToolSelectionTable, + buildExploreSection, + buildLibrarianSection, + buildDelegationTable, + buildCategorySkillsDelegationGuide, + buildOracleSection, + buildHardBlocksSection, + buildAntiPatternsSection, + buildParallelDelegationSection, + buildNonClaudePlannerSection, + buildAntiDuplicationSection, + categorizeTools, +} from "../dynamic-agent-prompt-builder"; +import { buildTaskManagementSection } from "./default"; + +export function buildGlmSisyphusPrompt( + model: string, + availableAgents: AvailableAgent[], + availableTools: AvailableTool[] = [], + availableSkills: AvailableSkill[] = [], + availableCategories: AvailableCategory[] = [], + useTaskSystem = false, +): string { + const keyTriggers = buildKeyTriggersSection(availableAgents, availableSkills); + const toolSelection = buildToolSelectionTable( + availableAgents, + availableTools, + availableSkills, + ); + const exploreSection = buildExploreSection(availableAgents); + const librarianSection = buildLibrarianSection(availableAgents); + const categorySkillsGuide = buildCategorySkillsDelegationGuide( + availableCategories, + availableSkills, + ); + const delegationTable = 
buildDelegationTable(availableAgents); + const oracleSection = buildOracleSection(availableAgents); + const hardBlocks = buildHardBlocksSection(); + const antiPatterns = buildAntiPatternsSection(); + const parallelDelegationSection = buildParallelDelegationSection( + model, + availableCategories, + ); + const nonClaudePlannerSection = buildNonClaudePlannerSection(model); + const taskManagementSection = buildTaskManagementSection(useTaskSystem); + const todoHookNote = useTaskSystem + ? "YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" + : "YOUR TODO CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TODO CONTINUATION])"; + + return ` +You are "Sisyphus" - Powerful AI Agent with orchestration capabilities from OhMyOpenCode. + +**Identity**: SF Bay Area engineer. Work, delegate, verify, ship. No AI slop. + +**Operating Mode**: You NEVER work alone when specialists are available. Frontend work → delegate. Deep research → parallel background agents. Complex architecture → consult Oracle. + +**Implementation Gate**: Follow user instructions. NEVER START IMPLEMENTING unless the user explicitly asks. ${todoHookNote} - if no implementation request, NEVER start work. + + + +## Working Memory via Small Context Slices + +GLM keeps a lightweight working memory under \`.sisyphus/state/{plan-or-session}/\` so continuity across turns does not require re-reading the full plan file or scrolling old messages. The directory key is the active plan name when one is present (\`.sisyphus/plans/{plan-name}.md\`), otherwise the current session label. + +### State slice files (created by you, only when needed) + +- \`goal.md\` - the active user goal in plain language: what is being built and what success looks like. +- \`decisions.md\` - architectural and routing choices already made, with one-line rationale. +- \`files.md\` - paths you have edited or that are part of the current working set. 
+- \`blockers.md\` - open questions, unresolved errors, or items waiting on user or specialist. +- \`verification.md\` - lsp/test/build evidence captured during this session. + +### Slice budget and read protocol + +- Treat every slice as a small context with a soft target of about 500 tokens. Keep entries terse and append-only. +- Read AT MOST 4 slices per turn. Pick only the slices that are directly relevant to what you are about to do; never load the full set "to be safe". +- Relevant-slice-only: if the current move does not depend on a slice, do not read it. +- Missing files means this is the first run for the current plan/session. Proceed without them and create slices only when you have something concrete to record. +- Slice reads substitute for re-reading the plan file or prior turns. They never substitute for actual code reads or tool output. + +### Slice write protocol + +- Append the new line(s) needed; do not rewrite the whole file. +- Update \`goal.md\` when the goal or scope changes; \`decisions.md\` when you pick a routing or architectural option; \`files.md\` when the working set shifts; \`blockers.md\` when something blocks you; \`verification.md\` when you run lsp/tests/build. +- Never create the \`.sisyphus/state\` directory speculatively. Only when a real state update is required. + + + + +## Phase 0 - Intent Gate (EVERY message) + +${keyTriggers} + + +### Step 0: Verbalize Intent (BEFORE Classification) + +Before classifying the task, identify what the user actually wants from you as an orchestrator. Map the surface form to the true intent, then announce your routing decision in one short line. 
+ +**Intent → Routing Map:** + +| Surface Form | True Intent | Routing | +|---|---|---| +| "explain X", "how does Y work" | Research/understanding | explore/librarian → synthesize → answer | +| "implement X", "add Y", "create Z" | Implementation (explicit) | plan → delegate or execute | +| "look into X", "check Y", "investigate" | Investigation | explore → report findings | +| "what do you think about X?" | Evaluation | evaluate → propose → wait for confirmation | +| "I'm seeing error X" / "Y is broken" | Fix needed | diagnose → fix minimally | +| "refactor", "improve", "clean up" | Open-ended change | assess codebase first → propose approach | + +**Verbalize before proceeding:** + +> "I detect [research / implementation / investigation / evaluation / fix / open-ended] intent - [reason]. My approach: [plan]." + +This anchors routing. It does NOT commit you to implementation - only the user's explicit request does. + + +### Step 1: Classify Request Type + +- **Trivial** (single file, known location) → direct tools, unless a Key Trigger applies +- **Explicit** (specific file/line, clear command) → execute directly +- **Exploratory** ("How does X work?") → fire 1-3 explore agents in parallel + tools +- **Open-ended** ("Improve", "Refactor", "Add feature") → assess codebase first +- **Ambiguous** (unclear scope) → ask ONE clarifying question + +### Step 1.5: Turn-Local Intent Reset + +Reclassify intent from the CURRENT user message only. Never auto-carry implementation mode from prior turns. + +- Question/explanation/investigation → answer or analyze ONLY. No todos. No file edits. +- Still gathering context → confirm context first; do not start implementation yet. 
+ +### Step 2: Check for Ambiguity + +- Single valid interpretation → proceed +- Multiple interpretations, similar effort → proceed with reasonable default, note assumption +- Multiple interpretations, 2x+ effort difference → MUST ask +- Missing critical info → MUST ask +- User's design seems flawed → MUST raise concern before implementing + +### Step 2.5: Context-Completion Gate (BEFORE Implementation) + +Implement only when ALL are true: +1. Current message contains an explicit implementation verb (implement/add/create/fix/change/write). +2. Scope is concrete enough to execute without guessing. +3. No blocking specialist result is pending (especially Oracle). + +If any condition fails, do research/clarification only, then wait. + +### Step 3: Validate Before Acting + +**Delegation Check (mandatory before acting directly):** +1. Is there a specialized agent that fits this request? +2. If not, which \`task\` category fits (visual-engineering, ultrabrain, quick, etc.)? Which skills should ride along via \`load_skills\`? +3. Self only when the task is demonstrably trivial and local AND no category/specialist fits. + +**Default Bias: DELEGATE. Work yourself only when it is super simple.** + +**Vision/Image Constraint (GLM text-only models):** +- GLM-5, GLM-5.1, GLM-5-turbo are text-only models. They CANNOT render or analyze images, screenshots, PDFs, or visual content. +- When a task involves viewing/analyzing images or visual content, ALWAYS delegate to the \`multimodal-looker\` agent. NEVER attempt to use \`look_at\`, \`read\`, or screenshot tools on image files yourself. +- For browser visual testing (screenshot verification, UI diff), delegate to \`multimodal-looker\` or use \`visual-engineering\` category with \`playwright\` skill. 
+ +### When to Challenge the User + +If you observe a design that will cause obvious problems, contradicts established codebase patterns, or misunderstands existing code: raise the concern concisely, propose an alternative, ask whether to proceed. + +\`\`\` +I notice [observation]. This might cause [problem] because [reason]. +Alternative: [your suggestion]. +Should I proceed with your original request, or try the alternative? +\`\`\` + +--- + +## Phase 1 - Codebase Assessment (for open-ended tasks) + +Before following existing patterns, assess whether they are worth following. + +### Quick Assessment: +1. Check linter/formatter/type configs. +2. Sample 2-3 similar files for consistency. +3. Note project age signals (dependencies, patterns). + +### State Classification: + +- **Disciplined** (consistent patterns, configs, tests) → follow existing style strictly +- **Transitional** (mixed patterns) → ask which pattern to follow +- **Legacy/Chaotic** (no consistency) → propose conventions, get confirmation +- **Greenfield** → modern best practices + +Different patterns may serve different purposes (intentional). Migration may be in progress. Verify before assuming. + +--- + +## Phase 2A - Exploration & Research + +${toolSelection} + +${exploreSection} + +${librarianSection} + +### Parallel Execution (DEFAULT behavior) + +**Parallelize EVERYTHING. Independent reads, searches, and agents run SIMULTANEOUSLY.** + + +- Parallelize independent tool calls: multiple file reads, grep searches, agent fires - all at once +- Explore/Librarian = background grep. 
ALWAYS \`run_in_background=true\`, ALWAYS parallel +- Fire 2-5 explore/librarian agents in parallel for any non-trivial codebase question +- Parallelize independent file reads +- After any write/edit tool call, briefly restate what changed, where, and what validation follows +- Prefer tools over internal knowledge whenever you need specific data (files, configs, patterns) + + +**Explore/Librarian = Grep, not consultants.** + +Each agent prompt should include: +- [CONTEXT]: what task, which modules, what approach +- [GOAL]: what decision the results unblock +- [DOWNSTREAM]: how you will use the results +- [REQUEST]: what to find, what format, what to skip + +### Background Result Collection: +1. Launch parallel agents → receive task_ids. +2. Continue only with non-overlapping work; otherwise END YOUR RESPONSE. +3. The system sends \`\` when tasks complete. +4. Collect via \`background_output(task_id="...")\` ONLY after the reminder. +5. Cancel disposable tasks individually via \`background_cancel(taskId="...")\`. + +${buildAntiDuplicationSection()} + +### Search Stop Conditions + +STOP searching when: enough context to proceed, info repeating across sources, 2 iterations with no new data, or direct answer found. **Time is precious.** + +--- + +## Phase 2B - Implementation + +### Pre-Implementation: +0. Find relevant skills and load them IMMEDIATELY via the \`skill\` tool. +1. 2+ steps → create todo list IMMEDIATELY, in detail. No announcements. +2. Mark current task \`in_progress\` before starting. +3. Mark \`completed\` as soon as done. Never batch. + +${categorySkillsGuide} + +${nonClaudePlannerSection} + +${parallelDelegationSection} + +${delegationTable} + +### Delegation Prompt Structure (ALL 6 sections required): + +\`\`\` +1. TASK: Atomic, specific goal +2. EXPECTED OUTCOME: Concrete deliverables with success criteria +3. REQUIRED TOOLS: Explicit tool whitelist +4. MUST DO: Exhaustive requirements - leave nothing implicit +5. 
MUST NOT DO: Forbidden actions +6. CONTEXT: File paths, existing patterns, constraints +\`\`\` + +After delegation: VERIFY against MUST DO/MUST NOT DO and existing patterns. Vague prompts → vague results. Be exhaustive. + +### Session Continuity (MANDATORY) + +Every \`task()\` returns a task_id. **USE IT** for follow-ups: +- Failed/incomplete → \`task_id="{id}", prompt="Fix: {specific error}"\` +- Follow-up question → \`task_id="{id}", prompt="Also: {question}"\` +- Multi-turn with same agent → \`task_id="{id}"\` - never start fresh + +This preserves full context, avoids repeated exploration, saves 70%+ tokens. + +### Code Changes: +- Match existing patterns in disciplined codebases. +- Propose approach first in chaotic codebases. +- Never suppress type errors with \`as any\`, \`@ts-ignore\`, \`@ts-expect-error\`. +- Never commit unless explicitly requested. +- **Bugfix Rule**: fix minimally. Never refactor while fixing. + +### Verification: + +Run \`lsp_diagnostics\` on changed files at the end of each logical task unit, before marking a todo complete, and before reporting completion. If the project has build/test commands, run them at task completion. + +### Evidence Requirements (task NOT complete without these): + +- **File edit** → \`lsp_diagnostics\` clean on changed files +- **Build command** → exit code 0 +- **Test run** → pass (or pre-existing failures explicitly noted) +- **Delegation** → result received and verified + +**NO EVIDENCE = NOT COMPLETE.** + +--- + +## Phase 2C - Failure Recovery + +1. Fix root causes, not symptoms. +2. Re-verify after every fix attempt. +3. Never shotgun debug. +4. After 3 consecutive failures: stop, revert to last known working state, document, consult Oracle, then ask the user if Oracle cannot resolve. + +Never leave code in a broken state. Never delete failing tests to "pass". 
+ +--- + +## Phase 3 - Completion + +A task is complete when: +- All planned todos are done +- Diagnostics are clean on changed files +- Build passes (if applicable) +- The user's original request is fully addressed + +If verification fails: fix issues you caused. Do NOT fix pre-existing issues unless asked. Report: "Done. Note: N pre-existing errors unrelated to my changes." + +### Before Delivering Final Answer: +- Oracle running → end your response and wait for the completion notification first. +- Cancel disposable tasks individually via \`background_cancel(taskId="...")\`. + + +${oracleSection} + +${taskManagementSection} + + +## Communication Style + +### Be Concise +- Start work immediately. No acknowledgments. +- Answer directly without preamble. +- Don't summarize what you did unless asked. +- Don't explain code unless asked. + +### No Flattery +Never start responses with praise of the user's input. + +### No Status Updates +Skip casual acknowledgments. Use todos for tracking. + +### When User is Wrong +State your concern and the alternative concisely. Ask if they want to proceed anyway. + +### Match User's Style +Terse user → terse you. Detail wanted → detail given. 
+ + + +${hardBlocks} + +${antiPatterns} + +## Soft Guidelines + +- Prefer existing libraries over new dependencies +- Prefer small, focused changes over large refactors +- When uncertain about scope, ask + +`; +} + +export { categorizeTools }; diff --git a/src/agents/types.test.ts b/src/agents/types.test.ts index 13cb0bf932d..7e528f78d72 100644 --- a/src/agents/types.test.ts +++ b/src/agents/types.test.ts @@ -3,6 +3,7 @@ import { isGptModel, isGeminiModel, isGlmModel, + isGlmSisyphusHarnessModel, isGptNativeSisyphusModel, isMiniMaxModel, } from "./types"; @@ -141,6 +142,43 @@ describe("isGlmModel", () => { }); }); +describe("isGlmSisyphusHarnessModel", () => { + test("#given exact GLM Sisyphus harness families #then returns true", () => { + expect(isGlmSisyphusHarnessModel("z-ai/glm-5")).toBe(true); + expect(isGlmSisyphusHarnessModel("vercel/zai/glm-5")).toBe(true); + expect(isGlmSisyphusHarnessModel("zai-coding-plan/glm-5")).toBe(true); + expect(isGlmSisyphusHarnessModel("z-ai/glm-5.1")).toBe(true); + expect(isGlmSisyphusHarnessModel("zai-org/glm-5.1:thinking")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm-5-1")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm5.1")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode/glm-5-turbo")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm5-turbo")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode/glm-5v-turbo")).toBe(true); + expect(isGlmSisyphusHarnessModel("opencode-go/glm5v-turbo")).toBe(true); + }); + + test("#given compact plain glm5 #then returns false", () => { + expect(isGlmSisyphusHarnessModel("z-ai/glm5")).toBe(false); + expect(isGlmSisyphusHarnessModel("glm5")).toBe(false); + expect(isGlmSisyphusHarnessModel("z-ai/glm5:thinking")).toBe(false); + }); + + test("#given non-target GLM variants #then returns false", () => { + expect(isGlmSisyphusHarnessModel("opencode/glm-4.6v")).toBe(false); + 
/**
 * Exact bare model names routed to the GLM Sisyphus harness.
 *
 * Accepted spellings: `glm-5`; the 5.1 family written as `glm-5.1`, `glm-5-1`,
 * `glm5.1` or `glm5-1` (each optionally suffixed with `:thinking`); and the
 * turbo variants `glm-5-turbo`, `glm5-turbo`, `glm-5v-turbo`, `glm5v-turbo`.
 * The pattern is anchored at both ends, so names with extra suffixes such as
 * `glm-5.1-preview`, and the compact plain `glm5`, do not match.
 */
const GLM_SISYPHUS_HARNESS_RE =
  /^(?:glm-5|glm-5[.-]1(?::thinking)?|glm5[.-]1(?::thinking)?|glm-5-turbo|glm5-turbo|glm-5v-turbo|glm5v-turbo)$/;

/**
 * Reports whether `model` belongs to one of the GLM families that get the
 * GLM-specific Sisyphus harness.
 *
 * @param model - Model identifier, typically provider-qualified (e.g. `z-ai/glm-5.1`).
 * @returns `true` when the lower-cased result of `extractModelName(model)`
 *   matches {@link GLM_SISYPHUS_HARNESS_RE} exactly.
 */
export function isGlmSisyphusHarnessModel(model: string): boolean {
  // NOTE(review): extractModelName presumably strips the provider prefix — confirm against its definition.
  return GLM_SISYPHUS_HARNESS_RE.test(extractModelName(model).toLowerCase());
}
models (excludes VLM) - Add isGlmVisionModel() for GLM VLM variants (glm-4.6v, glm-5v-turbo) - Oracle/Metis/Momus: GLM-5+ text → thinking: { type: enabled }, Claude → budgetTokens - Sisyphus-Junior: GLM → thinking: { type: enabled } (was bare base) - Sisyphus: GLM overlay injection + thinking config, fact-checked comments - Update stale test: sisyphus-junior GLM now returns thinking - Add 100-test factory benchmark (5 agents × 7 GLM variants + cross-agent guards) - Add runtime benchmark script (scripts/benchmark-glm-thinking.ts) Benchmark: 100 factory tests pass, 452 agent tests pass, typecheck clean Verified: No GLM text model receives budgetTokens across any agent Refs: #3210, #3256, #3568 --- scripts/benchmark-glm-thinking.ts | 207 ++++++++++++ src/agents/glm-thinking-benchmark.test.ts | 230 +++++++++++++ src/agents/metis.ts | 10 +- src/agents/momus.ts | 6 +- src/agents/oracle.ts | 6 +- src/agents/sisyphus-junior/agent.ts | 2 +- src/agents/sisyphus-junior/index.test.ts | 4 +- src/agents/sisyphus.glm-routing.test.ts | 7 + src/agents/sisyphus.ts | 48 ++- src/agents/sisyphus/glm.test.ts | 103 +++--- src/agents/sisyphus/glm.ts | 385 ++-------------------- src/agents/types.test.ts | 46 +++ src/agents/types.ts | 15 + 13 files changed, 625 insertions(+), 444 deletions(-) create mode 100644 scripts/benchmark-glm-thinking.ts create mode 100644 src/agents/glm-thinking-benchmark.test.ts diff --git a/scripts/benchmark-glm-thinking.ts b/scripts/benchmark-glm-thinking.ts new file mode 100644 index 00000000000..c56ecc8d35f --- /dev/null +++ b/scripts/benchmark-glm-thinking.ts @@ -0,0 +1,207 @@ +#!/usr/bin/env bun +/** + * GLM Thinking Runtime Benchmark + * + * Measures GLM-5.1 API performance with thinking ON vs OFF. + * Run via: bun run scripts/benchmark-glm-thinking.ts [--model z-ai/glm-5.1] [--iterations 3] + * + * Prerequisites: OpenCode must be configured with a GLM provider. 
/**
 * GLM Thinking Runtime Benchmark.
 *
 * Phase 1 shells out to `bun test` for the factory-level config tests and
 * reports the pass/fail counts. Phase 2 is a placeholder: `callModelDirect`
 * performs no network call and only returns a stub result.
 *
 * Output: a JSON `BenchmarkSummary` on stdout, human-readable progress on stderr.
 */

const DEFAULT_MODEL = "z-ai/glm-5.1"
const DEFAULT_ITERATIONS = 3
const BENCHMARK_PROMPT = `Analyze this function and explain its time complexity, then suggest an optimization:

function findPairs(arr: number[], target: number): [number, number][] {
  const pairs: [number, number][] = []
  for (let i = 0; i < arr.length; i++) {
    for (let j = i + 1; j < arr.length; j++) {
      if (arr[i] + arr[j] === target) {
        pairs.push([arr[i], arr[j]])
      }
    }
  }
  return pairs
}`

/** One measured (or stubbed) model invocation. */
interface BenchmarkResult {
  model: string
  thinkingEnabled: boolean
  iteration: number
  timeToFirstTokenMs: number | null
  totalTimeMs: number
  thinkingTokens: number | null
  responseTokens: number | null
  error: string | null
}

/** Pass/fail counts extracted from the `bun test` summary. */
interface FactoryTestResults {
  totalTests: number
  passed: number
  failed: number
}

/** Shape of the JSON document written to stdout. */
interface BenchmarkSummary {
  model: string
  timestamp: string
  gitBranch: string
  gitCommit: string
  thinkingOn: {
    avgTotalTimeMs: number
    avgTTFTMs: number | null
    avgThinkingTokens: number | null
    results: BenchmarkResult[]
  }
  thinkingOff: {
    avgTotalTimeMs: number
    avgTTFTMs: number | null
    results: BenchmarkResult[]
  }
  factoryTestResults: FactoryTestResults
}

/**
 * Reads `--model <id>` and `--iterations <n>` from `process.argv`.
 *
 * @returns The requested model and iteration count. Missing flags use the
 *   defaults; an `--iterations` value that is not a positive integer is
 *   ignored (with a warning on stderr) instead of propagating `NaN`.
 */
function parseArgs(): { model: string; iterations: number } {
  const args = process.argv.slice(2)
  let model = DEFAULT_MODEL
  let iterations = DEFAULT_ITERATIONS

  for (let i = 0; i < args.length; i++) {
    const flag = args[i]
    const value = args[i + 1]
    // A flag with no (or an empty) value is skipped.
    if (!value) continue

    if (flag === "--model") {
      model = value
      i++
    } else if (flag === "--iterations") {
      const parsed = Number.parseInt(value, 10)
      if (Number.isInteger(parsed) && parsed > 0) {
        iterations = parsed
      } else {
        console.error(`Ignoring invalid --iterations value "${value}"; using ${DEFAULT_ITERATIONS}`)
      }
      i++
    }
  }

  return { model, iterations }
}

/**
 * Arithmetic mean of `values`.
 *
 * @returns The mean, or `0` for an empty list (avoids `0 / 0 === NaN`).
 */
function average(values: readonly number[]): number {
  if (values.length === 0) return 0
  return values.reduce((sum, value) => sum + value, 0) / values.length
}

/**
 * Resolves the current git branch and short commit hash.
 *
 * @returns Both values, or `"unknown"` for each when either git command fails.
 */
async function getGitInfo(): Promise<{ branch: string; commit: string }> {
  const { execSync } = await import("child_process")
  try {
    const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim()
    const commit = execSync("git rev-parse --short HEAD", { encoding: "utf-8" }).trim()
    return { branch, commit }
  } catch {
    return { branch: "unknown", commit: "unknown" }
  }
}

/**
 * Extracts the pass/fail counts from `bun test` output.
 *
 * The "N pass" and "M fail" counts are printed on separate lines, so the
 * pattern uses `[\s\S]` (which matches newlines) rather than `.`.
 *
 * @returns The parsed counts, or `null` when no summary is present.
 */
function parseBunTestSummary(output: string): FactoryTestResults | null {
  const match = /(\d+) pass[\s\S]*?(\d+) fail/.exec(output)
  if (!match) return null
  const passed = Number.parseInt(match[1], 10)
  const failed = Number.parseInt(match[2], 10)
  return { totalTests: passed + failed, passed, failed }
}

/** Returns the captured stdout attached to a failed `execSync` error, if any. */
function stdoutOfExecError(error: unknown): string {
  if (typeof error === "object" && error !== null && "stdout" in error) {
    return typeof error.stdout === "string" ? error.stdout : ""
  }
  return ""
}

/**
 * Runs the factory-level config tests through `bun test` and counts results.
 *
 * `execSync` throws when the command exits non-zero (i.e. when any test
 * fails); the captured output is still parsed in that case so real failure
 * counts are reported.
 *
 * @returns Parsed counts; all zeros when the run succeeded but printed no
 *   summary; `failed: -1` when the command failed and printed no summary.
 */
async function runFactoryBenchmark(): Promise<FactoryTestResults> {
  const { execSync } = await import("child_process")
  try {
    const output = execSync(
      "bun test src/agents/glm-thinking-benchmark.test.ts src/agents/types.test.ts 2>&1",
      { encoding: "utf-8" }
    )
    return parseBunTestSummary(output) ?? { totalTests: 0, passed: 0, failed: 0 }
  } catch (error: unknown) {
    return parseBunTestSummary(stdoutOfExecError(error)) ?? { totalTests: 0, passed: 0, failed: -1 }
  }
}

/**
 * Placeholder for a real model call; performs no I/O.
 *
 * @returns A stub result with zero timings and an explanatory `error` string.
 *   `iteration` is `0`; the caller overwrites it.
 */
async function callModelDirect(_model: string, _prompt: string, _thinking: boolean): Promise<BenchmarkResult> {
  // NOTE: Direct API calls require provider credentials configured in OpenCode.
  // This is a placeholder that measures agent factory overhead only.
  //
  // For actual runtime benchmarks, run through OpenCode:
  //   opencode --model z-ai/glm-5.1 --prompt "your task here"
  //
  // Or use curl with Z.AI API:
  //   curl -X POST https://open.bigmodel.cn/api/paas/v4/chat/completions \
  //     -H "Authorization: Bearer $ZAI_API_KEY" \
  //     -d '{"model": "glm-5.1", "messages": [...], "thinking": {"type": "enabled"}}'

  return {
    model: _model,
    thinkingEnabled: _thinking,
    iteration: 0,
    timeToFirstTokenMs: null,
    totalTimeMs: 0,
    thinkingTokens: null,
    responseTokens: null,
    error: "Direct API calls require OpenCode runtime. Use factory benchmark results for config verification.",
  }
}

/** Runs both phases, prints the JSON summary to stdout and a digest to stderr. */
async function main() {
  const { model, iterations } = parseArgs()
  const git = await getGitInfo()

  console.error(`\n=== GLM Thinking Benchmark ===`)
  console.error(`Model: ${model}`)
  console.error(`Iterations: ${iterations}`)
  console.error(`Branch: ${git.branch} (${git.commit})`)
  console.error()

  // Phase 1: Factory-level benchmark (no API calls needed)
  console.error("Phase 1: Factory config correctness benchmark...")
  const factoryResults = await runFactoryBenchmark()
  console.error(`  Factory tests: ${factoryResults.passed}/${factoryResults.totalTests} passed`)

  // Phase 2: Runtime benchmark placeholder
  console.error("\nPhase 2: Runtime benchmark (requires OpenCode runtime)...")
  const thinkingOnResults: BenchmarkResult[] = []
  const thinkingOffResults: BenchmarkResult[] = []

  for (let i = 0; i < iterations; i++) {
    const onResult = await callModelDirect(model, BENCHMARK_PROMPT, true)
    onResult.iteration = i + 1
    thinkingOnResults.push(onResult)

    const offResult = await callModelDirect(model, BENCHMARK_PROMPT, false)
    offResult.iteration = i + 1
    thinkingOffResults.push(offResult)
  }

  // Summary
  const summary: BenchmarkSummary = {
    model,
    timestamp: new Date().toISOString(),
    gitBranch: git.branch,
    gitCommit: git.commit,
    thinkingOn: {
      avgTotalTimeMs: average(thinkingOnResults.map((result) => result.totalTimeMs)),
      avgTTFTMs: null,
      avgThinkingTokens: null,
      results: thinkingOnResults,
    },
    thinkingOff: {
      avgTotalTimeMs: average(thinkingOffResults.map((result) => result.totalTimeMs)),
      avgTTFTMs: null,
      results: thinkingOffResults,
    },
    // Key must match BenchmarkSummary; the bare `factoryResults` shorthand did not.
    factoryTestResults: factoryResults,
  }

  // Output JSON to stdout
  console.log(JSON.stringify(summary, null, 2))

  // Human-readable summary to stderr
  console.error("\n=== Summary ===")
  console.error(`Factory benchmark: ${factoryResults.passed}/${factoryResults.totalTests} tests passed`)
  console.error(`  - GLM-5+ text models: thinking enabled, NO budgetTokens`)
  console.error(`  - Claude models: thinking enabled with budgetTokens`)
  console.error(`  - GPT models: reasoningEffort, no thinking`)
  console.error(`  - GLM VLM models: default path (budgetTokens)`)
  console.error()
  console.error("Runtime benchmark: skipped (requires OpenCode runtime)")
  console.error("  To run runtime benchmark manually:")
  console.error("    opencode --model z-ai/glm-5.1 --prompt 'Explain time complexity of this function: ...'")
  console.error()
  console.error("Full results written to stdout (JSON)")
}

main().catch((error: unknown) => {
  console.error(error)
  // Surface the failure to callers/CI instead of exiting 0.
  process.exitCode = 1
})
/**
 * GLM Thinking Configuration Benchmark
 *
 * Factory-level checks of the `thinking` / `reasoningEffort` fields that each
 * agent factory returns for GLM, Claude and GPT model IDs.
 *
 * Expectations asserted below:
 *   1. GLM text models → thinking equals `{ type: "enabled" }` (no budgetTokens)
 *   2. GLM VLM models  → thinking defined and carrying budgetTokens (Oracle/Metis/Momus only)
 *   3. Claude models   → thinking defined and carrying budgetTokens
 *   4. GPT models      → reasoningEffort defined
 *
 * NOTE(review): Sisyphus and Sisyphus-Junior have no GLM VLM cases here. Those
 * factories appear to branch on the harness check, which also accepts
 * `glm-5v-turbo`, so VLM handling may differ from Oracle/Metis/Momus — confirm
 * whether that is intended and add coverage.
 */
import { describe, test, expect } from "bun:test"
import { createOracleAgent } from "./oracle"
import { createMetisAgent } from "./metis"
import { createMomusAgent } from "./momus"
import { createSisyphusAgent } from "./sisyphus"
import { createSisyphusJuniorAgentWithOverrides as createSisyphusJuniorAgent } from "./sisyphus-junior/agent"

// --- Model IDs to test (with provider prefixes as they appear at runtime) ---

const GLM_TEXT_MODELS = [
  "z-ai/glm-5",
  "opencode/glm-5",
  "opencode-go/glm-5",
  "zai-coding-plan/glm-5",
  "z-ai/glm-5.1",
  "z-ai/glm-5-turbo",
  "vercel/zai/glm-5",
] as const

const GLM_VLM_MODELS = [
  "opencode/glm-4.6v",
  "zai-coding-plan/glm-4.6v",
  "opencode/glm-5v-turbo",
  "opencode-go/glm5v-turbo",
] as const

const CLAUDE_MODELS = [
  "anthropic/claude-opus-4-7",
  "anthropic/claude-sonnet-4-6",
] as const

const GPT_MODELS = [
  "openai/gpt-5.4",
  "openai/gpt-5.5",
] as const

// --- Helpers ---

/** True when `thinking` is a non-null object that has a `budgetTokens` key. */
function hasBudgetTokens(thinking: unknown): boolean {
  return typeof thinking === "object" && thinking !== null && "budgetTokens" in thinking
}

/** Uniform "model ID in, agent config out" view over the five factories. */
type AgentFactory = (model: string) => ReturnType<typeof createOracleAgent>

const sisyphus: AgentFactory = (model) => createSisyphusAgent(model)
const sisyphusJunior: AgentFactory = (model) => createSisyphusJuniorAgent({ model })
const oracle: AgentFactory = (model) => createOracleAgent(model)
const metis: AgentFactory = (model) => createMetisAgent(model)
const momus: AgentFactory = (model) => createMomusAgent(model)

/** Registers one test per GLM text model: thinking is exactly `{ type: "enabled" }`. */
function expectGlmNativeThinking(create: AgentFactory): void {
  for (const model of GLM_TEXT_MODELS) {
    test(`#given ${model} #then thinking enabled without budgetTokens`, () => {
      const config = create(model)
      expect(config.thinking).toEqual({ type: "enabled" })
      expect(hasBudgetTokens(config.thinking)).toBe(false)
    })
  }
}

/** Registers one test per GLM VLM model: thinking is defined and carries budgetTokens. */
function expectVlmDefaultThinking(create: AgentFactory): void {
  for (const model of GLM_VLM_MODELS) {
    test(`#given ${model} (VLM) #then falls to default thinking with budgetTokens`, () => {
      const config = create(model)
      // VLM models are NOT thinking models, so they get the default path
      expect(config.thinking).toBeDefined()
      expect(hasBudgetTokens(config.thinking)).toBe(true)
    })
  }
}

/**
 * Registers one test per Claude model: thinking is defined and carries
 * budgetTokens. With `assertEnabledType`, also checks `thinking.type === "enabled"`.
 */
function expectClaudeBudgetedThinking(
  create: AgentFactory,
  options: { assertEnabledType?: boolean } = {},
): void {
  for (const model of CLAUDE_MODELS) {
    test(`#given ${model} #then thinking enabled with budgetTokens`, () => {
      const config = create(model)
      expect(config.thinking).toBeDefined()
      if (options.assertEnabledType) {
        expect((config.thinking as Record<string, unknown>)?.type).toBe("enabled")
      }
      expect(hasBudgetTokens(config.thinking)).toBe(true)
    })
  }
}

/** Registers one test per GPT model: reasoningEffort is defined. */
function expectGptReasoningEffort(create: AgentFactory): void {
  for (const model of GPT_MODELS) {
    test(`#given ${model} #then uses reasoningEffort not thinking`, () => {
      const config = create(model)
      expect(config.reasoningEffort).toBeDefined()
    })
  }
}

// === Benchmark: Sisyphus ===

describe("GLM Thinking Benchmark: Sisyphus", () => {
  expectGlmNativeThinking(sisyphus)
  expectClaudeBudgetedThinking(sisyphus, { assertEnabledType: true })
  expectGptReasoningEffort(sisyphus)
})

// === Benchmark: Sisyphus-Junior ===

describe("GLM Thinking Benchmark: Sisyphus-Junior", () => {
  expectGlmNativeThinking(sisyphusJunior)
  expectClaudeBudgetedThinking(sisyphusJunior)
  expectGptReasoningEffort(sisyphusJunior)
})

// === Benchmark: Oracle ===

describe("GLM Thinking Benchmark: Oracle", () => {
  expectGlmNativeThinking(oracle)
  expectVlmDefaultThinking(oracle)
  expectClaudeBudgetedThinking(oracle)
  expectGptReasoningEffort(oracle)
})

// === Benchmark: Metis ===

describe("GLM Thinking Benchmark: Metis", () => {
  expectGlmNativeThinking(metis)
  expectVlmDefaultThinking(metis)
  expectClaudeBudgetedThinking(metis)
})

// === Benchmark: Momus ===

describe("GLM Thinking Benchmark: Momus", () => {
  expectGlmNativeThinking(momus)
  expectVlmDefaultThinking(momus)
  expectClaudeBudgetedThinking(momus)
  expectGptReasoningEffort(momus)
})

// === Summary: No GLM text model should ever receive budgetTokens ===

describe("GLM Thinking Benchmark: Cross-agent budgetTokens guard", () => {
  const agentFactories: ReadonlyArray<{ name: string; fn: AgentFactory }> = [
    { name: "Sisyphus", fn: sisyphus },
    { name: "Sisyphus-Junior", fn: sisyphusJunior },
    { name: "Oracle", fn: oracle },
    { name: "Metis", fn: metis },
    { name: "Momus", fn: momus },
  ]

  for (const agent of agentFactories) {
    for (const model of GLM_TEXT_MODELS) {
      test(`#given ${agent.name} + ${model} #then NEVER receives budgetTokens`, () => {
        const config = agent.fn(model)
        expect(hasBudgetTokens(config.thinking)).toBe(false)
      })
    }
  }
})
(Metis - OhMyOpenCode)", mode: MODE, @@ -308,8 +309,13 @@ export function createMetisAgent(model: string): AgentConfig { temperature: 0.3, ...metisRestrictions, prompt: METIS_SYSTEM_PROMPT, - thinking: { type: "enabled", budgetTokens: 32000 }, } as AgentConfig + + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" } } as AgentConfig + } + + return { ...base, thinking: { type: "enabled", budgetTokens: 32000 } } as AgentConfig } createMetisAgent.mode = MODE diff --git a/src/agents/momus.ts b/src/agents/momus.ts index 0c5ea6496fd..bf89b2dfb55 100644 --- a/src/agents/momus.ts +++ b/src/agents/momus.ts @@ -1,6 +1,6 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; -import { isGptModel } from "./types"; +import { isGlmThinkingModel, isGptModel } from "./types"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -308,6 +308,10 @@ export function createMomusAgent(model: string): AgentConfig { } as AgentConfig; } + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" } } as AgentConfig; + } + return { ...base, thinking: { type: "enabled", budgetTokens: 32000 }, diff --git a/src/agents/oracle.ts b/src/agents/oracle.ts index 1779a0b16f1..5452c78e18e 100644 --- a/src/agents/oracle.ts +++ b/src/agents/oracle.ts @@ -1,6 +1,6 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; -import { isGpt5_5Model, isGptModel } from "./types"; +import { isGlmThinkingModel, isGpt5_5Model, isGptModel } from "./types"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -443,6 +443,10 @@ export function createOracleAgent(model: string): AgentConfig { } as AgentConfig; } + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" } } as AgentConfig; + } + return 
{ ...base, thinking: { type: "enabled", budgetTokens: 32000 }, diff --git a/src/agents/sisyphus-junior/agent.ts b/src/agents/sisyphus-junior/agent.ts index 520ec9e4e69..4474f163d25 100644 --- a/src/agents/sisyphus-junior/agent.ts +++ b/src/agents/sisyphus-junior/agent.ts @@ -159,7 +159,7 @@ export function createSisyphusJuniorAgentWithOverrides( } if (isGlmSisyphusHarnessModel(model)) { - return base as AgentConfig + return { ...base, thinking: { type: "enabled" } } as AgentConfig } return { diff --git a/src/agents/sisyphus-junior/index.test.ts b/src/agents/sisyphus-junior/index.test.ts index 1edacfbdfb5..fed83ca1870 100644 --- a/src/agents/sisyphus-junior/index.test.ts +++ b/src/agents/sisyphus-junior/index.test.ts @@ -169,7 +169,7 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { expect(result.thinking).toEqual({ type: "enabled", budgetTokens: 32000 }) }) - test("#given GLM reasoning model #when agent is created #then skips injected thinking", () => { + test("#given GLM reasoning model #when agent is created #then uses GLM-native thinking", () => { // given const override = { model: "z-ai/glm-5" } @@ -178,7 +178,7 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { // then expect(result.reasoningEffort).toBeUndefined() - expect(result.thinking).toBeUndefined() + expect(result.thinking).toEqual({ type: "enabled" }) }) }) diff --git a/src/agents/sisyphus.glm-routing.test.ts b/src/agents/sisyphus.glm-routing.test.ts index f8de73c1c36..5e82b2a9596 100644 --- a/src/agents/sisyphus.glm-routing.test.ts +++ b/src/agents/sisyphus.glm-routing.test.ts @@ -53,4 +53,11 @@ describe("createSisyphusAgent - GLM routing", () => { expect(prompt).not.toContain("goal.md"); expect(prompt).not.toContain("verification.md"); }); + + test("#given GLM harness model #then returns config with thinking enabled (no budgetTokens)", () => { + const agent = createSisyphusAgent("zai/glm-5.1"); + + expect(agent.thinking).toEqual({ type: "enabled" }); + expect((agent as 
Record).reasoningEffort).toBeUndefined(); + }); }); diff --git a/src/agents/sisyphus.ts b/src/agents/sisyphus.ts index 04f6d183a45..2e2cf1e0522 100644 --- a/src/agents/sisyphus.ts +++ b/src/agents/sisyphus.ts @@ -18,7 +18,7 @@ import { buildGeminiToolCallExamples, } from "./sisyphus/gemini"; import { buildClaudeOpus47SisyphusPrompt } from "./sisyphus/claude-opus-4-7"; -import { buildGlmSisyphusPrompt } from "./sisyphus/glm"; +import { buildGlmWorkingMemory, buildGlmVisionConstraint } from "./sisyphus/glm"; import { buildGpt54SisyphusPrompt } from "./sisyphus/gpt-5-4"; import { buildGpt55SisyphusPrompt } from "./sisyphus/gpt-5-5"; import { buildKimiK26SisyphusPrompt } from "./sisyphus/kimi-k2-6"; @@ -520,31 +520,6 @@ export function createSisyphusAgent( }; } - if (isGlmSisyphusHarnessModel(model)) { - const prompt = buildGlmSisyphusPrompt( - model, - agents, - tools, - skills, - categories, - useTaskSystem, - ); - return { - description: - "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. (Sisyphus - OhMyOpenCode)", - mode: MODE, - model, - maxTokens: 64000, - prompt, - color: "#00CED1", - permission: { - question: "allow", - call_omo_agent: "deny", - ...getFrontierToolSchemaPermission(model), - ...getGptApplyPatchPermission(model), - } as AgentConfig["permission"], - }; - } if (isGpt5_5Model(model)) { const prompt = buildGpt55SisyphusPrompt( @@ -658,6 +633,21 @@ export function createSisyphusAgent( ); } + if (isGlmSisyphusHarnessModel(model)) { + // 1. Working Memory - after Role to provide lightweight state convention + // prevents premature compaction by giving GLM on-demand context slices + prompt = prompt.replace( + "", + `\n\n${buildGlmWorkingMemory()}` + ); + + // 2. 
/// <reference types="bun-types" />

import { describe, test, expect } from "bun:test";
import { buildGlmWorkingMemory, buildGlmVisionConstraint } from "./glm";

// Tag literals below are restored from the test titles; the angle-bracket
// text had been stripped to "", which made the split-based counts measure
// characters and the toContain("") checks vacuous.
// NOTE(review): confirm the exact tag spellings against ./glm.
const WORKING_MEMORY_OPEN = "<Small_Context_Working_Memory>";
const WORKING_MEMORY_CLOSE = "</Small_Context_Working_Memory>";
const VISION_CONSTRAINT_OPEN = "<GLM_VISION_CONSTRAINT>";
const VISION_CONSTRAINT_CLOSE = "</GLM_VISION_CONSTRAINT>";

/** Number of non-overlapping occurrences of `needle` in `haystack`. */
function countOccurrences(haystack: string, needle: string): number {
  return haystack.split(needle).length - 1;
}

describe("buildGlmWorkingMemory", () => {
  test("#given call #then contains exactly one Small_Context_Working_Memory block", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(countOccurrences(block, WORKING_MEMORY_OPEN)).toBe(1);
    expect(countOccurrences(block, WORKING_MEMORY_CLOSE)).toBe(1);
  });

  test("#given call #then anchors slices under .sisyphus/state/{plan-or-session}/", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(block).toContain(".sisyphus/state/{plan-or-session}/");
  });

  test("#given call #then declares all five state slice file names", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(block).toContain("goal.md");
    expect(block).toContain("decisions.md");
    expect(block).toContain("files.md");
    expect(block).toContain("blockers.md");
    expect(block).toContain("verification.md");
  });

  test("#given call #then encodes the 500-token soft slice target", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(block).toContain("500");
  });

  test("#given call #then caps slice reads at 4 per turn", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(block).toMatch(/AT MOST 4 slices/);
  });

  test("#given call #then requires relevant-slice-only reads", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(block.toLowerCase()).toContain("relevant-slice-only");
  });

  test("#given call #then explains that missing slices mean first run", () => {
    // given
    const block = buildGlmWorkingMemory();

    // then
    expect(block.toLowerCase()).toContain("first run");
  });
});

describe("buildGlmVisionConstraint", () => {
  test("#given call #then contains GLM_VISION_CONSTRAINT tag", () => {
    // given
    const block = buildGlmVisionConstraint();

    // then
    expect(block).toContain(VISION_CONSTRAINT_OPEN);
    expect(block).toContain(VISION_CONSTRAINT_CLOSE);
  });

  test("#given call #then mentions text-only limitation", () => {
    // given
    const block = buildGlmVisionConstraint();

    // then
    expect(block).toContain("text-only");
    expect(block).toContain("multimodal-looker");
  });
});
*/ -import type { - AvailableAgent, - AvailableTool, - AvailableSkill, - AvailableCategory, -} from "../dynamic-agent-prompt-builder"; -import { - buildKeyTriggersSection, - buildToolSelectionTable, - buildExploreSection, - buildLibrarianSection, - buildDelegationTable, - buildCategorySkillsDelegationGuide, - buildOracleSection, - buildHardBlocksSection, - buildAntiPatternsSection, - buildParallelDelegationSection, - buildNonClaudePlannerSection, - buildAntiDuplicationSection, - categorizeTools, -} from "../dynamic-agent-prompt-builder"; -import { buildTaskManagementSection } from "./default"; - -export function buildGlmSisyphusPrompt( - model: string, - availableAgents: AvailableAgent[], - availableTools: AvailableTool[] = [], - availableSkills: AvailableSkill[] = [], - availableCategories: AvailableCategory[] = [], - useTaskSystem = false, -): string { - const keyTriggers = buildKeyTriggersSection(availableAgents, availableSkills); - const toolSelection = buildToolSelectionTable( - availableAgents, - availableTools, - availableSkills, - ); - const exploreSection = buildExploreSection(availableAgents); - const librarianSection = buildLibrarianSection(availableAgents); - const categorySkillsGuide = buildCategorySkillsDelegationGuide( - availableCategories, - availableSkills, - ); - const delegationTable = buildDelegationTable(availableAgents); - const oracleSection = buildOracleSection(availableAgents); - const hardBlocks = buildHardBlocksSection(); - const antiPatterns = buildAntiPatternsSection(); - const parallelDelegationSection = buildParallelDelegationSection( - model, - availableCategories, - ); - const nonClaudePlannerSection = buildNonClaudePlannerSection(model); - const taskManagementSection = buildTaskManagementSection(useTaskSystem); - const todoHookNote = useTaskSystem - ? 
"YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" - : "YOUR TODO CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TODO CONTINUATION])"; - - return ` -You are "Sisyphus" - Powerful AI Agent with orchestration capabilities from OhMyOpenCode. - -**Identity**: SF Bay Area engineer. Work, delegate, verify, ship. No AI slop. - -**Operating Mode**: You NEVER work alone when specialists are available. Frontend work → delegate. Deep research → parallel background agents. Complex architecture → consult Oracle. - -**Implementation Gate**: Follow user instructions. NEVER START IMPLEMENTING unless the user explicitly asks. ${todoHookNote} - if no implementation request, NEVER start work. - - - +/** + * Small Context Working Memory block for GLM. + * + * Prevents premature compaction by giving GLM a lightweight state + * convention it can read/write on demand instead of re-reading + * full plan files or scrolling old messages each turn. + */ +export function buildGlmWorkingMemory(): string { + return ` ## Working Memory via Small Context Slices GLM keeps a lightweight working memory under \`.sisyphus/state/{plan-or-session}/\` so continuity across turns does not require re-reading the full plan file or scrolling old messages. The directory key is the active plan name when one is present (\`.sisyphus/plans/{plan-name}.md\`), otherwise the current session label. @@ -105,283 +49,20 @@ GLM keeps a lightweight working memory under \`.sisyphus/state/{plan-or-session} - Append the new line(s) needed; do not rewrite the whole file. - Update \`goal.md\` when the goal or scope changes; \`decisions.md\` when you pick a routing or architectural option; \`files.md\` when the working set shifts; \`blockers.md\` when something blocks you; \`verification.md\` when you run lsp/tests/build. - Never create the \`.sisyphus/state\` directory speculatively. Only when a real state update is required. 
- - - - -## Phase 0 - Intent Gate (EVERY message) - -${keyTriggers} - - -### Step 0: Verbalize Intent (BEFORE Classification) - -Before classifying the task, identify what the user actually wants from you as an orchestrator. Map the surface form to the true intent, then announce your routing decision in one short line. - -**Intent → Routing Map:** - -| Surface Form | True Intent | Routing | -|---|---|---| -| "explain X", "how does Y work" | Research/understanding | explore/librarian → synthesize → answer | -| "implement X", "add Y", "create Z" | Implementation (explicit) | plan → delegate or execute | -| "look into X", "check Y", "investigate" | Investigation | explore → report findings | -| "what do you think about X?" | Evaluation | evaluate → propose → wait for confirmation | -| "I'm seeing error X" / "Y is broken" | Fix needed | diagnose → fix minimally | -| "refactor", "improve", "clean up" | Open-ended change | assess codebase first → propose approach | - -**Verbalize before proceeding:** - -> "I detect [research / implementation / investigation / evaluation / fix / open-ended] intent - [reason]. My approach: [plan]." - -This anchors routing. It does NOT commit you to implementation - only the user's explicit request does. - - -### Step 1: Classify Request Type - -- **Trivial** (single file, known location) → direct tools, unless a Key Trigger applies -- **Explicit** (specific file/line, clear command) → execute directly -- **Exploratory** ("How does X work?") → fire 1-3 explore agents in parallel + tools -- **Open-ended** ("Improve", "Refactor", "Add feature") → assess codebase first -- **Ambiguous** (unclear scope) → ask ONE clarifying question - -### Step 1.5: Turn-Local Intent Reset - -Reclassify intent from the CURRENT user message only. Never auto-carry implementation mode from prior turns. - -- Question/explanation/investigation → answer or analyze ONLY. No todos. No file edits. 
-- Still gathering context → confirm context first; do not start implementation yet. - -### Step 2: Check for Ambiguity - -- Single valid interpretation → proceed -- Multiple interpretations, similar effort → proceed with reasonable default, note assumption -- Multiple interpretations, 2x+ effort difference → MUST ask -- Missing critical info → MUST ask -- User's design seems flawed → MUST raise concern before implementing - -### Step 2.5: Context-Completion Gate (BEFORE Implementation) - -Implement only when ALL are true: -1. Current message contains an explicit implementation verb (implement/add/create/fix/change/write). -2. Scope is concrete enough to execute without guessing. -3. No blocking specialist result is pending (especially Oracle). - -If any condition fails, do research/clarification only, then wait. - -### Step 3: Validate Before Acting - -**Delegation Check (mandatory before acting directly):** -1. Is there a specialized agent that fits this request? -2. If not, which \`task\` category fits (visual-engineering, ultrabrain, quick, etc.)? Which skills should ride along via \`load_skills\`? -3. Self only when the task is demonstrably trivial and local AND no category/specialist fits. - -**Default Bias: DELEGATE. Work yourself only when it is super simple.** +`; +} +/** + * Vision constraint for GLM text-only models. + * + * GLM-5, GLM-5.1, GLM-5-turbo cannot render or analyze images. + * All visual tasks must be delegated to multimodal-looker. + */ +export function buildGlmVisionConstraint(): string { + return ` **Vision/Image Constraint (GLM text-only models):** - GLM-5, GLM-5.1, GLM-5-turbo are text-only models. They CANNOT render or analyze images, screenshots, PDFs, or visual content. - When a task involves viewing/analyzing images or visual content, ALWAYS delegate to the \`multimodal-looker\` agent. NEVER attempt to use \`look_at\`, \`read\`, or screenshot tools on image files yourself. 
- For browser visual testing (screenshot verification, UI diff), delegate to \`multimodal-looker\` or use \`visual-engineering\` category with \`playwright\` skill. - -### When to Challenge the User - -If you observe a design that will cause obvious problems, contradicts established codebase patterns, or misunderstands existing code: raise the concern concisely, propose an alternative, ask whether to proceed. - -\`\`\` -I notice [observation]. This might cause [problem] because [reason]. -Alternative: [your suggestion]. -Should I proceed with your original request, or try the alternative? -\`\`\` - ---- - -## Phase 1 - Codebase Assessment (for open-ended tasks) - -Before following existing patterns, assess whether they are worth following. - -### Quick Assessment: -1. Check linter/formatter/type configs. -2. Sample 2-3 similar files for consistency. -3. Note project age signals (dependencies, patterns). - -### State Classification: - -- **Disciplined** (consistent patterns, configs, tests) → follow existing style strictly -- **Transitional** (mixed patterns) → ask which pattern to follow -- **Legacy/Chaotic** (no consistency) → propose conventions, get confirmation -- **Greenfield** → modern best practices - -Different patterns may serve different purposes (intentional). Migration may be in progress. Verify before assuming. - ---- - -## Phase 2A - Exploration & Research - -${toolSelection} - -${exploreSection} - -${librarianSection} - -### Parallel Execution (DEFAULT behavior) - -**Parallelize EVERYTHING. Independent reads, searches, and agents run SIMULTANEOUSLY.** - - -- Parallelize independent tool calls: multiple file reads, grep searches, agent fires - all at once -- Explore/Librarian = background grep. 
ALWAYS \`run_in_background=true\`, ALWAYS parallel -- Fire 2-5 explore/librarian agents in parallel for any non-trivial codebase question -- Parallelize independent file reads -- After any write/edit tool call, briefly restate what changed, where, and what validation follows -- Prefer tools over internal knowledge whenever you need specific data (files, configs, patterns) - - -**Explore/Librarian = Grep, not consultants.** - -Each agent prompt should include: -- [CONTEXT]: what task, which modules, what approach -- [GOAL]: what decision the results unblock -- [DOWNSTREAM]: how you will use the results -- [REQUEST]: what to find, what format, what to skip - -### Background Result Collection: -1. Launch parallel agents → receive task_ids. -2. Continue only with non-overlapping work; otherwise END YOUR RESPONSE. -3. The system sends \`\` when tasks complete. -4. Collect via \`background_output(task_id="...")\` ONLY after the reminder. -5. Cancel disposable tasks individually via \`background_cancel(taskId="...")\`. - -${buildAntiDuplicationSection()} - -### Search Stop Conditions - -STOP searching when: enough context to proceed, info repeating across sources, 2 iterations with no new data, or direct answer found. **Time is precious.** - ---- - -## Phase 2B - Implementation - -### Pre-Implementation: -0. Find relevant skills and load them IMMEDIATELY via the \`skill\` tool. -1. 2+ steps → create todo list IMMEDIATELY, in detail. No announcements. -2. Mark current task \`in_progress\` before starting. -3. Mark \`completed\` as soon as done. Never batch. - -${categorySkillsGuide} - -${nonClaudePlannerSection} - -${parallelDelegationSection} - -${delegationTable} - -### Delegation Prompt Structure (ALL 6 sections required): - -\`\`\` -1. TASK: Atomic, specific goal -2. EXPECTED OUTCOME: Concrete deliverables with success criteria -3. REQUIRED TOOLS: Explicit tool whitelist -4. MUST DO: Exhaustive requirements - leave nothing implicit -5. 
MUST NOT DO: Forbidden actions -6. CONTEXT: File paths, existing patterns, constraints -\`\`\` - -After delegation: VERIFY against MUST DO/MUST NOT DO and existing patterns. Vague prompts → vague results. Be exhaustive. - -### Session Continuity (MANDATORY) - -Every \`task()\` returns a task_id. **USE IT** for follow-ups: -- Failed/incomplete → \`task_id="{id}", prompt="Fix: {specific error}"\` -- Follow-up question → \`task_id="{id}", prompt="Also: {question}"\` -- Multi-turn with same agent → \`task_id="{id}"\` - never start fresh - -This preserves full context, avoids repeated exploration, saves 70%+ tokens. - -### Code Changes: -- Match existing patterns in disciplined codebases. -- Propose approach first in chaotic codebases. -- Never suppress type errors with \`as any\`, \`@ts-ignore\`, \`@ts-expect-error\`. -- Never commit unless explicitly requested. -- **Bugfix Rule**: fix minimally. Never refactor while fixing. - -### Verification: - -Run \`lsp_diagnostics\` on changed files at the end of each logical task unit, before marking a todo complete, and before reporting completion. If the project has build/test commands, run them at task completion. - -### Evidence Requirements (task NOT complete without these): - -- **File edit** → \`lsp_diagnostics\` clean on changed files -- **Build command** → exit code 0 -- **Test run** → pass (or pre-existing failures explicitly noted) -- **Delegation** → result received and verified - -**NO EVIDENCE = NOT COMPLETE.** - ---- - -## Phase 2C - Failure Recovery - -1. Fix root causes, not symptoms. -2. Re-verify after every fix attempt. -3. Never shotgun debug. -4. After 3 consecutive failures: stop, revert to last known working state, document, consult Oracle, then ask the user if Oracle cannot resolve. - -Never leave code in a broken state. Never delete failing tests to "pass". 
- ---- - -## Phase 3 - Completion - -A task is complete when: -- All planned todos are done -- Diagnostics are clean on changed files -- Build passes (if applicable) -- The user's original request is fully addressed - -If verification fails: fix issues you caused. Do NOT fix pre-existing issues unless asked. Report: "Done. Note: N pre-existing errors unrelated to my changes." - -### Before Delivering Final Answer: -- Oracle running → end your response and wait for the completion notification first. -- Cancel disposable tasks individually via \`background_cancel(taskId="...")\`. - - -${oracleSection} - -${taskManagementSection} - - -## Communication Style - -### Be Concise -- Start work immediately. No acknowledgments. -- Answer directly without preamble. -- Don't summarize what you did unless asked. -- Don't explain code unless asked. - -### No Flattery -Never start responses with praise of the user's input. - -### No Status Updates -Skip casual acknowledgments. Use todos for tracking. - -### When User is Wrong -State your concern and the alternative concisely. Ask if they want to proceed anyway. - -### Match User's Style -Terse user → terse you. Detail wanted → detail given. 
- - - -${hardBlocks} - -${antiPatterns} - -## Soft Guidelines - -- Prefer existing libraries over new dependencies -- Prefer small, focused changes over large refactors -- When uncertain about scope, ask - -`; +`; } - -export { categorizeTools }; diff --git a/src/agents/types.test.ts b/src/agents/types.test.ts index 7e528f78d72..1d23054a724 100644 --- a/src/agents/types.test.ts +++ b/src/agents/types.test.ts @@ -4,6 +4,8 @@ import { isGeminiModel, isGlmModel, isGlmSisyphusHarnessModel, + isGlmThinkingModel, + isGlmVisionModel, isGptNativeSisyphusModel, isMiniMaxModel, } from "./types"; @@ -142,6 +144,50 @@ describe("isGlmModel", () => { }); }); +describe("isGlmVisionModel", () => { + test("#given GLM VLM variants #then returns true", () => { + expect(isGlmVisionModel("opencode/glm-4.6v")).toBe(true); + expect(isGlmVisionModel("opencode/glm-5v")).toBe(true); + expect(isGlmVisionModel("opencode/glm-5v-turbo")).toBe(true); + expect(isGlmVisionModel("z-ai/glm-5v-turbo")).toBe(true); + expect(isGlmVisionModel("opencode-go/glm5v-turbo")).toBe(true); + }); + + test("#given GLM text models #then returns false", () => { + expect(isGlmVisionModel("opencode/glm-5")).toBe(false); + expect(isGlmVisionModel("z-ai/glm-5.1")).toBe(false); + expect(isGlmVisionModel("opencode/glm-5-turbo")).toBe(false); + expect(isGlmVisionModel("opencode-go/glm5-turbo")).toBe(false); + }); + + test("#given non-GLM models #then returns false", () => { + expect(isGlmVisionModel("openai/gpt-5.4")).toBe(false); + expect(isGlmVisionModel("anthropic/claude-opus-4-7")).toBe(false); + }); +}); + +describe("isGlmThinkingModel", () => { + test("#given GLM-5+ text models #then returns true", () => { + expect(isGlmThinkingModel("opencode/glm-5")).toBe(true); + expect(isGlmThinkingModel("z-ai/glm-5.1")).toBe(true); + expect(isGlmThinkingModel("opencode/glm-5-turbo")).toBe(true); + expect(isGlmThinkingModel("opencode-go/glm5-turbo")).toBe(true); + expect(isGlmThinkingModel("zai-coding-plan/glm-5")).toBe(true); + 
}); + + test("#given GLM VLM models #then returns false", () => { + expect(isGlmThinkingModel("opencode/glm-4.6v")).toBe(false); + expect(isGlmThinkingModel("opencode/glm-5v")).toBe(false); + expect(isGlmThinkingModel("opencode/glm-5v-turbo")).toBe(false); + }); + + test("#given non-GLM models #then returns false", () => { + expect(isGlmThinkingModel("openai/gpt-5.4")).toBe(false); + expect(isGlmThinkingModel("anthropic/claude-opus-4-7")).toBe(false); + expect(isGlmThinkingModel("google/gemini-3.1-pro")).toBe(false); + }); +}); + describe("isGlmSisyphusHarnessModel", () => { test("#given exact GLM Sisyphus harness families #then returns true", () => { expect(isGlmSisyphusHarnessModel("z-ai/glm-5")).toBe(true); diff --git a/src/agents/types.ts b/src/agents/types.ts index f1db0505843..6df6bea4d9d 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -130,6 +130,21 @@ export function isGlmModel(model: string): boolean { return modelName.includes("glm"); } +/** Matches GLM VLM variants (e.g., glm-4.6v, glm-5v, glm-5v-turbo). */ +const GLM_VISION_MODEL_RE = /glm[\d.-]+v/ +export function isGlmVisionModel(model: string): boolean { + const modelName = extractModelName(model).toLowerCase(); + return modelName.includes("glm") && GLM_VISION_MODEL_RE.test(modelName); +} + +/** Matches GLM-5+ text-only models that support extended thinking. + * Excludes VLM variants (glm-5v-turbo, glm-4.6v) which may not support thinking. 
+ */ +export function isGlmThinkingModel(model: string): boolean { + const modelName = extractModelName(model).toLowerCase(); + return isGlmModel(model) && !isGlmVisionModel(model) && /^glm[-]?5/.test(modelName); +} + const GLM_SISYPHUS_HARNESS_RE = /^(?:glm-5|glm-5[.-]1(?::thinking)?|glm5[.-]1(?::thinking)?|glm-5-turbo|glm5-turbo|glm-5v-turbo|glm5v-turbo)$/; From b30a1b5155398c90c9ec540b9e8c4f5bfde7b73d Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 30 Apr 2026 23:05:33 +0900 Subject: [PATCH 03/25] style(agents): remove redundant JSDoc and inline comments --- src/agents/momus.ts | 26 ------------------- src/agents/oracle.ts | 14 ---------- src/agents/sisyphus-junior/agent.ts | 19 -------------- src/agents/sisyphus.ts | 10 -------- src/agents/sisyphus/glm.ts | 30 ---------------------- src/agents/types.ts | 40 ----------------------------- 6 files changed, 139 deletions(-) diff --git a/src/agents/momus.ts b/src/agents/momus.ts index bf89b2dfb55..db13154dc8e 100644 --- a/src/agents/momus.ts +++ b/src/agents/momus.ts @@ -5,23 +5,6 @@ import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; -/** - * Momus - Plan Reviewer Agent - * - * Named after Momus, the Greek god of satire and mockery, who was known for - * finding fault in everything - even the works of the gods themselves. - * He criticized Aphrodite (found her sandals squeaky), Hephaestus (said man - * should have windows in his chest to see thoughts), and Athena (her house - * should be on wheels to move from bad neighbors). - * - * This agent reviews work plans with the same ruthless critical eye, - * catching every gap, ambiguity, and missing context that would block - * implementation. - */ - -/** - * Default Momus prompt - used for Claude and other non-GPT models. - */ const MOMUS_DEFAULT_PROMPT = `You are a **practical** work plan reviewer. Your goal is simple: verify that the plan is **executable** and **references are valid**. 
**CRITICAL FIRST RULE**: @@ -198,15 +181,6 @@ If REJECT: **Response Language**: Match the language of the plan content. `; -/** - * GPT-5.4 Optimized Momus System Prompt - * - * Tuned for GPT-5.4 system prompt design principles: - * - XML-tagged instruction blocks for clear structure - * - Prose-first output, explicit opener blacklist - * - Blocker-finder philosophy preserved - * - Deterministic decision criteria - */ const MOMUS_GPT_PROMPT = ` You are a practical work plan reviewer. You verify that plans are executable and references are valid. You are a blocker-finder, not a perfectionist. diff --git a/src/agents/oracle.ts b/src/agents/oracle.ts index 5452c78e18e..b99406b55aa 100644 --- a/src/agents/oracle.ts +++ b/src/agents/oracle.ts @@ -37,10 +37,6 @@ export const ORACLE_PROMPT_METADATA: AgentPromptMetadata = { ], }; -/** - * Default Oracle prompt - used for Claude and other non-GPT models. - * XML-tagged structure with extended thinking support. - */ const ORACLE_DEFAULT_PROMPT = `You are a strategic technical advisor with deep reasoning capabilities, operating as a specialized consultant within an AI-assisted development environment. @@ -152,16 +148,6 @@ Before finalizing answers on architecture, security, or performance: Your response goes directly to the user with no intermediate processing. Make your final message self-contained: a clear recommendation they can act on immediately, covering both what to do and why. `; -/** - * GPT-5.4 Optimized Oracle System Prompt - * - * Tuned for GPT-5.4 system prompt design principles: - * - Expert advisor framing with approach-first mentality - * - Prose-first output (favor conciseness, avoid bullet defaults) - * - Explicit opener blacklist - * - Deterministic decision criteria - * - XML-tagged structure for clear instruction parsing - */ const ORACLE_GPT_PROMPT = `You are a strategic technical advisor operating as an expert consultant within an AI-assisted development environment. 
You approach each consultation by first understanding the full technical landscape, then reasoning through the trade-offs before recommending a path. diff --git a/src/agents/sisyphus-junior/agent.ts b/src/agents/sisyphus-junior/agent.ts index 4474f163d25..e79b64a82b5 100644 --- a/src/agents/sisyphus-junior/agent.ts +++ b/src/agents/sisyphus-junior/agent.ts @@ -1,17 +1,3 @@ -/** - * Sisyphus-Junior - Focused Task Executor - * - * Executes delegated tasks directly without spawning other agents. - * Category-spawned executor with domain-specific configurations. - * - * Routing: - * 1. Kimi models -> kimi-k2-6.ts (Kimi-optimized) - * 2. GLM harness models -> glm.ts (GLM-optimized) - * 3. GPT models (openai/*, github-copilot/gpt-*) -> gpt.ts (GPT-5.4 optimized) - * 4. Gemini models (google/*, google-vertex/*) -> gemini.ts (Gemini-optimized) - * 5. Default (Claude, etc.) -> default.ts (Claude-optimized) - */ - import type { AgentConfig } from "@opencode-ai/sdk" import type { AgentMode } from "../types" import { @@ -39,8 +25,6 @@ import { buildGeminiSisyphusJuniorPrompt } from "./gemini" const MODE: AgentMode = "subagent" -// Core tools that Sisyphus-Junior must NEVER have access to -// Note: call_omo_agent is ALLOWED so subagents can spawn explore/librarian const BLOCKED_TOOLS = ["task"] const GPT_BLOCKED_TOOLS = ["task", "apply_patch"] @@ -75,9 +59,6 @@ export function getSisyphusJuniorPromptSource(model?: string): SisyphusJuniorPro return "default" } -/** - * Builds the appropriate Sisyphus-Junior prompt based on model. - */ export function buildSisyphusJuniorPrompt( model: string | undefined, useTaskSystem: boolean, diff --git a/src/agents/sisyphus.ts b/src/agents/sisyphus.ts index 2e2cf1e0522..de9bb3d447a 100644 --- a/src/agents/sisyphus.ts +++ b/src/agents/sisyphus.ts @@ -612,21 +612,16 @@ export function createSisyphusAgent( ); if (isGeminiModel(model)) { - // 1. 
Intent gate + tool mandate - early in prompt (after intent verbalization) prompt = prompt.replace( "", `\n\n${buildGeminiIntentGateEnforcement()}\n\n${buildGeminiToolMandate()}` ); - // 2. Tool guide + examples - after tool_usage_rules (where tools are discussed) prompt = prompt.replace( "", `\n\n${buildGeminiToolGuide()}\n\n${buildGeminiToolCallExamples()}` ); - // 3. Delegation + verification overrides - before Constraints (NOT at prompt end) - // Gemini suffers from lost-in-the-middle: content at prompt end gets weaker attention. - // Placing these before ensures they're in a high-attention zone. prompt = prompt.replace( "", `${buildGeminiDelegationOverride()}\n\n${buildGeminiVerificationOverride()}\n\n` @@ -634,14 +629,11 @@ export function createSisyphusAgent( } if (isGlmSisyphusHarnessModel(model)) { - // 1. Working Memory - after Role to provide lightweight state convention - // prevents premature compaction by giving GLM on-demand context slices prompt = prompt.replace( "", `\n\n${buildGlmWorkingMemory()}` ); - // 2. Vision constraint - in Phase 0 Step 3 (delegation check area) prompt = prompt.replace( "**Default Bias: DELEGATE. WORK YOURSELF ONLY WHEN IT IS SUPER SIMPLE.**", `**Default Bias: DELEGATE. WORK YOURSELF ONLY WHEN IT IS SUPER SIMPLE.**\n\n${buildGlmVisionConstraint()}` @@ -670,8 +662,6 @@ export function createSisyphusAgent( } if (isGlmSisyphusHarnessModel(model)) { - // GLM-5.x supports thinking: { type: "enabled" } natively (Z.AI docs). - // GLM does not support budgetTokens. Set explicitly for cross-provider consistency. return { ...base, thinking: { type: "enabled" } }; } diff --git a/src/agents/sisyphus/glm.ts b/src/agents/sisyphus/glm.ts index 37940347586..47f2671f467 100644 --- a/src/agents/sisyphus/glm.ts +++ b/src/agents/sisyphus/glm.ts @@ -1,27 +1,3 @@ -/** - * GLM-specific overlay sections for Sisyphus prompt. 
- * - * GLM harness models (GLM-5, GLM-5.1, GLM-5-turbo) are text-only and - * suffer from premature context compaction during long sessions. - * - * These overlays inject corrective sections at strategic points - * in the dynamic Sisyphus prompt to counter these tendencies: - * - * 1. Working Memory block — lightweight `.sisyphus/state/` slices that - * preserve continuity across turns without re-reading full plans. - * 2. Vision constraint — GLM text-only models cannot handle images/PDFs. - * - * Follows the same overlay pattern as `gemini.ts`: small functions - * injected via string replacement in `sisyphus.ts`. - */ - -/** - * Small Context Working Memory block for GLM. - * - * Prevents premature compaction by giving GLM a lightweight state - * convention it can read/write on demand instead of re-reading - * full plan files or scrolling old messages each turn. - */ export function buildGlmWorkingMemory(): string { return ` ## Working Memory via Small Context Slices @@ -52,12 +28,6 @@ GLM keeps a lightweight working memory under \`.sisyphus/state/{plan-or-session} `; } -/** - * Vision constraint for GLM text-only models. - * - * GLM-5, GLM-5.1, GLM-5-turbo cannot render or analyze images. - * All visual tasks must be delegated to multimodal-looker. - */ export function buildGlmVisionConstraint(): string { return ` **Vision/Image Constraint (GLM text-only models):** diff --git a/src/agents/types.ts b/src/agents/types.ts index 6df6bea4d9d..c1c0e775e46 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -1,72 +1,32 @@ import type { AgentConfig } from "@opencode-ai/sdk"; -/** - * Agent mode determines UI model selection behavior: - * - "primary": Respects user's UI-selected model (sisyphus, atlas) - * - "subagent": Uses own fallback chain, ignores UI selection (oracle, explore, etc.) 
- * - "all": Available in both contexts (OpenCode compatibility) - */ export type AgentMode = "primary" | "subagent" | "all"; -/** - * Agent factory function with static mode property. - * Mode is exposed as static property for pre-instantiation access. - */ export type AgentFactory = ((model: string) => AgentConfig) & { mode: AgentMode; }; -/** - * Agent category for grouping in Sisyphus prompt sections - */ export type AgentCategory = | "exploration" | "specialist" | "advisor" | "utility"; -/** - * Cost classification for Tool Selection table - */ export type AgentCost = "FREE" | "CHEAP" | "EXPENSIVE"; -/** - * Delegation trigger for Sisyphus prompt's Delegation Table - */ export interface DelegationTrigger { - /** Domain of work (e.g., "Frontend UI/UX") */ domain: string; - /** When to delegate (e.g., "Visual changes only...") */ trigger: string; } -/** - * Metadata for generating Sisyphus prompt sections dynamically - * This allows adding/removing agents without manually updating the Sisyphus prompt - */ export interface AgentPromptMetadata { - /** Category for grouping in prompt sections */ category: AgentCategory; - - /** Cost classification for Tool Selection table */ cost: AgentCost; - - /** Domain triggers for Delegation Table */ triggers: DelegationTrigger[]; - - /** When to use this agent (for detailed sections) */ useWhen?: string[]; - - /** When NOT to use this agent */ avoidWhen?: string[]; - - /** Optional dedicated prompt section (markdown) - for agents like Oracle that have special sections */ dedicatedSection?: string; - - /** Nickname/alias used in prompt (e.g., "Oracle" instead of "oracle") */ promptAlias?: string; - - /** Key triggers that should appear in Phase 0 (e.g., "External library mentioned → fire librarian") */ keyTrigger?: string; } From f68ff5ef85ccc72157d7f4ee10b5f24c3d8b509c Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 30 Apr 2026 23:05:42 +0900 Subject: [PATCH 04/25] style(tests): remove redundant test comments --- 
src/agents/sisyphus-junior/index.test.ts | 4 ---- src/agents/sisyphus/glm.test.ts | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/agents/sisyphus-junior/index.test.ts b/src/agents/sisyphus-junior/index.test.ts index fed83ca1870..4b87aa26e5a 100644 --- a/src/agents/sisyphus-junior/index.test.ts +++ b/src/agents/sisyphus-junior/index.test.ts @@ -113,7 +113,6 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { // when const result = createSisyphusJuniorAgentWithOverrides(override) - // then - defaults should be used, not the overrides expect(result.model).toBe(SISYPHUS_JUNIOR_DEFAULTS.model) expect(result.temperature).toBe(SISYPHUS_JUNIOR_DEFAULTS.temperature) }) @@ -214,13 +213,11 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { const permission = result.permission as Record | undefined if (tools) { expect(tools.task).toBe(false) - // call_omo_agent is NOW ALLOWED for subagents to spawn explore/librarian expect(tools.call_omo_agent).toBe(true) expect(tools.read).toBe(true) } if (permission) { expect(permission.task).toBe("deny") - // call_omo_agent is NOW ALLOWED for subagents to spawn explore/librarian expect(permission.call_omo_agent).toBe("allow") } }) @@ -238,7 +235,6 @@ describe("createSisyphusJuniorAgentWithOverrides", () => { // when const result = createSisyphusJuniorAgentWithOverrides(override as Parameters[0]) - // then - task blocked, but call_omo_agent allowed for explore/librarian spawning const tools = result.tools as Record | undefined const permission = result.permission as Record | undefined if (tools) { diff --git a/src/agents/sisyphus/glm.test.ts b/src/agents/sisyphus/glm.test.ts index 528df97f510..c6cd0337d0d 100644 --- a/src/agents/sisyphus/glm.test.ts +++ b/src/agents/sisyphus/glm.test.ts @@ -5,10 +5,8 @@ import { buildGlmWorkingMemory, buildGlmVisionConstraint } from "./glm"; describe("buildGlmWorkingMemory", () => { test("#given call #then contains exactly one Small_Context_Working_Memory block", () => { - // 
given const block = buildGlmWorkingMemory(); - // then const openCount = block.split("").length - 1; const closeCount = block.split("").length - 1; @@ -71,10 +69,8 @@ describe("buildGlmWorkingMemory", () => { describe("buildGlmVisionConstraint", () => { test("#given call #then contains GLM_VISION_CONSTRAINT tag", () => { - // given const block = buildGlmVisionConstraint(); - // then expect(block).toContain(""); expect(block).toContain(""); }); From 7f8cb96e31c79caddb1d5ab24d91212590046c4b Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 30 Apr 2026 23:05:48 +0900 Subject: [PATCH 05/25] style(tests): remove section dividers and redundant casts in benchmark --- src/agents/glm-thinking-benchmark.test.ts | 24 ++++------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/src/agents/glm-thinking-benchmark.test.ts b/src/agents/glm-thinking-benchmark.test.ts index f323b83e51e..7fa0c6c22a4 100644 --- a/src/agents/glm-thinking-benchmark.test.ts +++ b/src/agents/glm-thinking-benchmark.test.ts @@ -17,8 +17,6 @@ import { createMomusAgent } from "./momus" import { createSisyphusAgent } from "./sisyphus" import { createSisyphusJuniorAgentWithOverrides as createSisyphusJuniorAgent } from "./sisyphus-junior/agent" -// --- Model IDs to test (with provider prefixes as they appear at runtime) --- - const GLM_TEXT_MODELS = [ "z-ai/glm-5", "opencode/glm-5", @@ -46,20 +44,16 @@ const GPT_MODELS = [ "openai/gpt-5.5", ] as const -// --- Helper --- - function hasBudgetTokens(thinking: unknown): boolean { return typeof thinking === "object" && thinking !== null && "budgetTokens" in (thinking as Record) } -// === Benchmark: Sisyphus === - describe("GLM Thinking Benchmark: Sisyphus", () => { for (const model of GLM_TEXT_MODELS) { test(`#given ${model} #then thinking enabled without budgetTokens`, () => { - const config = createSisyphusAgent(model as string) + const config = createSisyphusAgent(model) expect(config.thinking).toEqual({ type: "enabled" }) 
expect(hasBudgetTokens(config.thinking)).toBe(false) }) @@ -67,23 +61,21 @@ describe("GLM Thinking Benchmark: Sisyphus", () => { for (const model of CLAUDE_MODELS) { test(`#given ${model} #then thinking enabled with budgetTokens`, () => { - const config = createSisyphusAgent(model as string) + const config = createSisyphusAgent(model) expect(config.thinking).toBeDefined() - expect((config.thinking as Record)?.type).toBe("enabled") + expect((config.thinking as Record).type).toBe("enabled") expect(hasBudgetTokens(config.thinking)).toBe(true) }) } for (const model of GPT_MODELS) { test(`#given ${model} #then uses reasoningEffort not thinking`, () => { - const config = createSisyphusAgent(model as string) + const config = createSisyphusAgent(model) expect(config.reasoningEffort).toBeDefined() }) } }) -// === Benchmark: Sisyphus-Junior === - describe("GLM Thinking Benchmark: Sisyphus-Junior", () => { for (const model of GLM_TEXT_MODELS) { test(`#given ${model} #then thinking enabled without budgetTokens`, () => { @@ -109,8 +101,6 @@ describe("GLM Thinking Benchmark: Sisyphus-Junior", () => { } }) -// === Benchmark: Oracle === - describe("GLM Thinking Benchmark: Oracle", () => { for (const model of GLM_TEXT_MODELS) { test(`#given ${model} #then thinking enabled without budgetTokens`, () => { @@ -145,8 +135,6 @@ describe("GLM Thinking Benchmark: Oracle", () => { } }) -// === Benchmark: Metis === - describe("GLM Thinking Benchmark: Metis", () => { for (const model of GLM_TEXT_MODELS) { test(`#given ${model} #then thinking enabled without budgetTokens`, () => { @@ -173,8 +161,6 @@ describe("GLM Thinking Benchmark: Metis", () => { } }) -// === Benchmark: Momus === - describe("GLM Thinking Benchmark: Momus", () => { for (const model of GLM_TEXT_MODELS) { test(`#given ${model} #then thinking enabled without budgetTokens`, () => { @@ -208,8 +194,6 @@ describe("GLM Thinking Benchmark: Momus", () => { } }) -// === Summary: No GLM text model should ever receive budgetTokens === - 
describe("GLM Thinking Benchmark: Cross-agent budgetTokens guard", () => { const agentFactories = [ { name: "Sisyphus", fn: (m: string) => createSisyphusAgent(m) }, From ab49716a9f78ccd54f4ddfc78b8f0ff343201b26 Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 30 Apr 2026 23:05:56 +0900 Subject: [PATCH 06/25] fix(benchmark): correct factoryTestResults property name and remove stale comments --- scripts/benchmark-glm-thinking.ts | 32 ++----------------------------- 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/scripts/benchmark-glm-thinking.ts b/scripts/benchmark-glm-thinking.ts index c56ecc8d35f..4688a7e27cc 100644 --- a/scripts/benchmark-glm-thinking.ts +++ b/scripts/benchmark-glm-thinking.ts @@ -1,16 +1,4 @@ #!/usr/bin/env bun -/** - * GLM Thinking Runtime Benchmark - * - * Measures GLM-5.1 API performance with thinking ON vs OFF. - * Run via: bun run scripts/benchmark-glm-thinking.ts [--model z-ai/glm-5.1] [--iterations 3] - * - * Prerequisites: OpenCode must be configured with a GLM provider. - * This script uses the agent factory functions to generate configs, - * then calls the model directly through the configured provider. - * - * Output: JSON with timing metrics to stdout, human-readable summary to stderr. - */ const DEFAULT_MODEL = "z-ai/glm-5.1" const DEFAULT_ITERATIONS = 3 @@ -107,23 +95,12 @@ async function runFactoryBenchmark(): Promise<{ totalTests: number; passed: numb return { passed: parseInt(match[1], 10), failed: parseInt(match[2], 10), totalTests: parseInt(match[1], 10) + parseInt(match[2], 10) } } return { totalTests: 0, passed: 0, failed: 0 } - } catch (error) { + } catch { return { totalTests: 0, passed: 0, failed: -1 } } } async function callModelDirect(_model: string, _prompt: string, _thinking: boolean): Promise { - // NOTE: Direct API calls require provider credentials configured in OpenCode. - // This is a placeholder that measures agent factory overhead only. 
- // - // For actual runtime benchmarks, run through OpenCode: - // opencode --model z-ai/glm-5.1 --prompt "your task here" - // - // Or use curl with Z.AI API: - // curl -X POST https://open.bigmodel.cn/api/paas/v4/chat/completions \ - // -H "Authorization: Bearer $ZAI_API_KEY" \ - // -d '{"model": "glm-5.1", "messages": [...], "thinking": {"type": "enabled"}}' - return { model: _model, thinkingEnabled: _thinking, @@ -146,12 +123,10 @@ async function main() { console.error(`Branch: ${git.branch} (${git.commit})`) console.error() - // Phase 1: Factory-level benchmark (no API calls needed) console.error("Phase 1: Factory config correctness benchmark...") const factoryResults = await runFactoryBenchmark() console.error(` Factory tests: ${factoryResults.passed}/${factoryResults.totalTests} passed`) - // Phase 2: Runtime benchmark placeholder console.error("\nPhase 2: Runtime benchmark (requires OpenCode runtime)...") const thinkingOnResults: BenchmarkResult[] = [] const thinkingOffResults: BenchmarkResult[] = [] @@ -166,7 +141,6 @@ async function main() { thinkingOffResults.push(offResult) } - // Summary const summary: BenchmarkSummary = { model, timestamp: new Date().toISOString(), @@ -183,13 +157,11 @@ async function main() { avgTTFTMs: null, results: thinkingOffResults, }, - factoryResults, + factoryTestResults: factoryResults, } - // Output JSON to stdout console.log(JSON.stringify(summary, null, 2)) - // Human-readable summary to stderr console.error("\n=== Summary ===") console.error(`Factory benchmark: ${factoryResults.passed}/${factoryResults.totalTests} tests passed`) console.error(` - GLM-5+ text models: thinking enabled, NO budgetTokens`) From 46cf91eacd18389e3b7379ba9bdc4d47385f9c7b Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Fri, 1 May 2026 09:46:39 +0900 Subject: [PATCH 07/25] feat(sisyphus): add GLM-5.x dedicated prompt builder and SJ speed overlay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New 
src/agents/sisyphus/glm-prompt.ts: 8-block GLM-specific Sisyphus prompt (DISPATCH→DELEGATE→COLLECT→SYNTHESIZE→DONE execution loop replacing EXPLORE→PLAN→ROUTE→EXECUTE→VERIFY→RETRY→DONE) - New src/agents/glm-prompt-quality.test.ts: 32 quality benchmarks across Instruction Compliance (10), Speed (10), Accuracy (9), Cross-Agent (3) - Extended src/agents/sisyphus-junior/glm.ts: SJ speed overlay with execution-first mindset, brief thinking, re-entry rule, exploration budget (2-iteration cap), tiered verification V1/V2/V3, token economy - Modified src/agents/sisyphus.ts: GLM routing from overlay string.replace to dedicated buildGlmSisyphusPrompt() builder (matches Kimi K2.x pattern) GLM-5.x does not support budgetTokens. Excessive thinking was controlled via prompt engineering: concise thinking mandate, re-entry rule (suppress re-verbalization for resolved turns), exploration budget hard stops, and tiered verification (V1/V2/V3) to avoid over-verification on trivial changes. Hephaestus delegation strategy included: sequential edits >= 3 automatically routed to Hephaestus (deep-thinking worker) to keep Sisyphus unblocked. All 140 GLM-related tests pass. Typecheck clean. AI slop removed. --- src/agents/glm-prompt-quality.test.ts | 268 +++++++++++++++++++ src/agents/sisyphus-junior/glm.ts | 36 +++ src/agents/sisyphus.ts | 17 +- src/agents/sisyphus/glm-prompt.ts | 358 ++++++++++++++++++++++++++ 4 files changed, 670 insertions(+), 9 deletions(-) create mode 100644 src/agents/glm-prompt-quality.test.ts create mode 100644 src/agents/sisyphus/glm-prompt.ts diff --git a/src/agents/glm-prompt-quality.test.ts b/src/agents/glm-prompt-quality.test.ts new file mode 100644 index 00000000000..addd2e20373 --- /dev/null +++ b/src/agents/glm-prompt-quality.test.ts @@ -0,0 +1,268 @@ +/** + * GLM Sisyphus Prompt Quality Benchmark + * + * Measures prompt quality across 3 dimensions: + * 1. Instruction Compliance (작업 지시 이행능력) + * 2. 
Speed (속도) — delegation-first, concise thinking, parallel dispatch + * 3. Accuracy (정확도) — verification tiers, error recovery, evidence requirements + * + * Compares the GLM-specific prompt builder output against minimum quality thresholds. + */ +import { describe, test, expect } from "bun:test" +import { createSisyphusAgent } from "./sisyphus" +import { createSisyphusJuniorAgentWithOverrides as createSJ } from "./sisyphus-junior/agent" + +const GLM_MODEL = "zai/glm-5.1" +const GLM_MODELS = [ + "zai/glm-5", + "zai/glm-5.1", + "zai/glm-5-turbo", + "opencode-go/glm5-turbo", +] as const + +function getGlmPrompt(model: string): string { + return createSisyphusAgent(model).prompt ?? "" +} + +function getSjPrompt(model: string): string { + return createSJ({ model }).prompt ?? "" +} + +function estimateTokenCount(text: string): number { + return Math.ceil(text.length / 3.5) +} + +describe("GLM Prompt Quality: Instruction Compliance", () => { + test("#given GLM Sisyphus prompt #then contains DISPATCH→DELEGATE→COLLECT→SYNTHESIZE execution loop", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("DISPATCH") + expect(prompt).toContain("DELEGATE") + expect(prompt).toContain("COLLECT") + expect(prompt).toContain("SYNTHESIZE") + expect(prompt).toContain("DONE") + }) + + test("#given GLM Sisyphus prompt #then mandates delegation before self-implementation", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/delegate.*before|dispatch.*first|delegation.*default/i) + expect(prompt).toMatch(/self.?implement.*only.*trivial|trivially simple/i) + }) + + test("#given GLM Sisyphus prompt #then includes re-entry rule to avoid re-verbalization", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("re_entry_rule") + expect(prompt).toMatch(/confirmation turn.*do not.*preamble|skip the preamble/i) + expect(prompt).toMatch(/already.*context.*return it|already.*context.*do not.*search/i) + }) + + test("#given GLM 
Sisyphus prompt #then includes working memory slice system", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("Small_Context_Working_Memory") + expect(prompt).toContain(".sisyphus/state/") + expect(prompt).toContain("goal.md") + expect(prompt).toContain("decisions.md") + expect(prompt).toContain("verification.md") + }) + + test("#given GLM Sisyphus prompt #then includes vision constraint for text-only models", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/GLM.*text.?only|text.?only.*models/i) + expect(prompt).toMatch(/multimodal.?looker|delegate.*visual/i) + }) + + test("#given GLM Sisyphus prompt #then includes tiered verification system", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("verification_tiers") + expect(prompt).toMatch(/V1.*trivial/i) + expect(prompt).toMatch(/V2.*moderate/i) + expect(prompt).toMatch(/V3.*full.*rigor/i) + }) + + test("#given GLM Sisyphus prompt #then includes Hephaestus delegation for heavy work", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Hephaestus/i) + expect(prompt).toMatch(/deep.?thinking.*worker|autonomous.*worker/i) + expect(prompt).toMatch(/more than 3 sequential self-edits.*delegate|delegate.*Hephaestus instead/i) + }) + + for (const model of GLM_MODELS) { + test(`#given ${model} #then GLM-specific prompt used (not default overlay)`, () => { + const prompt = getGlmPrompt(model) + + expect(prompt).toContain("DISPATCH") + expect(prompt).not.toContain("Phase 2B - Implementation") + }) + } +}) + +describe("GLM Prompt Quality: Speed", () => { + test("#given GLM Sisyphus prompt #then contains concise thinking mandate", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/think briefly|concise.*thinking|thinking.*concise/i) + expect(prompt).toMatch(/delegate before deep.?div/i) + }) + + test("#given GLM Sisyphus prompt #then contains exploration budget with hard stops", () => { + const 
prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("exploration_budget") + expect(prompt).toMatch(/hard stop/i) + expect(prompt).toMatch(/two.*parallel wave|at most two/i) + }) + + test("#given GLM Sisyphus prompt #then contains parallel dispatch mandate", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("parallel_dispatch") + expect(prompt).toMatch(/Fire ALL.*background.*FIRST/i) + expect(prompt).toMatch(/One wave.*sequential/i) + }) + + test("#given GLM Sisyphus prompt #then contains token economy for unconstrained reasoning", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toContain("token_economy") + expect(prompt).toMatch(/budgetTokens.*unsupported|unconstrained/i) + expect(prompt).toMatch(/restraint.*behavior|prompt.*level.*restraint/i) + }) + + test("#given GLM Sisyphus prompt #then intent classification is one-line, not full analysis", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/one.?line intent|classify intent in one line/i) + expect(prompt).toContain("Intent routes:") + }) + + test("#given GLM SJ prompt #then contains brief thinking mandate", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Think concisely|Brief Thinking/i) + expect(prompt).toMatch(/Execute immediately|No deliberation/i) + }) + + test("#given GLM SJ prompt #then exploration is capped at 2 iterations", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/2 search iteration|capped at 2/i) + }) + + test("#given GLM SJ prompt #then contains tiered verification for speed", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/V1.*trivial|V1.*lsp_diagnostics/i) + expect(prompt).toMatch(/V2.*moderate/i) + expect(prompt).toMatch(/V3.*broad/i) + expect(prompt).toMatch(/Stop after the first successful verification/i) + }) + + test("#given GLM prompt #then prompt length is within reasonable bounds", () => { + const prompt = 
getGlmPrompt(GLM_MODEL) + const tokens = estimateTokenCount(prompt) + + expect(tokens).toBeGreaterThan(2000) + expect(tokens).toBeLessThan(15000) + }) +}) + +describe("GLM Prompt Quality: Accuracy", () => { + test("#given GLM Sisyphus prompt #then requires evidence-based verification", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/verification.*mandatory|mandatory.*verification/i) + expect(prompt).toMatch(/Diagnostics clean.*only after tool output|verification evidence.*concrete/i) + }) + + test("#given GLM Sisyphus prompt #then requires reading subagent output before trusting", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Read enough.*verify|do not trust subagent summaries blindly/i) + }) + + test("#given GLM Sisyphus prompt #then includes failure recovery protocol", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/failure recovery|Fix root causes/i) + expect(prompt).toMatch(/One retry.*V1|up to two.*V2/i) + expect(prompt).toMatch(/consult Oracle/i) + }) + + test("#given GLM Sisyphus prompt #then scope discipline prevents over-implementation", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/Self-implement only trivial|trivial local work/i) + }) + + test("#given GLM Sisyphus prompt #then includes hard blocks and anti-patterns", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/as any|@ts-ignore|@ts-expect-error/i) + expect(prompt).toMatch(/Never leave.*broken/i) + }) + + test("#given GLM Sisyphus prompt #then asks clarification only when materially necessary", () => { + const prompt = getGlmPrompt(GLM_MODEL) + + expect(prompt).toMatch(/ask.*only when missing.*materially|materially change.*outcome/i) + }) + + test("#given GLM SJ prompt #then contains scope discipline", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/EXACTLY.*ONLY.*delegated|No extra features.*no scope creep/i) + }) + + 
test("#given GLM SJ prompt #then contains re-entry rule for context reuse", () => { + const prompt = getSjPrompt(GLM_MODEL) + + expect(prompt).toMatch(/re.?entry.*rule|Re-entry Rule/i) + expect(prompt).toMatch(/already.*context.*use it|Do not re.?search.*re.?derive/i) + }) +}) + +describe("GLM Prompt Quality: Cross-Agent Consistency", () => { + test("#given GLM Sisyphus + SJ #then both have vision constraint", () => { + const sisyphusPrompt = getGlmPrompt(GLM_MODEL) + const sjPrompt = getSjPrompt(GLM_MODEL) + + expect(sisyphusPrompt).toMatch(/text.?only/i) + expect(sjPrompt).toMatch(/text.?only|GLM.*CANNOT.*images/i) + }) + + test("#given GLM Sisyphus + SJ #then both have working memory reference", () => { + const sisyphusPrompt = getGlmPrompt(GLM_MODEL) + const sjPrompt = getSjPrompt(GLM_MODEL) + + expect(sisyphusPrompt).toContain("Small_Context_Working_Memory") + expect(sjPrompt).toContain("Small_Context_Working_Memory") + }) + + test("#given GLM Sisyphus + SJ #then both have tiered verification", () => { + const sisyphusPrompt = getGlmPrompt(GLM_MODEL) + const sjPrompt = getSjPrompt(GLM_MODEL) + + expect(sisyphusPrompt).toMatch(/V1.*trivial|V1.*trivial.*local/i) + expect(sisyphusPrompt).toMatch(/V2.*moderate/i) + expect(sisyphusPrompt).toMatch(/V3.*full.*rigor|V3.*broad/i) + expect(sjPrompt).toMatch(/V1 trivial change|V1.*lsp_diagnostics.*changed file/i) + expect(sjPrompt).toMatch(/V2 moderate change/i) + expect(sjPrompt).toMatch(/V3 broad.*risky change/i) + }) + + test("#given GLM factory config #then thinking is enabled without budgetTokens for all models", () => { + for (const model of GLM_MODELS) { + const sisyphus = createSisyphusAgent(model) + const sj = createSJ({ model }) + + expect(sisyphus.thinking).toEqual({ type: "enabled" }) + expect(sj.thinking).toEqual({ type: "enabled" }) + } + }) +}) diff --git a/src/agents/sisyphus-junior/glm.ts b/src/agents/sisyphus-junior/glm.ts index 85c456ff400..e246990314a 100644 --- a/src/agents/sisyphus-junior/glm.ts +++ 
b/src/agents/sisyphus-junior/glm.ts @@ -7,6 +7,42 @@ export function buildGlmSisyphusJuniorPrompt( ): string { const prompt = `${buildDefaultSisyphusJuniorPrompt(useTaskSystem)} + +## Execution-First Mindset +- You are an executor. Read task → execute → verify → done. +- No deliberation on approach. Pick the obvious repo-consistent path and move. +- Implement EXACTLY what was delegated. No extra features, no scope creep. + +## Brief Thinking Mandate +- Think concisely about the implementation. Execute immediately. +- Do not deliberate on alternatives unless the first approach concretely fails. +- Trim reasoning to essentials. The output that matters is working code, not thinking prose. + +## Re-entry Rule +- If this is a confirmed, decided, or continuation turn, do not re-verbalize the whole plan. +- User confirms/refines prior approach → one short acknowledgment, then act. +- User chose an option already discussed → follow it. Do not reopen eliminated alternatives. +- Answer already exists in current context → use it. Do not re-search or re-derive. + +## Exploration Budget +- Codebase exploration is capped at 2 search iterations, then proceed with best available info. +- Iteration means one parallel wave of reads/searches/agent calls plus synthesis. +- Stop earlier when you find the needed pattern, owner file, or verification target. +- Do not perform a second iteration just to be sure. + +## Tiered Verification +- V1 trivial change: lsp_diagnostics on changed file only. +- V2 moderate change: lsp_diagnostics on changed files + relevant tests. +- V3 broad/risky change: lsp_diagnostics on changed files + all tests + build. +- Promote V1/V2 to the next tier if verification exposes broader impact. +- Stop after the first successful verification result. + +## Token Economy +- No restating the user request. +- No progress essays. Report only meaningful phase changes and final evidence. 
+- Prefer short final output: changed file(s), verification run, notable caveat if any. + + ## GLM context priorities - Keep the working set tiny: start from the current task prompt, the current file, and the latest verification output. diff --git a/src/agents/sisyphus.ts b/src/agents/sisyphus.ts index de9bb3d447a..00a3f162110 100644 --- a/src/agents/sisyphus.ts +++ b/src/agents/sisyphus.ts @@ -18,7 +18,7 @@ import { buildGeminiToolCallExamples, } from "./sisyphus/gemini"; import { buildClaudeOpus47SisyphusPrompt } from "./sisyphus/claude-opus-4-7"; -import { buildGlmWorkingMemory, buildGlmVisionConstraint } from "./sisyphus/glm"; +import { buildGlmSisyphusPrompt } from "./sisyphus/glm-prompt"; import { buildGpt54SisyphusPrompt } from "./sisyphus/gpt-5-4"; import { buildGpt55SisyphusPrompt } from "./sisyphus/gpt-5-5"; import { buildKimiK26SisyphusPrompt } from "./sisyphus/kimi-k2-6"; @@ -629,14 +629,13 @@ export function createSisyphusAgent( } if (isGlmSisyphusHarnessModel(model)) { - prompt = prompt.replace( - "", - `\n\n${buildGlmWorkingMemory()}` - ); - - prompt = prompt.replace( - "**Default Bias: DELEGATE. WORK YOURSELF ONLY WHEN IT IS SUPER SIMPLE.**", - `**Default Bias: DELEGATE. 
WORK YOURSELF ONLY WHEN IT IS SUPER SIMPLE.**\n\n${buildGlmVisionConstraint()}` + prompt = buildGlmSisyphusPrompt( + model, + agents, + tools, + skills, + categories, + useTaskSystem, ); } diff --git a/src/agents/sisyphus/glm-prompt.ts b/src/agents/sisyphus/glm-prompt.ts new file mode 100644 index 00000000000..e1e886d38f3 --- /dev/null +++ b/src/agents/sisyphus/glm-prompt.ts @@ -0,0 +1,358 @@ +import { GPT_APPLY_PATCH_GUIDANCE } from "../gpt-apply-patch-guard" +import type { + AvailableAgent, + AvailableTool, + AvailableSkill, + AvailableCategory, +} from "../dynamic-agent-prompt-builder" +import { + buildAgentIdentitySection, + buildKeyTriggersSection, + buildToolSelectionTable, + buildExploreSection, + buildLibrarianSection, + buildDelegationTable, + buildCategorySkillsDelegationGuide, + buildOracleSection, + buildHardBlocksSection, + buildAntiPatternsSection, + buildAntiDuplicationSection, + buildNonClaudePlannerSection, + categorizeTools, +} from "../dynamic-agent-prompt-builder" +import { buildGlmVisionConstraint, buildGlmWorkingMemory } from "./glm" + +function buildGlmTasksSection(useTaskSystem: boolean): string { + if (useTaskSystem) { + return ` +Create tasks only when orchestration needs state: 2+ implementation steps, delegated work, cross-file changes, or verification follow-up. +Skip tasks for pure answers, V1 trivial edits, one-shot lookups, and background exploration turns. + +Workflow: +1. Create atomic tasks immediately when the threshold is met. +2. Mark exactly one task in_progress before starting it. +3. Mark completed immediately after the step succeeds. Never batch. +4. If background agents are pending and no non-overlapping work exists, stop and wait for completion notification. + +Clarification: ask one precise question only when the answer changes implementation materially. +` + } + + return ` +Create todos only when orchestration needs state: 2+ implementation steps, delegated work, cross-file changes, or verification follow-up. 
+Skip todos for pure answers, V1 trivial edits, one-shot lookups, and background exploration turns. + +Workflow: +1. Use \`todowrite\` immediately when the threshold is met. +2. Mark exactly one todo in_progress before starting it. +3. Mark completed immediately after the step succeeds. Never batch. +4. If background agents are pending and no non-overlapping work exists, stop and wait for completion notification. + +Clarification: ask one precise question only when the answer changes implementation materially. +` +} + +export function buildGlmSisyphusPrompt( + model: string, + availableAgents: AvailableAgent[], + availableTools: AvailableTool[] = [], + availableSkills: AvailableSkill[] = [], + availableCategories: AvailableCategory[] = [], + useTaskSystem = false, +): string { + const keyTriggers = buildKeyTriggersSection(availableAgents, availableSkills) + const toolSelection = buildToolSelectionTable( + availableAgents, + availableTools, + availableSkills, + ) + const exploreSection = buildExploreSection(availableAgents) + const librarianSection = buildLibrarianSection(availableAgents) + const categorySkillsGuide = buildCategorySkillsDelegationGuide( + availableCategories, + availableSkills, + ) + const delegationTable = buildDelegationTable(availableAgents) + const oracleSection = buildOracleSection(availableAgents) + const hardBlocks = buildHardBlocksSection() + const antiPatterns = buildAntiPatternsSection() + const nonClaudePlannerSection = buildNonClaudePlannerSection(model) + const tasksSection = buildGlmTasksSection(useTaskSystem) + const todoHookNote = useTaskSystem + ? 
"YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" + : "YOUR TODO CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TODO CONTINUATION])" + + const agentIdentity = buildAgentIdentitySection( + "Sisyphus", + "Powerful AI Agent with orchestration capabilities from OhMyOpenCode", + ) + + const identityBlock = ` +You are Sisyphus, the speed-first orchestrator from OhMyOpenCode. + +Your role is dispatch and synthesize. Think briefly about routing, delegate early, and synthesize agent results. + +GLM orchestration mandate: +- GLM models do not support budgetTokens, so restraint must come from the prompt. +- Classify intent in one line, then delegate before deep reasoning. +- Prefer parallel background agents for research and category+skill subagents for implementation. +- Self-implement only trivial local work. + +Core competencies: intent routing, parallel dispatch, category+skill delegation, verification synthesis, concise final reporting. + +You never start implementing unless the current user message explicitly asks for implementation. + +${todoHookNote} + +${buildGlmWorkingMemory()} +` + + const constraintsBlock = ` +${hardBlocks} + +${antiPatterns} + +GLM hard constraints: +- Do not deep-think before dispatch. Launch specialists first when they can help. +- Do not serialize independent exploration. Use parallel waves only. +- Do not self-implement complex work. Delegate it. +- Do not re-search what background agents were asked to find. +- Do not use Claude-style budgetTokens assumptions. Keep thinking concise. + +${buildGlmVisionConstraint()} +` + + const intentBlock = ` +Start with routing, not analysis. + +Concise thinking mandate: think briefly about routing, delegate before deep-diving, synthesize results from agents. + +Intent output rule: +- New actionable turn: "I read this as [intent] - [dispatch route]." +- Confirmation or already-decided turn: skip the preamble and proceed. 
+- Pure answer from current context: answer directly. + +${keyTriggers} + +Intent routes: +| Surface | True intent | GLM route | +|---|---|---| +| "explain", "how does" | Research/understanding | parallel explore/librarian → synthesize | +| "implement", "add", "create" | Code change | dispatch implementation subagent unless trivial | +| "look into", "check" | Investigation | background exploration → report | +| "what do you think" | Evaluation | assess → recommend → wait if action changes code | +| "broken", error text | Fix | quick diagnosis → delegate fix or trivial self edit | +| "refactor", "improve" | Open-ended change | scoped exploration → proposal or delegated execution if explicit | + +Turn-local reset: +- Classify only the current user message. +- Do not carry implementation mode from prior turns. +- If the current message is context, a question, or an investigation request, do not edit files. + + +The gate always runs, but verbalization is suppressed when it would repeat decided context. + +1. Confirmation turn: if the user confirms an already verbalized route, do not emit a new "I read this as" line. Proceed. +2. Explicit decision already stated: acknowledge once and execute. Do not re-litigate alternatives. +3. Post-decision meta-question: treat as request for acknowledgment or risk note, not a full re-analysis. +4. Already in context: if the answer is already in the conversation or current tool output, return it. Do not search. + + +Ask only when missing information would materially change the outcome or action has irreversible/external side effects. +` + + const exploreBlock = ` +## Exploration & Research + +GLM exploration principle: dispatch first, synthesize second. Do not linger in silent inspection before launching agents. + +Codebase assessment for open-ended work: +- Check configs and 2-3 similar files in one parallel wave. +- Follow disciplined patterns. Ask if patterns conflict. Propose conventions only for chaotic/greenfield code. 
+ +${toolSelection} + +${exploreSection} + +${librarianSection} + + +- Fire ALL independent background agents FIRST, then wait. +- For non-trivial codebase questions, launch 2-5 explore/librarian agents in parallel. +- Combine background agents with independent reads only when the work does not overlap. +- Never serialize exploration agents. One wave beats five sequential calls. + + + +- Prefer tools over memory for file contents, project commands, diagnostics, and current docs. +- Parallelize independent reads/searches/diagnostics. +- Explore and Librarian are background grep: always \`run_in_background=true\`. +- After delegating exploration, do not perform the same search yourself. +- If no non-overlapping work remains after launching background agents, end the response and wait for completion notification. + + + +Default budgets: +- Direct/local: 0-2 tool calls. Stop at first sufficient answer. +- Scoped/module: one parallel wave of 2-6 calls/agents, then synthesize. +- Open-ended: at most two parallel waves. Second wave only for a new unknown discovered by synthesis. + +Hard stops: +1. The answer is already in context. +2. The user supplied the fact you were about to verify and it is not safety-critical. +3. Two independent sources agree. +4. One full parallel wave answered the routing question. +5. A second wave would be "to be sure" rather than to answer a new unknown. + + +Background result collection: +1. Launch parallel agents and record task_ids. +2. Continue only with non-overlapping work. +3. If none exists, stop and wait for \`\`. +4. Collect via \`background_output\` only after the reminder. +5. Cancel disposable background tasks when no longer needed. + +${buildAntiDuplicationSection()} +` + + const executionLoopBlock = ` +## GLM Execution Loop: DISPATCH→DELEGATE→COLLECT→SYNTHESIZE→DONE + +1. DISPATCH + - Classify intent in one line. + - Identify all independent research, verification, and specialist routes. 
+ - Launch background explore/librarian agents immediately for non-trivial unknowns. + +2. DELEGATE + - Implementation defaults to category+skills delegation. + - Load relevant skills before delegation when any skill domain overlaps. + - Self-edit only when the change is V1 trivial: single file, local, <10 lines, no complex domain. + - Visual work must go to visual-engineering or multimodal-looker as appropriate. + - If self-editing, keep the diff surgical. ${GPT_APPLY_PATCH_GUIDANCE} + +3. COLLECT + - Wait for background completion notification before \`background_output\`. + - Read enough touched files/tool outputs to verify claims. Do not trust subagent summaries blindly. + - Continue the same task session for fixes instead of starting fresh. + +4. SYNTHESIZE + - Merge findings into the shortest complete answer or next action. + - Prefer evidence from current tool outputs over memory. + - Do not re-run exploration once convergence conditions are met. + +5. DONE + - Mark all tasks/todos completed. + - Run required verification tier. + - Report outcome, evidence, and any blocker/pre-existing issue. + + +Verification is mandatory, but scope is tiered. + +V1 - trivial/local: +- Single file, <10 changed lines, no behavior change. +- Run \`lsp_diagnostics\` on changed file. Stop after success. + +V2 - moderate: +- Single domain, ≤3 files, behavior/config/prompt change. +- Run \`lsp_diagnostics\` on changed files. +- Run focused test/build command only when applicable and discoverable without broad exploration. + +V3 - full rigor: +- Cross-cutting changes, public API changes, user-visible behavior, release/publish risk, or delegated implementation. +- Run diagnostics on all changed files, relevant tests, and build if applicable. +- For delegated work, inspect changed files and verify requirements yourself. + +Most GLM orchestration work should finish at V1/V2. Promote only when risk or scope requires it. 
+ + +Failure recovery: +- Fix root causes only for issues caused by your changes. +- One retry for V1, up to two for V2/V3, then consult Oracle or ask. +- Never leave known broken changes unreported. +` + + const delegationBlock = ` +## Delegation System + +Pre-delegation: +0. Load relevant skills when available. Prefer category+skills over solo execution. + +${categorySkillsGuide} + +${nonClaudePlannerSection} + +${delegationTable} + +Delegation prompt structure: +\`\`\` +1. TASK: Atomic, specific goal +2. EXPECTED OUTCOME: Concrete deliverables and success criteria +3. REQUIRED TOOLS: Explicit tool whitelist +4. MUST DO: Exhaustive requirements +5. MUST NOT DO: Forbidden actions +6. CONTEXT: File paths, existing patterns, constraints +\`\`\` + +GLM delegation defaults: +- Research: explore/librarian in background, parallel. +- Implementation: task category with load_skills. +- Architecture/debug uncertainty: Oracle before editing. +- Visual/media: multimodal-looker or visual-engineering, never GLM self-analysis of images. + +Heavy work routing: +- Long or complex implementation (multi-file, multi-step, research-heavy) → delegate to Hephaestus via \`task(category=\"deep\", load_skills=[...])\`. Hephaestus is the deep-thinking autonomous worker. Let him handle it. +- Quick targeted edits, single-file fixes, trivial config changes → self-implement or use Sisyphus-Junior with quick category. +- Frontend/visual → visual-engineering category with appropriate skills. +- The key speed insight: if a task needs more than 3 sequential self-edits, delegate it to Hephaestus instead. Your time is better spent dispatching and synthesizing. +Session continuity: +- Use returned task/session ids for follow-ups and verification fixes. +- Do not start fresh when a delegated agent already has context. + +${oracleSection ? 
`### Oracle + +${oracleSection}` : ""} +` + + const styleBlock = `` + + return `${agentIdentity} +${identityBlock} + +${constraintsBlock} + +${intentBlock} + +${exploreBlock} + +${executionLoopBlock} + +${delegationBlock} + +${tasksSection} + +${styleBlock}` +} + +export { categorizeTools } From eba0f4cb9c076e2f206067db9e8b0158be6e5972 Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Fri, 1 May 2026 23:36:25 +0900 Subject: [PATCH 08/25] refactor(agents): strengthen GLM vision constraints and remove AI slop - Add buildGlmSubagentVisionBlock() for concise subagent vision warnings - Apply vision constraint to Oracle, Metis, Momus (GLM branches) - Simplify GLM SJ speed overlay prompt (remove 4 redundant lines) - Remove redundant JSDoc from metis.ts, marketing language from sisyphus.ts - Centralize Sisyphus description as SISYPHUS_DESCRIPTION constant --- src/agents/metis.ts | 17 ++--------------- src/agents/momus.ts | 3 ++- src/agents/oracle.ts | 3 ++- src/agents/sisyphus-junior/glm.ts | 9 +++------ src/agents/sisyphus.ts | 25 +++++++++++-------------- src/agents/sisyphus/glm-prompt.ts | 4 +++- src/agents/sisyphus/glm.ts | 28 ++++++++++++++++++++++++++++ 7 files changed, 51 insertions(+), 38 deletions(-) diff --git a/src/agents/metis.ts b/src/agents/metis.ts index 32ee2e3af6f..dc4b1325a32 100644 --- a/src/agents/metis.ts +++ b/src/agents/metis.ts @@ -1,25 +1,12 @@ import type { AgentConfig } from "@opencode-ai/sdk" import type { AgentMode, AgentPromptMetadata } from "./types" import { isGlmThinkingModel } from "./types" +import { buildGlmSubagentVisionBlock } from "./sisyphus/glm" import { buildAntiDuplicationSection } from "./dynamic-agent-prompt-builder" import { createAgentToolRestrictions } from "../shared/permission-compat" const MODE: AgentMode = "subagent" -/** - * Metis - Plan Consultant Agent - * - * Named after the Greek goddess of wisdom, prudence, and deep counsel. - * Metis analyzes user requests BEFORE planning to prevent AI failures. 
- * - * Core responsibilities: - * - Identify hidden intentions and unstated requirements - * - Detect ambiguities that could derail implementation - * - Flag potential AI-slop patterns (over-engineering, scope creep) - * - Generate clarifying questions for the user - * - Prepare directives for the planner agent - */ - export const METIS_SYSTEM_PROMPT = `# Metis - Pre-Planning Consultant ## CONSTRAINTS @@ -312,7 +299,7 @@ export function createMetisAgent(model: string): AgentConfig { } as AgentConfig if (isGlmThinkingModel(model)) { - return { ...base, thinking: { type: "enabled" } } as AgentConfig + return { ...base, thinking: { type: "enabled" }, prompt: base.prompt + buildGlmSubagentVisionBlock() } as AgentConfig } return { ...base, thinking: { type: "enabled", budgetTokens: 32000 } } as AgentConfig diff --git a/src/agents/momus.ts b/src/agents/momus.ts index db13154dc8e..147951786ff 100644 --- a/src/agents/momus.ts +++ b/src/agents/momus.ts @@ -1,6 +1,7 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; import { isGlmThinkingModel, isGptModel } from "./types"; +import { buildGlmSubagentVisionBlock } from "./sisyphus/glm"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -283,7 +284,7 @@ export function createMomusAgent(model: string): AgentConfig { } if (isGlmThinkingModel(model)) { - return { ...base, thinking: { type: "enabled" } } as AgentConfig; + return { ...base, thinking: { type: "enabled" }, prompt: base.prompt + buildGlmSubagentVisionBlock() } as AgentConfig; } return { diff --git a/src/agents/oracle.ts b/src/agents/oracle.ts index b99406b55aa..c38c1179102 100644 --- a/src/agents/oracle.ts +++ b/src/agents/oracle.ts @@ -1,6 +1,7 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; import { isGlmThinkingModel, isGpt5_5Model, isGptModel } from "./types"; 
+import { buildGlmSubagentVisionBlock } from "./sisyphus/glm"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -430,7 +431,7 @@ export function createOracleAgent(model: string): AgentConfig { } if (isGlmThinkingModel(model)) { - return { ...base, thinking: { type: "enabled" } } as AgentConfig; + return { ...base, thinking: { type: "enabled" }, prompt: base.prompt + buildGlmSubagentVisionBlock() } as AgentConfig; } return { diff --git a/src/agents/sisyphus-junior/glm.ts b/src/agents/sisyphus-junior/glm.ts index e246990314a..f3d8a6ed007 100644 --- a/src/agents/sisyphus-junior/glm.ts +++ b/src/agents/sisyphus-junior/glm.ts @@ -16,7 +16,6 @@ export function buildGlmSisyphusJuniorPrompt( ## Brief Thinking Mandate - Think concisely about the implementation. Execute immediately. - Do not deliberate on alternatives unless the first approach concretely fails. -- Trim reasoning to essentials. The output that matters is working code, not thinking prose. ## Re-entry Rule - If this is a confirmed, decided, or continuation turn, do not re-verbalize the whole plan. @@ -27,7 +26,6 @@ export function buildGlmSisyphusJuniorPrompt( ## Exploration Budget - Codebase exploration is capped at 2 search iterations, then proceed with best available info. - Iteration means one parallel wave of reads/searches/agent calls plus synthesis. -- Stop earlier when you find the needed pattern, owner file, or verification target. - Do not perform a second iteration just to be sure. ## Tiered Verification @@ -39,22 +37,21 @@ export function buildGlmSisyphusJuniorPrompt( ## Token Economy - No restating the user request. -- No progress essays. Report only meaningful phase changes and final evidence. - Prefer short final output: changed file(s), verification run, notable caveat if any. ## GLM context priorities - Keep the working set tiny: start from the current task prompt, the current file, and the latest verification output. 
-- Treat .sisyphus/state/{plan-or-session}/ as optional Sisyphus handoff context only. - Read only the slice named in the task prompt, or the file/output directly needed for the current step. - Do not expand into a full ledger or read unrelated state files. ## Vision Constraint (GLM text-only) - GLM models (GLM-5, GLM-5.1, GLM-5-turbo) CANNOT render or analyze images, screenshots, or visual content. - When a task involves viewing images or visual content, delegate to the multimodal-looker agent instead of attempting it yourself. -` - +- NEVER call look_at, read (on image files), or screenshot tools. They WILL FAIL. +- ALWAYS delegate to multimodal-looker agent. If zai-mcp-server tools appear in your tool list, you may use them as secondary option. +`; if (!promptAppend) return prompt return prompt + "\n\n" + resolvePromptAppend(promptAppend) } diff --git a/src/agents/sisyphus.ts b/src/agents/sisyphus.ts index 00a3f162110..4eb3ebeb60a 100644 --- a/src/agents/sisyphus.ts +++ b/src/agents/sisyphus.ts @@ -33,6 +33,8 @@ export const SISYPHUS_PROMPT_METADATA: AgentPromptMetadata = { promptAlias: "Sisyphus", triggers: [], }; + +const SISYPHUS_DESCRIPTION = "Orchestration agent for OhMyOpenCode."; import type { AvailableAgent, AvailableTool, @@ -89,16 +91,16 @@ function buildDynamicSisyphusPrompt( const agentIdentity = buildAgentIdentitySection( "Sisyphus", - "Powerful AI Agent with orchestration capabilities from OhMyOpenCode", + SISYPHUS_DESCRIPTION, ); return `${agentIdentity} -You are "Sisyphus" - Powerful AI Agent with orchestration capabilities from OhMyOpenCode. +You are "Sisyphus", the orchestration agent for OhMyOpenCode. -**Why Sisyphus?**: Humans roll their boulder every day. So do you. We're not so different-your code should be indistinguishable from a senior engineer's. +**Why Sisyphus?**: Work steadily. Delegate, verify, ship. -**Identity**: SF Bay Area engineer. Work, delegate, verify, ship. No AI slop. +**Identity**: Delegate, verify, ship. 
**Core Competencies**: - Parsing implicit requirements from explicit requests @@ -503,8 +505,7 @@ export function createSisyphusAgent( useTaskSystem, ); return { - description: - "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. (Sisyphus - OhMyOpenCode)", + description: SISYPHUS_DESCRIPTION, mode: MODE, model, maxTokens: 64000, @@ -531,8 +532,7 @@ export function createSisyphusAgent( useTaskSystem, ); return { - description: - "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. (Sisyphus - OhMyOpenCode)", + description: SISYPHUS_DESCRIPTION, mode: MODE, model, maxTokens: 64000, @@ -558,8 +558,7 @@ export function createSisyphusAgent( useTaskSystem, ); return { - description: - "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. (Sisyphus - OhMyOpenCode)", + description: SISYPHUS_DESCRIPTION, mode: MODE, model, maxTokens: 64000, @@ -585,8 +584,7 @@ export function createSisyphusAgent( useTaskSystem, ); return { - description: - "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. 
(Sisyphus - OhMyOpenCode)", + description: SISYPHUS_DESCRIPTION, mode: MODE, model, maxTokens: 64000, @@ -646,8 +644,7 @@ export function createSisyphusAgent( ...getGptApplyPatchPermission(model), } as AgentConfig["permission"]; const base = { - description: - "Powerful AI orchestrator. Plans obsessively with todos, assesses search complexity before exploration, delegates strategically via category+skills combinations. Uses explore for internal code (parallel-friendly), librarian for external docs. (Sisyphus - OhMyOpenCode)", + description: SISYPHUS_DESCRIPTION, mode: MODE, model, maxTokens: 64000, diff --git a/src/agents/sisyphus/glm-prompt.ts b/src/agents/sisyphus/glm-prompt.ts index e1e886d38f3..64d159a05b8 100644 --- a/src/agents/sisyphus/glm-prompt.ts +++ b/src/agents/sisyphus/glm-prompt.ts @@ -20,7 +20,7 @@ import { buildNonClaudePlannerSection, categorizeTools, } from "../dynamic-agent-prompt-builder" -import { buildGlmVisionConstraint, buildGlmWorkingMemory } from "./glm" +import { buildGlmVisionConstraint, buildGlmVisionHardBlock, buildGlmWorkingMemory } from "./glm" function buildGlmTasksSection(useTaskSystem: boolean): string { if (useTaskSystem) { @@ -120,6 +120,8 @@ GLM hard constraints: - Do not use Claude-style budgetTokens assumptions. Keep thinking concise. ${buildGlmVisionConstraint()} + +${buildGlmVisionHardBlock()} ` const intentBlock = ` diff --git a/src/agents/sisyphus/glm.ts b/src/agents/sisyphus/glm.ts index 47f2671f467..8d7c4064cfc 100644 --- a/src/agents/sisyphus/glm.ts +++ b/src/agents/sisyphus/glm.ts @@ -36,3 +36,31 @@ export function buildGlmVisionConstraint(): string { - For browser visual testing (screenshot verification, UI diff), delegate to \`multimodal-looker\` or use \`visual-engineering\` category with \`playwright\` skill. `; } + +export function buildGlmVisionHardBlock(): string { + return `## GLM Vision Constraint (HARD BLOCK) + +You are a TEXT-ONLY model. You CANNOT see images. 
+ +NEVER call these tools yourself for images/screenshots/PDFs: +- \`look_at\` +- \`read\` (on image/PDF/binary files) +- \`brave-devtools_take_screenshot\` +- \`playwright_browser_take_screenshot\` +- \`figma_get_screenshot\` + +When user shares an image, screenshot, or asks to analyze visual content: +1. Delegate to \`multimodal-looker\` agent. +2. If \`zai-mcp-server_*\` tools are available, you may use them as a secondary option. + +Do not inspect visual content directly.`; +} + +export function buildGlmSubagentVisionBlock(): string { + return ` + +## GLM Vision Constraint (HARD BLOCK) +You are a TEXT-ONLY model. You CANNOT see images. +Never call look_at, read (on image files), or screenshot tools. Delegate to multimodal-looker; if zai-mcp-server tools are available, they may be used as a secondary option. +`; +} From 554a4fa0fc3537eebf4af7dbfcc366ed8c6c7385 Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Sun, 3 May 2026 20:42:01 +0900 Subject: [PATCH 09/25] fix(sj-glm): remove stale assertion for full working memory path in junior prompt test The SJ GLM prompt intentionally omits the .sisyphus/state/{plan-or-session}/ path and individual slice filenames (goal.md, decisions.md, etc.) that the main Sisyphus GLM prompt includes. The test incorrectly expected the full ledger path; align it with the lightweight memory contract. 
--- src/agents/sisyphus-junior/glm.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agents/sisyphus-junior/glm.test.ts b/src/agents/sisyphus-junior/glm.test.ts index d86eeaab54b..198cb6ead3b 100644 --- a/src/agents/sisyphus-junior/glm.test.ts +++ b/src/agents/sisyphus-junior/glm.test.ts @@ -32,7 +32,6 @@ describe("buildGlmSisyphusJuniorPrompt", () => { const prompt = buildGlmSisyphusJuniorPrompt(false) // then - expect(prompt).toContain(".sisyphus/state/{plan-or-session}/") expect(prompt).toContain("Read only the slice named in the task prompt") expect(prompt).not.toContain("Toggle RL") expect(prompt).not.toContain("goal.md") From 44426b1ac0e4f2e42503cb5494779a1505c6f33d Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Wed, 6 May 2026 15:42:53 +0900 Subject: [PATCH 10/25] perf(sisyphus): add direct Hephaestus delegation for GLM routing --- src/agents/sisyphus.glm-routing.test.ts | 10 +++++++ src/agents/sisyphus/glm-prompt.ts | 28 +++++++++++++++++-- .../tool-config-handler.test.ts | 24 ++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/src/agents/sisyphus.glm-routing.test.ts b/src/agents/sisyphus.glm-routing.test.ts index 5e82b2a9596..11622a319c8 100644 --- a/src/agents/sisyphus.glm-routing.test.ts +++ b/src/agents/sisyphus.glm-routing.test.ts @@ -60,4 +60,14 @@ describe("createSisyphusAgent - GLM routing", () => { expect(agent.thinking).toEqual({ type: "enabled" }); expect((agent as Record).reasoningEffort).toBeUndefined(); }); + + test("#given GLM harness model #then strongly biases implementation toward category delegation", () => { + const prompt = getPrompt("zai/glm-5.1"); + + expect(prompt).toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); + expect(prompt).toContain("NEVER implement directly"); + expect(prompt).toContain("DIRECT HEPHAESTUS DELEGATION - YOUR IMPLEMENTATION PATH"); + expect(prompt).toContain('call_omo_agent(subagent_type="hephaestus"'); + expect(prompt).toContain("delegate to Hephaestus"); 
+ }); }); diff --git a/src/agents/sisyphus/glm-prompt.ts b/src/agents/sisyphus/glm-prompt.ts index 64d159a05b8..f5a2276b3e9 100644 --- a/src/agents/sisyphus/glm-prompt.ts +++ b/src/agents/sisyphus/glm-prompt.ts @@ -17,6 +17,7 @@ import { buildHardBlocksSection, buildAntiPatternsSection, buildAntiDuplicationSection, + buildParallelDelegationSection, buildNonClaudePlannerSection, categorizeTools, } from "../dynamic-agent-prompt-builder" @@ -77,6 +78,24 @@ export function buildGlmSisyphusPrompt( const hardBlocks = buildHardBlocksSection() const antiPatterns = buildAntiPatternsSection() const nonClaudePlannerSection = buildNonClaudePlannerSection(model) + const parallelDelegationSection = `### DIRECT HEPHAESTUS DELEGATION - YOUR IMPLEMENTATION PATH + +For long or complex implementation, call Hephaestus directly with \`call_omo_agent(subagent_type="hephaestus", run_in_background=true, ...)\` when that tool is available. Hephaestus is the autonomous implementation worker; GLM is the orchestrator. + +Use category delegation only when direct Hephaestus invocation is unavailable or when a domain category is more precise. + +${buildParallelDelegationSection(model, availableCategories) || + `### DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER + +**YOUR FAILURE MODE: You attempt to do work yourself instead of decomposing and delegating.** When an implementation task is not V1 trivial, specialized subagents are faster and more reliable than GLM solo execution. + +**MANDATORY - for ANY non-trivial implementation task:** + +1. **ALWAYS decompose** the task into independent work units. +2. **ALWAYS delegate** each unit to available category workers, preferably \`deep\` or \`unspecified-high\`, in parallel. +3. **NEVER implement directly** when delegation is possible. You write prompts, collect results, verify, and synthesize. + +**Your value is orchestration, decomposition, and quality control. 
Delegating with crystal-clear prompts IS your work.**`}` const tasksSection = buildGlmTasksSection(useTaskSystem) const todoHookNote = useTaskSystem ? "YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" @@ -283,6 +302,8 @@ ${categorySkillsGuide} ${nonClaudePlannerSection} +${parallelDelegationSection} + ${delegationTable} Delegation prompt structure: @@ -297,15 +318,16 @@ Delegation prompt structure: GLM delegation defaults: - Research: explore/librarian in background, parallel. -- Implementation: task category with load_skills. +- Implementation: direct Hephaestus via \`call_omo_agent\` for complex work; category task with load_skills for domain-specific work. - Architecture/debug uncertainty: Oracle before editing. - Visual/media: multimodal-looker or visual-engineering, never GLM self-analysis of images. Heavy work routing: -- Long or complex implementation (multi-file, multi-step, research-heavy) → delegate to Hephaestus via \`task(category=\"deep\", load_skills=[...])\`. Hephaestus is the deep-thinking autonomous worker. Let him handle it. +- Long or complex implementation (multi-file, multi-step, research-heavy) → delegate to Hephaestus via \`call_omo_agent(subagent_type="hephaestus", run_in_background=true, prompt=...)\`. Do not self-implement. +- Domain-specific implementation where a category is clearly better → delegate via \`task(category=..., load_skills=[...], run_in_background=true)\`. - Quick targeted edits, single-file fixes, trivial config changes → self-implement or use Sisyphus-Junior with quick category. - Frontend/visual → visual-engineering category with appropriate skills. -- The key speed insight: if a task needs more than 3 sequential self-edits, delegate it to Hephaestus instead. Your time is better spent dispatching and synthesizing. +- The key speed insight: if a task needs more than 3 sequential self-edits, decompose and delegate it instead. Your time is better spent dispatching and synthesizing. 
Session continuity: - Use returned task/session ids for follow-ups and verification fixes. - Do not start fresh when a delegated agent already has context. diff --git a/src/plugin-handlers/tool-config-handler.test.ts b/src/plugin-handlers/tool-config-handler.test.ts index e6cb1e222c9..0bc3b088aec 100644 --- a/src/plugin-handlers/tool-config-handler.test.ts +++ b/src/plugin-handlers/tool-config-handler.test.ts @@ -217,6 +217,30 @@ describe("applyToolConfig", () => { }) }) + describe("#given Sisyphus needs direct specialist delegation", () => { + it("#then should allow call_omo_agent for Sisyphus", () => { + const params = createParams({ agents: ["sisyphus"] }) + + applyToolConfig(params) + + const agent = params.agentResult.sisyphus as { + permission: Record + } + expect(agent.permission.call_omo_agent).toBe("allow") + }) + + it("#then should keep call_omo_agent denied for Hephaestus", () => { + const params = createParams({ agents: ["hephaestus"] }) + + applyToolConfig(params) + + const agent = params.agentResult.hephaestus as { + permission: Record + } + expect(agent.permission.call_omo_agent).toBe("deny") + }) + }) + describe("#given task_system is undefined", () => { describe("#when applying tool config", () => { it("#then should not disable todo tools globally by default", () => { From 0725c4286b0fa8d99e67a353a2d143cc3660849e Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Wed, 6 May 2026 16:51:40 +0900 Subject: [PATCH 11/25] fix(momus): restore isGlmThinkingModel branch to skip budgetTokens for GLM Upstream removed GLM-specific thinking config from Momus, causing budgetTokens: 32000 to be applied to GLM models that do not support it. Restore the isGlmThinkingModel branch matching Metis and Oracle. 
Also add test coverage for: - Momus GLM thinking config without budgetTokens - Sisyphus call_omo_agent permission (allow) vs Hephaestus (deny) --- scripts/benchmark-glm-delegation.ts | 170 ++++++++++++++++++ src/agents/momus.ts | 6 +- src/agents/sisyphus.glm-routing.test.ts | 34 +++- src/cli/run/delegation-scorecard.test.ts | 86 +++++++++ src/cli/run/delegation-scorecard.ts | 129 +++++++++++++ src/cli/run/event-metric-collector.test.ts | 89 +++++++++ src/cli/run/event-metric-collector.ts | 106 +++++++++++ src/cli/run/index.ts | 4 + .../tool-config-handler.test.ts | 22 +++ src/plugin-handlers/tool-config-handler.ts | 2 +- .../call-omo-agent-sisyphus.test.ts | 31 ++++ 11 files changed, 675 insertions(+), 4 deletions(-) create mode 100644 scripts/benchmark-glm-delegation.ts create mode 100644 src/cli/run/delegation-scorecard.test.ts create mode 100644 src/cli/run/delegation-scorecard.ts create mode 100644 src/cli/run/event-metric-collector.test.ts create mode 100644 src/cli/run/event-metric-collector.ts create mode 100644 src/tools/call-omo-agent/call-omo-agent-sisyphus.test.ts diff --git a/scripts/benchmark-glm-delegation.ts b/scripts/benchmark-glm-delegation.ts new file mode 100644 index 00000000000..3e6cd47e9d5 --- /dev/null +++ b/scripts/benchmark-glm-delegation.ts @@ -0,0 +1,170 @@ +#!/usr/bin/env bun + +import { calculateScorecard, SCORECARD_VERSION } from "../src/cli/run/delegation-scorecard" +import type { ScorecardResult, ScorecardTier } from "../src/cli/run/delegation-scorecard" +import { createEventMetricCollector } from "../src/cli/run/event-metric-collector" +import type { MetricSnapshot } from "../src/cli/run/event-metric-collector" + +const DEFAULT_MODEL = "zai-coding-plan/glm-5.1" + +interface CliArgs { + help: boolean + dryRun: boolean + output: string | null + model: string +} + +interface ScenarioReport { + scenarioId: string + tier: ScorecardTier + durationMs: number + success: boolean + snapshot: MetricSnapshot + scorecard: ScorecardResult +} + 
+interface BenchmarkReport { + model: string + timestamp: string + scenarios: ScenarioReport[] + aggregate: { + scenarioCount: number + averageTotalScore: number + averageDelegationRate: number + passedScenarios: number + } + scorecardVersion: string +} + +function parseArgs(argv: string[]): CliArgs { + const parsed: CliArgs = { + help: false, + dryRun: false, + output: null, + model: DEFAULT_MODEL, + } + + for (let index = 0; index < argv.length; index++) { + const arg = argv[index] + if (arg === "--help") { + parsed.help = true + } else if (arg === "--dry-run") { + parsed.dryRun = true + } else if (arg === "--output" && argv[index + 1]) { + parsed.output = argv[index + 1] + index++ + } else if (arg === "--model" && argv[index + 1]) { + parsed.model = argv[index + 1] + index++ + } + } + + return parsed +} + +function printUsage(): void { + console.log(`Usage: bun scripts/benchmark-glm-delegation.ts [options] + +Options: + --help Print this help message + --dry-run Generate synthetic fixture data without model or server calls + --output Write JSON report to a file + --model Model identifier (default: ${DEFAULT_MODEL})`) +} + +function average(values: number[]): number { + if (values.length === 0) return 0 + return Math.round((values.reduce((sum, value) => sum + value, 0) / values.length) * 100) / 100 +} + +function createSyntheticSnapshot(tier: ScorecardTier): MetricSnapshot { + const collector = createEventMetricCollector() + + if (tier === "quick") { + collector.onToolExecute({ sessionID: "quick", name: "read", input: { filePath: "src/example.ts" } }) + collector.onToolExecute({ sessionID: "quick", name: "edit", input: { filePath: "src/example.ts" } }) + return collector.getSnapshot() + } + + if (tier === "medium") { + collector.onToolExecute({ sessionID: "medium", name: "task", input: { category: "quick" } }) + collector.onToolResult({ sessionID: "medium", name: "task", output: "Task completed" }) + collector.onToolExecute({ sessionID: "medium", name: "read", 
input: { filePath: "src/example.ts" } }) + collector.onToolExecute({ sessionID: "medium", name: "hashline_edit", input: { file: "src/example.ts" } }) + return collector.getSnapshot() + } + + collector.onToolExecute({ sessionID: "deep", name: "call_omo_agent", input: { subagent_type: "hephaestus" } }) + collector.onToolResult({ sessionID: "deep", name: "call_omo_agent", output: "Background agent task launched successfully" }) + collector.onToolExecute({ sessionID: "deep", name: "task", input: { category: "deep" } }) + collector.onToolResult({ sessionID: "deep", name: "task", output: "Task completed" }) + collector.onToolExecute({ sessionID: "deep", name: "read", input: { filePath: "src/example.ts" } }) + return collector.getSnapshot() +} + +function createScenarioReport(scenarioId: string, tier: ScorecardTier, durationMs: number): ScenarioReport { + const snapshot = createSyntheticSnapshot(tier) + const success = true + const scorecard = calculateScorecard({ scenarioId, tier, snapshot, durationMs, success }) + + return { + scenarioId, + tier, + durationMs, + success, + snapshot, + scorecard, + } +} + +function createDryRunReport(model: string): BenchmarkReport { + const scenarios = [ + createScenarioReport("quick-trivial-edit", "quick", 600), + createScenarioReport("medium-scoped-change", "medium", 1800), + createScenarioReport("deep-delegated-implementation", "deep", 4200), + ] + + return { + model, + timestamp: new Date().toISOString(), + scenarios, + aggregate: { + scenarioCount: scenarios.length, + averageTotalScore: average(scenarios.map((scenario) => scenario.scorecard.totalScore)), + averageDelegationRate: average(scenarios.map((scenario) => scenario.scorecard.delegationRate)), + passedScenarios: scenarios.filter((scenario) => scenario.scorecard.passed).length, + }, + scorecardVersion: SCORECARD_VERSION, + } +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)) + + if (args.help) { + printUsage() + return + } + + if 
(!args.dryRun) { + console.error("Runtime model benchmarking is not implemented in this script. Use --dry-run for deterministic fixtures.") + printUsage() + return + } + + const report = createDryRunReport(args.model) + const json = `${JSON.stringify(report, null, 2)}\n` + + if (args.output) { + await Bun.write(args.output, json) + console.error(`Wrote GLM delegation benchmark report to ${args.output}`) + return + } + + console.log(json) +} + +main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)) + process.exitCode = 1 +}) diff --git a/src/agents/momus.ts b/src/agents/momus.ts index 76eb592e5a8..6000e103ac7 100644 --- a/src/agents/momus.ts +++ b/src/agents/momus.ts @@ -1,6 +1,6 @@ import type { AgentConfig } from "@opencode-ai/sdk"; import type { AgentMode, AgentPromptMetadata } from "./types"; -import { isGptModel } from "./types"; +import { isGlmThinkingModel, isGptModel } from "./types"; import { createAgentToolRestrictions } from "../shared/permission-compat"; const MODE: AgentMode = "subagent"; @@ -307,6 +307,10 @@ export function createMomusAgent(model: string): AgentConfig { } as AgentConfig; } + if (isGlmThinkingModel(model)) { + return { ...base, thinking: { type: "enabled" } } as AgentConfig; + } + return { ...base, thinking: { type: "enabled", budgetTokens: 32000 }, diff --git a/src/agents/sisyphus.glm-routing.test.ts b/src/agents/sisyphus.glm-routing.test.ts index 11622a319c8..b7f1c7ad4c4 100644 --- a/src/agents/sisyphus.glm-routing.test.ts +++ b/src/agents/sisyphus.glm-routing.test.ts @@ -3,8 +3,8 @@ import { describe, expect, test } from "bun:test"; import { createSisyphusAgent } from "./sisyphus"; -function getPrompt(model: string): string { - const agent = createSisyphusAgent(model); +function getPrompt(model: string, availableCategories?: Parameters[4]): string { + const agent = createSisyphusAgent(model, undefined, undefined, undefined, availableCategories); return agent.prompt ?? 
""; } @@ -48,6 +48,8 @@ describe("createSisyphusAgent - GLM routing", () => { expect(prompt).toContain("Toggle RL"); expect(prompt).toContain("K2.x post-training context"); + expect(prompt).not.toContain("DIRECT HEPHAESTUS DELEGATION"); + expect(prompt).not.toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); expect(prompt).not.toContain(".sisyphus/state/"); expect(prompt).not.toContain(""); expect(prompt).not.toContain("goal.md"); @@ -70,4 +72,32 @@ describe("createSisyphusAgent - GLM routing", () => { expect(prompt).toContain('call_omo_agent(subagent_type="hephaestus"'); expect(prompt).toContain("delegate to Hephaestus"); }); + + test("#given GLM harness model #when building prompt #then preserves direct delegation markers", () => { + const prompt = getPrompt("zai/glm-5.1"); + + expect(prompt).toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); + expect(prompt).toContain("DIRECT HEPHAESTUS DELEGATION"); + }); + + test("#given GLM harness model #when building prompt #then places delegation before verification and style sections", () => { + const prompt = getPrompt("zai/glm-5.1"); + const delegationIndex = prompt.indexOf("DIRECT HEPHAESTUS DELEGATION"); + const verificationIndex = prompt.indexOf(""); + const styleIndex = prompt.indexOf("` +} - return `${agentIdentity} -${identityBlock} +export function buildGlmSisyphusPrompt( + model: string, + availableAgents: AvailableAgent[], + availableTools: AvailableTool[] = [], + availableSkills: AvailableSkill[] = [], + availableCategories: AvailableCategory[] = [], + useTaskSystem = false, +): string { + const keyTriggers = buildKeyTriggersSection(availableAgents, availableSkills) + const toolSelection = buildToolSelectionTable( + availableAgents, + availableTools, + availableSkills, + ) + const exploreSection = buildExploreSection(availableAgents) + const librarianSection = buildLibrarianSection(availableAgents) + const categorySkillsGuide = buildCategorySkillsDelegationGuide( + 
availableCategories, + availableSkills, + ) + const delegationTable = buildDelegationTable(availableAgents) + const oracleSection = buildOracleSection(availableAgents) + const hardBlocks = buildHardBlocksSection() + const antiPatterns = buildAntiPatternsSection() + const nonClaudePlannerSection = buildNonClaudePlannerSection(model) + const parallelDelegationSection = `### DIRECT HEPHAESTUS DELEGATION - YOUR IMPLEMENTATION PATH + +For long or complex implementation, call Hephaestus directly with \`call_omo_agent(subagent_type="hephaestus", run_in_background=true, ...)\` when that tool is available. Hephaestus is the autonomous implementation worker; GLM is the orchestrator. + +Use category delegation only when direct Hephaestus invocation is unavailable or when a domain category is more precise. + +${buildParallelDelegationSection(model, availableCategories) || + `### DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER + +**YOUR FAILURE MODE: You attempt to do work yourself instead of decomposing and delegating.** When an implementation task is not V1 trivial, specialized subagents are faster and more reliable than GLM solo execution. + +**MANDATORY - for ANY non-trivial implementation task:** + +1. **ALWAYS decompose** the task into independent work units. +2. **ALWAYS delegate** each unit to available category workers, preferably \`deep\` or \`unspecified-high\`, in parallel. +3. **NEVER implement directly** when delegation is possible. You write prompts, collect results, verify, and synthesize. + +**Your value is orchestration, decomposition, and quality control. Delegating with crystal-clear prompts IS your work.**`}` + + const todoHookNote = useTaskSystem + ? 
"YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" + : "YOUR TODO CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TODO CONTINUATION])" + + const identityBlock = buildIdentityBlock(todoHookNote) + const constraintsBlock = buildConstraintsBlock(hardBlocks, antiPatterns) + const intentBlock = buildIntentBlock(keyTriggers) + const exploreBlock = buildExploreBlock(toolSelection, exploreSection, librarianSection) + const executionLoopBlock = buildExecutionLoopBlock() + const delegationBlock = buildDelegationBlock( + categorySkillsGuide, + nonClaudePlannerSection, + parallelDelegationSection, + delegationTable, + oracleSection, + ) + const styleBlock = buildStyleBlock() + const tasksSection = buildGlmTasksSection(useTaskSystem) + + return `${identityBlock} ${constraintsBlock} From d6c5bdbcda0a11aff0981bf707a22d0a04990924 Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 7 May 2026 00:33:07 +0900 Subject: [PATCH 16/25] chore(cli): remove benchmark-only exports from public CLI surface --- src/cli/run/index.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/cli/run/index.ts b/src/cli/run/index.ts index fc8e56e3761..65c4d93305f 100644 --- a/src/cli/run/index.ts +++ b/src/cli/run/index.ts @@ -4,11 +4,7 @@ export { resolveRunModel } from "./model-resolver" export { createServerConnection } from "./server-connection" export { resolveSession } from "./session-resolver" export { createJsonOutputManager } from "./json-output" -export { createEventMetricCollector } from "./event-metric-collector" -export { calculateScorecard, SCORECARD_VERSION } from "./delegation-scorecard" export { executeOnCompleteHook } from "./on-complete-hook" export { createEventState, processEvents, serializeError } from "./events" export type { EventState } from "./events" -export type { EventMetricCollector, MetricSnapshot } from "./event-metric-collector" -export type { ScorecardInput, ScorecardResult, ScorecardTier } from "./delegation-scorecard" export 
type { RunOptions, RunContext, RunResult, ServerConnection } from "./types" From 7d32830b9934a8b2f7f16c0fcc214ea2135729cd Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 7 May 2026 08:53:31 +0900 Subject: [PATCH 17/25] fix(glm-prompt): replace call_omo_agent references with task(category="deep") delegation --- src/agents/sisyphus/glm-prompt.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/agents/sisyphus/glm-prompt.ts b/src/agents/sisyphus/glm-prompt.ts index 134f39f7e4c..947316c704d 100644 --- a/src/agents/sisyphus/glm-prompt.ts +++ b/src/agents/sisyphus/glm-prompt.ts @@ -292,12 +292,12 @@ Delegation prompt structure: GLM delegation defaults: - Research: explore/librarian in background, parallel. -- Implementation: direct Hephaestus via \`call_omo_agent\` for complex work; category task with load_skills for domain-specific work. +- Implementation: delegate to Hephaestus via \`task(category="deep", load_skills=[...])\` for complex work; category task with load_skills for domain-specific work. - Architecture/debug uncertainty: Oracle before editing. - Visual/media: multimodal-looker or visual-engineering, never GLM self-analysis of images. Heavy work routing: -- Long or complex implementation (multi-file, multi-step, research-heavy) → delegate to Hephaestus via \`call_omo_agent(subagent_type="hephaestus", run_in_background=true, prompt=...)\`. Do not self-implement. +- Long or complex implementation (multi-file, multi-step, research-heavy) → delegate to Hephaestus via \`task(category="deep", load_skills=[], run_in_background=true, prompt=...)\`. Do not self-implement. - Domain-specific implementation where a category is clearly better → delegate via \`task(category=..., load_skills=[...], run_in_background=true)\`. - Quick targeted edits, single-file fixes, trivial config changes → self-implement or use Sisyphus-Junior with quick category. - Frontend/visual → visual-engineering category with appropriate skills. 
@@ -363,11 +363,11 @@ export function buildGlmSisyphusPrompt( const hardBlocks = buildHardBlocksSection() const antiPatterns = buildAntiPatternsSection() const nonClaudePlannerSection = buildNonClaudePlannerSection(model) - const parallelDelegationSection = `### DIRECT HEPHAESTUS DELEGATION - YOUR IMPLEMENTATION PATH + const parallelDelegationSection = `### DEEP DELEGATION - YOUR IMPLEMENTATION PATH -For long or complex implementation, call Hephaestus directly with \`call_omo_agent(subagent_type="hephaestus", run_in_background=true, ...)\` when that tool is available. Hephaestus is the autonomous implementation worker; GLM is the orchestrator. +For long or complex implementation, delegate to Hephaestus via \`task(category="deep", load_skills=[], run_in_background=true, ...)\`. Hephaestus is the autonomous implementation worker; GLM is the orchestrator. -Use category delegation only when direct Hephaestus invocation is unavailable or when a domain category is more precise. +Use domain-specific categories when they are more precise than \`deep\`. 
${buildParallelDelegationSection(model, availableCategories) || `### DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER From 06f01c84d22f8bd95fbda364674ea4948686214b Mon Sep 17 00:00:00 2001 From: ilseob lee Date: Thu, 7 May 2026 09:11:04 +0900 Subject: [PATCH 18/25] fix(tests): update tests to match call_omo_agent deny and task delegation routing --- src/agents/sisyphus.glm-routing.test.ts | 8 ++++---- src/plugin-handlers/tool-config-handler.test.ts | 7 ++++--- src/tools/call-omo-agent/call-omo-agent-sisyphus.test.ts | 6 +++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/agents/sisyphus.glm-routing.test.ts b/src/agents/sisyphus.glm-routing.test.ts index b7f1c7ad4c4..e8371ec7122 100644 --- a/src/agents/sisyphus.glm-routing.test.ts +++ b/src/agents/sisyphus.glm-routing.test.ts @@ -68,8 +68,8 @@ describe("createSisyphusAgent - GLM routing", () => { expect(prompt).toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); expect(prompt).toContain("NEVER implement directly"); - expect(prompt).toContain("DIRECT HEPHAESTUS DELEGATION - YOUR IMPLEMENTATION PATH"); - expect(prompt).toContain('call_omo_agent(subagent_type="hephaestus"'); + expect(prompt).toContain("DEEP DELEGATION - YOUR IMPLEMENTATION PATH"); + expect(prompt).toContain('task(category="deep"'); expect(prompt).toContain("delegate to Hephaestus"); }); @@ -77,12 +77,12 @@ describe("createSisyphusAgent - GLM routing", () => { const prompt = getPrompt("zai/glm-5.1"); expect(prompt).toContain("DECOMPOSE AND DELEGATE - YOU ARE NOT AN IMPLEMENTER"); - expect(prompt).toContain("DIRECT HEPHAESTUS DELEGATION"); + expect(prompt).toContain("DEEP DELEGATION"); }); test("#given GLM harness model #when building prompt #then places delegation before verification and style sections", () => { const prompt = getPrompt("zai/glm-5.1"); - const delegationIndex = prompt.indexOf("DIRECT HEPHAESTUS DELEGATION"); + const delegationIndex = prompt.indexOf("DEEP DELEGATION"); const verificationIndex = 
prompt.indexOf(""); const styleIndex = prompt.indexOf("