Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
3dfbc09
feat(agents): add GLM-5 harness routing and context optimizations
islee23520 Apr 30, 2026
dd4e73d
feat(agents): GLM-5 thinking optimization and overlay refactor
islee23520 Apr 30, 2026
b30a1b5
style(agents): remove redundant JSDoc and inline comments
islee23520 Apr 30, 2026
f68ff5e
style(tests): remove redundant test comments
islee23520 Apr 30, 2026
7f8cb96
style(tests): remove section dividers and redundant casts in benchmark
islee23520 Apr 30, 2026
ab49716
fix(benchmark): correct factoryTestResults property name and remove s…
islee23520 Apr 30, 2026
24df263
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 Apr 30, 2026
9b4b655
Merge remote-tracking branch 'upstream/dev' into tune/glm-performance
islee23520 May 1, 2026
46cf91e
feat(sisyphus): add GLM-5.x dedicated prompt builder and SJ speed ove…
islee23520 May 1, 2026
c2538a9
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 1, 2026
eba0f4c
refactor(agents): strengthen GLM vision constraints and remove AI slop
islee23520 May 1, 2026
647cc6a
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 3, 2026
1a0d800
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 3, 2026
554a4fa
fix(sj-glm): remove stale assertion for full working memory path in j…
islee23520 May 3, 2026
1cf61a5
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 5, 2026
44426b1
perf(sisyphus): add direct Hephaestus delegation for GLM routing
islee23520 May 6, 2026
1116027
Merge branch 'dev' into tune/glm-performance
islee23520 May 6, 2026
0725c42
fix(momus): restore isGlmThinkingModel branch to skip budgetTokens fo…
islee23520 May 6, 2026
485fb08
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 6, 2026
d08c131
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 6, 2026
a87b865
fix(security): revert call_omo_agent permission to deny for Sisyphus
islee23520 May 6, 2026
f34fa5a
fix(ralph-loop): add runtime error retry cap to prevent unbounded ret…
islee23520 May 6, 2026
b6ea936
refactor(types): replace GLM model regex with explicit allowlist set
islee23520 May 6, 2026
b93bf4b
refactor(glm-prompt): extract section builder functions from main pro…
islee23520 May 6, 2026
d6c5bdb
chore(cli): remove benchmark-only exports from public CLI surface
islee23520 May 6, 2026
7d32830
fix(glm-prompt): replace call_omo_agent references with task(category…
islee23520 May 6, 2026
06f01c8
fix(tests): update tests to match call_omo_agent deny and task delega…
islee23520 May 7, 2026
95b0c07
docs(agents): restore maintainer JSDoc comments removed in b30a1b515
islee23520 May 7, 2026
1f740ca
Merge branch 'code-yeongyu:dev' into tune/glm-performance
islee23520 May 7, 2026
19ebdf9
docs(agents): restore remaining JSDoc and inline comments in metis, S…
islee23520 May 7, 2026
c2fea1c
revert: undo JSDoc restoration - comments are not in scope for this PR
islee23520 May 7, 2026
3eb876a
chore: remove over-testing for call_omo_agent deny revert
islee23520 May 7, 2026
56a7f05
refactor(sj-glm): remove redundant speed optimization overlay from SJ…
islee23520 May 7, 2026
2f21218
chore: remove benchmark-only metric collector from production source
islee23520 May 7, 2026
5d38aa4
chore: remove broken benchmark script (imports deleted src files)
islee23520 May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions scripts/benchmark-glm-thinking.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/env bun

// Model identifier used when no `--model` argument is given on the command line.
const DEFAULT_MODEL = "z-ai/glm-5.1"
// Number of benchmark iterations used when no `--iterations` argument is given.
const DEFAULT_ITERATIONS = 3
// Prompt handed to every benchmark call: asks for a time-complexity analysis and an
// optimization of the nested-loop `findPairs` sample embedded in the text below.
const BENCHMARK_PROMPT = `Analyze this function and explain its time complexity, then suggest an optimization:

function findPairs(arr: number[], target: number): [number, number][] {
const pairs: [number, number][] = []
for (let i = 0; i < arr.length; i++) {
for (let j = i + 1; j < arr.length; j++) {
if (arr[i] + arr[j] === target) {
pairs.push([arr[i], arr[j]])
}
}
}
return pairs
}`

/**
 * Outcome of one benchmark call for a single model/thinking configuration.
 *
 * Nullable fields are `null` when the corresponding measurement is not available.
 */
interface BenchmarkResult {
  /** Identifier of the model the call was made for. */
  model: string
  /** Whether thinking was requested for this call. */
  thinkingEnabled: boolean
  /** Iteration number of this call; assigned by the caller. */
  iteration: number
  /** Time to first token in milliseconds, if measured. */
  timeToFirstTokenMs: number | null
  /** Total duration of the call in milliseconds. */
  totalTimeMs: number
  /** Number of thinking tokens, if reported. */
  thinkingTokens: number | null
  /** Number of response tokens, if reported. */
  responseTokens: number | null
  /** Error description, or `null` when the call has no error to report. */
  error: string | null
}

/**
 * Complete report for one benchmark run; this is the object serialized to stdout as JSON.
 */
interface BenchmarkSummary {
  /** Identifier of the benchmarked model. */
  model: string
  /** ISO-8601 timestamp taken when the summary was assembled. */
  timestamp: string
  /** Git branch the run was executed on. */
  gitBranch: string
  /** Abbreviated git commit hash the run was executed on. */
  gitCommit: string

  /** Aggregates and raw results for the calls made with thinking enabled. */
  thinkingOn: {
    /** Mean of `totalTimeMs` over `results`. */
    avgTotalTimeMs: number
    /** Mean time to first token in milliseconds, or `null` when not computed. */
    avgTTFTMs: number | null
    /** Mean thinking-token count, or `null` when not computed. */
    avgThinkingTokens: number | null
    /** Per-iteration results. */
    results: BenchmarkResult[]
  }

  /** Aggregates and raw results for the calls made with thinking disabled. */
  thinkingOff: {
    /** Mean of `totalTimeMs` over `results`. */
    avgTotalTimeMs: number
    /** Mean time to first token in milliseconds, or `null` when not computed. */
    avgTTFTMs: number | null
    /** Per-iteration results. */
    results: BenchmarkResult[]
  }

  /** Pass/fail counts from the factory configuration test run. */
  factoryTestResults: {
    totalTests: number
    passed: number
    failed: number
  }
}

/**
 * Parses the command-line flags understood by this script from `process.argv`.
 *
 * Recognized flags:
 * - `--model <id>`: model identifier; defaults to `DEFAULT_MODEL`.
 * - `--iterations <n>`: iteration count; defaults to `DEFAULT_ITERATIONS`.
 *
 * Unknown arguments and flags that are not followed by a value are ignored.
 * An `--iterations` value that does not parse to a non-negative integer is rejected
 * with a warning on stderr and the previous setting is kept, so a typo cannot
 * silently turn the benchmark into a zero-iteration (NaN) run.
 *
 * @returns The effective model identifier and iteration count.
 */
function parseArgs(): { model: string; iterations: number } {
  const args = process.argv.slice(2)
  let model = DEFAULT_MODEL
  let iterations = DEFAULT_ITERATIONS

  for (let i = 0; i < args.length; i++) {
    const flag = args[i]
    const value = args[i + 1]

    // A flag is only honoured (and its value consumed) when a non-empty value follows it.
    if (!value) continue

    if (flag === "--model") {
      model = value
      i++
    } else if (flag === "--iterations") {
      const parsed = Number.parseInt(value, 10)
      // parseInt returns NaN for non-numeric input; negative counts are meaningless here.
      if (Number.isInteger(parsed) && parsed >= 0) {
        iterations = parsed
      } else {
        console.error(`Ignoring invalid --iterations value "${value}"; keeping ${iterations}`)
      }
      i++
    }
  }

  return { model, iterations }
}

/**
 * Computes the arithmetic mean of `values`.
 *
 * @param values Numbers to average; the array is not modified.
 * @returns The mean, or 0 for an empty array (rather than the NaN that 0 / 0 would produce).
 */
function average(values: readonly number[]): number {
  if (values.length === 0) return 0
  const total = values.reduce((sum, value) => sum + value, 0)
  return total / values.length
}

/**
 * Reads the current git branch name and abbreviated commit hash via `git rev-parse`.
 *
 * @returns The trimmed command outputs, or "unknown" for both fields if either command throws.
 */
async function getGitInfo(): Promise<{ branch: string; commit: string }> {
  const childProcess = await import("child_process")
  // Runs one command synchronously and returns its trimmed UTF-8 output.
  const capture = (command: string): string =>
    childProcess.execSync(command, { encoding: "utf-8" }).trim()

  try {
    return {
      branch: capture("git rev-parse --abbrev-ref HEAD"),
      commit: capture("git rev-parse --short HEAD"),
    }
  } catch {
    return { branch: "unknown", commit: "unknown" }
  }
}

/**
 * Extracts the pass/fail counts from test-runner output.
 *
 * The two counts are matched independently because the runner prints them on
 * separate summary lines, which a single `.`-based pattern cannot span.
 *
 * @returns The counts, or `null` when either count is missing from `output`.
 */
function parseFactoryTestCounts(
  output: string
): { totalTests: number; passed: number; failed: number } | null {
  const passMatch = /\b(\d+) pass\b/.exec(output)
  const failMatch = /\b(\d+) fail\b/.exec(output)
  if (!passMatch || !failMatch) return null

  const passed = parseInt(passMatch[1], 10)
  const failed = parseInt(failMatch[1], 10)
  return { totalTests: passed + failed, passed, failed }
}

/**
 * Runs the factory configuration tests with `bun test` and reports how many passed and failed.
 *
 * stderr is merged into stdout (`2>&1`) so the runner's summary is captured either way.
 *
 * @returns The parsed counts. All zeros when the command succeeded but no summary was
 *   found; `failed: -1` (with zero totals) when the command failed and its output
 *   could not be parsed, i.e. the outcome is unknown.
 */
async function runFactoryBenchmark(): Promise<{ totalTests: number; passed: number; failed: number }> {
  const { execSync } = await import("child_process")
  try {
    const output = execSync(
      "bun test src/agents/glm-thinking-benchmark.test.ts src/agents/types.test.ts 2>&1",
      { encoding: "utf-8" }
    )
    return parseFactoryTestCounts(output) ?? { totalTests: 0, passed: 0, failed: 0 }
  } catch (err: unknown) {
    // execSync throws on a non-zero exit status, which is how a run with failing tests
    // ends; the captured output is attached to the thrown error as `stdout`, so the
    // real counts can still be recovered from it.
    const captured =
      typeof err === "object" && err !== null ? (err as { stdout?: unknown }).stdout : undefined
    const counts = typeof captured === "string" ? parseFactoryTestCounts(captured) : null
    return counts ?? { totalTests: 0, passed: 0, failed: -1 }
  }
}

/**
 * Placeholder for a direct model call: performs no request and resolves to a result
 * with empty measurements and an explanatory error message.
 *
 * @param _model Model identifier echoed back in the result.
 * @param _prompt Prompt text; not used by this placeholder.
 * @param _thinking Thinking flag echoed back in the result.
 * @returns A result with `iteration` 0, `totalTimeMs` 0 and all nullable metrics `null`.
 */
async function callModelDirect(_model: string, _prompt: string, _thinking: boolean): Promise<BenchmarkResult> {
  const unavailable =
    "Direct API calls require OpenCode runtime. Use factory benchmark results for config verification."

  // Property order is kept stable because this object is later serialized to JSON.
  const placeholder: BenchmarkResult = {
    model: _model,
    thinkingEnabled: _thinking,
    iteration: 0,
    timeToFirstTokenMs: null,
    totalTimeMs: 0,
    thinkingTokens: null,
    responseTokens: null,
    error: unavailable,
  }
  return placeholder
}

/**
 * Script entry point.
 *
 * Runs the factory configuration tests, then the per-iteration thinking-on and
 * thinking-off calls, and assembles a `BenchmarkSummary`. Progress and the
 * human-readable summary go to stderr; the JSON summary goes to stdout.
 */
async function main() {
  const { model, iterations } = parseArgs()
  const repo = await getGitInfo()

  console.error("\n=== GLM Thinking Benchmark ===")
  console.error("Model: " + model)
  console.error("Iterations: " + iterations)
  console.error("Branch: " + repo.branch + " (" + repo.commit + ")")
  console.error()

  console.error("Phase 1: Factory config correctness benchmark...")
  const factoryResults = await runFactoryBenchmark()
  const passRatio = `${factoryResults.passed}/${factoryResults.totalTests}`
  console.error(` Factory tests: ${passRatio} passed`)

  console.error("\nPhase 2: Runtime benchmark (requires OpenCode runtime)...")
  const thinkingOnResults: BenchmarkResult[] = []
  const thinkingOffResults: BenchmarkResult[] = []

  // Performs one call and stamps it with its 1-based iteration number.
  const runOnce = async (thinking: boolean, iteration: number): Promise<BenchmarkResult> => {
    const result = await callModelDirect(model, BENCHMARK_PROMPT, thinking)
    result.iteration = iteration
    return result
  }

  // Calls alternate on/off within each iteration and run strictly in sequence.
  for (let round = 0; round < iterations; round += 1) {
    thinkingOnResults.push(await runOnce(true, round + 1))
    thinkingOffResults.push(await runOnce(false, round + 1))
  }

  // Mean total time of a result list; 0 when the list is empty.
  const meanTotalTime = (results: BenchmarkResult[]): number => {
    if (results.length > 0) {
      return average(results.map((entry) => entry.totalTimeMs))
    }
    return 0
  }

  const summary: BenchmarkSummary = {
    model,
    timestamp: new Date().toISOString(),
    gitBranch: repo.branch,
    gitCommit: repo.commit,
    thinkingOn: {
      avgTotalTimeMs: meanTotalTime(thinkingOnResults),
      avgTTFTMs: null,
      avgThinkingTokens: null,
      results: thinkingOnResults,
    },
    thinkingOff: {
      avgTotalTimeMs: meanTotalTime(thinkingOffResults),
      avgTTFTMs: null,
      results: thinkingOffResults,
    },
    factoryTestResults: factoryResults,
  }

  console.log(JSON.stringify(summary, null, 2))

  console.error("\n=== Summary ===")
  console.error(`Factory benchmark: ${passRatio} tests passed`)
  const verifiedConfigurations = [
    " - GLM-5+ text models: thinking enabled, NO budgetTokens",
    " - Claude models: thinking enabled with budgetTokens",
    " - GPT models: reasoningEffort, no thinking",
    " - GLM VLM models: default path (budgetTokens)",
  ]
  for (const line of verifiedConfigurations) {
    console.error(line)
  }
  console.error()
  console.error("Runtime benchmark: skipped (requires OpenCode runtime)")
  console.error(" To run runtime benchmark manually:")
  console.error(" opencode --model z-ai/glm-5.1 --prompt 'Explain time complexity of this function: ...'")
  console.error()
  console.error("Full results written to stdout (JSON)")
}

// Report any failure and mark the process as failed so shells and CI see a non-zero
// exit status; logging alone would leave the exit status at 0.
main().catch((err: unknown) => {
  console.error(err)
  process.exitCode = 1
})
Loading
Loading