Skip to content

Commit a83eeea

Browse files
NagyViktNagyViktclaude
authored
feat(gain): colony gain drift + savings_drift_report MCP tool (#575)
* feat(gain): add colony gain drift + savings_drift_report MCP tool - Median tokens-per-call comparison across non-overlapping windows - Classifies up_drift / down_drift / new_tool / gone / insufficient_data / stable - No schema change — reads existing mcp_metrics table Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * chore: add changeset for gain drift detector Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: NagyVikt <nagy.viktordp@gmail.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f241ce8 commit a83eeea

12 files changed

Lines changed: 1107 additions & 1 deletion

File tree

.changeset/gain-drift-detector.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
2+
'colonyq': minor
3+
'@colony/storage': minor
4+
'@colony/core': minor
5+
'@colony/mcp-server': minor
6+
---
7+
8+
`colony gain drift` and a matching `savings_drift_report` MCP tool flag
9+
tools whose median tokens-per-call has drifted up or down. Default windows
10+
are non-overlapping: recent = last 3 days, baseline = 14 days ending 3 days
11+
before the recent window starts. Default thresholds: `--threshold 1.25` (up), `--down-threshold
12+
0.75`, `--min-calls 20` per window. Classifications: `up_drift`,
13+
`down_drift`, `new_tool` (no baseline), `gone` (no recent), `insufficient_data`,
14+
`stable`.
15+
16+
Storage gains `Storage.mcpTokenDriftPerOperation()` which computes per-operation
17+
medians with a `ROW_NUMBER() OVER (PARTITION BY operation ORDER BY tpc)`
18+
window function — chosen over the correlated `LIMIT 1 OFFSET (COUNT-1)/2`
19+
form because SQLite forbids outer aggregate references in scalar-subquery
20+
`OFFSET`. A `mcpMetricsMinTs()` helper surfaces a one-line warning when the
21+
baseline window starts before the first recorded metric.

apps/cli/src/commands/gain.ts

Lines changed: 233 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import {
55
type SavingsLiveComparisonCost,
66
type SavingsReferenceRow,
77
type SavingsReferenceTotals,
8+
classifyDrift,
9+
type DriftReport,
810
savingsLiveComparison,
911
savingsLiveComparisonCost,
1012
savingsReferenceTotals,
@@ -40,6 +42,16 @@ interface GainOptions {
4042
topOps?: string;
4143
}
4244

45+
/**
 * Raw option values for `colony gain drift`. Every numeric option arrives as
 * an unparsed string and is converted by the action handler, which falls back
 * to the documented default when the value is missing or not a positive number.
 */
interface GainDriftOptions {
  /** `--json`: emit structured JSON instead of the plaintext table. */
  json?: boolean;
  /** `--operation <name>`: show only this operation row. */
  operation?: string;
  /** `--recent-days <n>`: recent window length in days (default 3). */
  recentDays?: string;
  /** `--baseline-days <n>`: baseline window length in days (default 14). */
  baselineDays?: string;
  /** `--min-calls <n>`: minimum sample size per window (default 20). */
  minCalls?: string;
  /** `--threshold <ratio>`: up-drift trigger ratio (default 1.25). */
  threshold?: string;
  /** `--down-threshold <ratio>`: down-drift trigger ratio (default 0.75). */
  downThreshold?: string;
}
54+
4355
export interface MoverRow {
4456
operation: string;
4557
recent_calls: number;
@@ -78,7 +90,7 @@ interface TopErrorReason {
7890
}
7991

8092
export function registerGainCommand(program: Command): void {
81-
program
93+
const gain = program
8294
.command('gain')
8395
.description('Show colony token/cost savings from live mcp_metrics receipts')
8496
.option('--json', 'emit structured JSON')
@@ -250,6 +262,91 @@ export function registerGainCommand(program: Command): void {
250262
movers,
251263
);
252264
});
265+
266+
gain
267+
.command('drift')
268+
.description(
269+
'Flag tools whose median tokens-per-call has drifted vs a baseline window (no schema change)',
270+
)
271+
.option('--baseline-days <n>', 'baseline window length in days (default 14)')
272+
.option('--recent-days <n>', 'recent window length in days (default 3)')
273+
.option('--min-calls <n>', 'minimum sample size per window to trust signal (default 20)')
274+
.option('--threshold <ratio>', 'up-drift trigger ratio (default 1.25 = +25%)')
275+
.option('--down-threshold <ratio>', 'down-drift trigger ratio (default 0.75 = -25%)')
276+
.option('--operation <name>', 'show only this operation row in the table')
277+
.option('--json', 'emit structured JSON')
278+
.action(async (opts: GainDriftOptions) => {
279+
const settings = loadSettings();
280+
const baselineDays = parsePositiveFloat(opts.baselineDays) ?? 14;
281+
const recentDays = parsePositiveFloat(opts.recentDays) ?? 3;
282+
const minCalls = parsePositiveInt(opts.minCalls) ?? 20;
283+
const threshold = parsePositiveFloat(opts.threshold) ?? 1.25;
284+
const downThreshold = parsePositiveFloat(opts.downThreshold) ?? 0.75;
285+
286+
const now = Date.now();
287+
const recentSince = now - recentDays * 24 * 60 * 60_000;
288+
// 3-day gap between recent and baseline so day-of-week noise does not
289+
// bleed across — see spec/v0.x roadmap.
290+
const baselineUntil = recentSince - 3 * 24 * 60 * 60_000;
291+
const baselineSince = baselineUntil - baselineDays * 24 * 60 * 60_000;
292+
const recentUntil = now;
293+
294+
const { rawRows, minTs } = await withStorage(
295+
settings,
296+
(storage) => {
297+
const allRows = storage.mcpTokenDriftPerOperation({
298+
baseline_since: baselineSince,
299+
baseline_until: baselineUntil,
300+
recent_since: recentSince,
301+
recent_until: recentUntil,
302+
});
303+
const filtered =
304+
opts.operation !== undefined
305+
? allRows.filter((row) => row.operation === opts.operation)
306+
: allRows;
307+
return { rawRows: filtered, minTs: storage.mcpMetricsMinTs() };
308+
},
309+
{ readonly: true },
310+
);
311+
312+
const report = classifyDrift(rawRows, {
313+
threshold,
314+
down_threshold: downThreshold,
315+
min_calls: minCalls,
316+
});
317+
318+
// Warn when recorded history does not reach back to the start of the
// baseline window. The estimate is the full span the report needs (recent
// window + 3-day gap + baseline window) minus the age of the oldest metric,
// rounded up to whole days.
const baselineWarning = ((): string | null => {
  if (minTs === null || minTs <= baselineSince) return null;
  const msPerDay = 24 * 60 * 60_000;
  const missingDays = Math.ceil(recentDays + baselineDays + 3 - (now - minTs) / msPerDay);
  // Pluralize on the number actually printed, not on the baseline length.
  const plural = missingDays === 1 ? '' : 's';
  return `baseline window starts before first recorded metric — drift detection needs ~${missingDays} more day${plural} of history`;
})();
324+
325+
if (opts.json === true) {
326+
const payload = {
327+
window: {
328+
baseline_since: baselineSince,
329+
baseline_until: baselineUntil,
330+
recent_since: recentSince,
331+
recent_until: recentUntil,
332+
},
333+
threshold: report.threshold,
334+
rows: report.rows,
335+
new_tools: report.new_tools,
336+
gone_tools: report.gone_tools,
337+
insufficient_data: report.insufficient_data,
338+
...(baselineWarning !== null ? { warning: baselineWarning } : {}),
339+
};
340+
process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`);
341+
return;
342+
}
343+
344+
writeDriftReport(report, {
345+
recentDays,
346+
baselineDays,
347+
baselineWarning,
348+
});
349+
});
253350
}
254351

255352
export function writeGainReport(
@@ -1175,6 +1272,12 @@ function parsePositiveInt(raw: string | undefined): number | undefined {
11751272
return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : undefined;
11761273
}
11771274

1275+
/**
 * Parse a CLI option value as a strictly positive, finite number.
 *
 * @param raw - the option text, or `undefined` when the flag was not given
 * @returns the parsed number, or `undefined` when the input is missing,
 *   blank, not numeric, non-finite, zero, or negative
 */
function parsePositiveFloat(raw: string | undefined): number | undefined {
  if (raw === undefined) return undefined;
  if (raw.trim().length === 0) return undefined;

  const value = Number(raw);
  if (!Number.isFinite(value) || value <= 0) return undefined;
  return value;
}
1280+
11781281
// rtk-style proportional bar. The row whose value equals `max` gets a full
11791282
// bar; smaller rows scale linearly. Empty when max <= 0.
11801283
export function renderImpactBar(value: number, max: number, width: number): string {
@@ -1586,3 +1689,132 @@ function colorByEfficiency(pct: number, text: string): string {
15861689
if (pct >= 40) return kleur.yellow().bold(text);
15871690
return kleur.red().bold(text);
15881691
}
1692+
1693+
/** Presentation inputs for `writeDriftReport` that are not part of the report itself. */
export interface DriftReportInput {
  /** Length of the recent window in days; shown in the report title. */
  recentDays: number;
  /** Length of the baseline window in days; shown in the report title. */
  baselineDays: number;
  /** Text printed after a `[warn] ` prefix, or `null` to print no warning line. */
  baselineWarning: string | null;
}
1698+
1699+
/**
 * Render the plaintext `colony gain drift` report to stdout.
 *
 * Output order: bold title, dimmed threshold line, optional `[warn]` line,
 * then either a "no operations" notice or the table of classified rows,
 * followed by one-line summaries for insufficient-data, new, and gone
 * operations.
 *
 * @param report - classified drift rows plus the thresholds that produced them
 * @param input - window lengths for the title and an optional baseline warning
 */
export function writeDriftReport(report: DriftReport, input: DriftReportInput): void {
  const out = process.stdout;

  const title = `colony gain drift (recent ${formatDaysLabel(
    input.recentDays,
  )} vs baseline ${formatDaysLabel(input.baselineDays)})`;
  out.write(`${kleur.bold(title)}\n`);

  const limits = report.threshold;
  out.write(
    kleur.dim(
      `Thresholds: up >= ${limits.up.toFixed(2)}x, down <= ${limits.down.toFixed(2)}x, min ${
        limits.min_calls
      } calls per window.\n`,
    ),
  );

  if (input.baselineWarning !== null) {
    out.write(`${kleur.yellow('[warn] ')}${input.baselineWarning}\n`);
  }

  // Most urgent first: up-drift, then down-drift, then stable. Within each
  // bucket the order of `report.rows` is kept so output stays deterministic.
  const rowsOfClass = (wanted: string) =>
    report.rows.filter((row) => row.classification === wanted);
  const ordered = [
    ...rowsOfClass('up_drift'),
    ...rowsOfClass('down_drift'),
    ...rowsOfClass('stable'),
  ];

  const hasTable = ordered.length > 0;
  const hasNew = report.new_tools.length > 0;
  const hasGone = report.gone_tools.length > 0;

  if (!hasTable && !hasNew && !hasGone) {
    out.write(kleur.dim('No operations had enough samples in both windows.\n'));
  } else if (hasTable) {
    const widths = [24, 13, 11, 8, 7, 7, 18];
    const header = padRow(
      ['Operation', 'Baseline med', 'Recent med', 'Ratio', 'n_base', 'n_rec', 'Class'],
      widths,
    );
    out.write(`${kleur.dim(header)}\n`);
    for (const row of ordered) {
      const line = padRow(
        [
          truncate(row.operation, widths[0] ?? 24),
          formatTokens(row.baseline_median ?? 0),
          formatTokens(row.recent_median ?? 0),
          formatDriftRatio(row.ratio, row.classification),
          formatInt(row.baseline_n),
          formatInt(row.recent_n),
          formatDriftClass(row.classification),
        ],
        widths,
      );
      out.write(`${line}\n`);
    }
  }

  // The summaries below apply to both paths; on the "no operations" path the
  // new/gone lists are empty, so only the insufficient-data line can print.
  if (report.insufficient_data.length > 0) {
    writeDriftInsufficient(report);
  }
  if (hasNew) {
    out.write(`${kleur.dim('New tools (no baseline):')} ${report.new_tools.join(', ')}\n`);
  }
  if (hasGone) {
    out.write(`${kleur.dim('Gone tools (no recent calls):')} ${report.gone_tools.join(', ')}\n`);
  }
}
1778+
1779+
/**
 * Print a single line naming the operations that lacked enough samples.
 * At most 12 names are listed; any remainder is summarised as `+N more`.
 *
 * @param report - drift report whose `insufficient_data` rows are listed
 */
function writeDriftInsufficient(report: DriftReport): void {
  const skipped = report.insufficient_data;
  const listed = skipped
    .slice(0, 12)
    .map((row) => row.operation)
    .join(', ');
  const overflow = skipped.length > 12 ? `, +${skipped.length - 12} more` : '';
  const label = kleur.dim(`Insufficient data (n<${report.threshold.min_calls}):`);
  process.stdout.write(`${label} ${listed}${overflow}\n`);
}
1791+
1792+
/**
 * Format a recent/baseline ratio for the drift table.
 *
 * @param ratio - the ratio to show, or `null` when none is available
 * @param classification - the row's class; drifting rows get an arrow and colour
 * @returns `-` for a null ratio, otherwise the ratio with one decimal when it
 *   is 10 or more and two decimals below that, suffixed with `x`
 */
function formatDriftRatio(
  ratio: number | null,
  classification: DriftReport['rows'][number]['classification'],
): string {
  if (ratio === null) return '-';

  const decimals = ratio >= 10 ? 1 : 2;
  const label = `${ratio.toFixed(decimals)}x`;
  switch (classification) {
    case 'up_drift':
      return kleur.red(`▲${label}`);
    case 'down_drift':
      return kleur.green(`▼${label}`);
    default:
      return label;
  }
}
1799+
1800+
/**
 * Colour a drift classification for the table's `Class` column.
 * Every class prints under its own name except `insufficient_data`, which is
 * shortened to `insufficient`.
 *
 * @param classification - the row's drift class
 * @returns the coloured label
 */
function formatDriftClass(classification: DriftReport['rows'][number]['classification']): string {
  switch (classification) {
    case 'new_tool':
      return kleur.cyan(classification);
    case 'up_drift':
      return kleur.red(classification);
    case 'down_drift':
      return kleur.green(classification);
    case 'insufficient_data':
      return kleur.dim('insufficient');
    case 'stable':
    case 'gone':
      return kleur.dim(classification);
  }
}
1816+
1817+
/**
 * Format a window length in days for the report title.
 *
 * @param days - window length in days; may be fractional
 * @returns the value followed by `d`, with whole numbers printed as-is and
 *   anything else rounded to one decimal
 */
function formatDaysLabel(days: number): string {
  const amount = Number.isInteger(days) ? String(days) : days.toFixed(1);
  return `${amount}d`;
}

0 commit comments

Comments
 (0)