Skip to content

Commit 1917c82

Browse files
authored
Merge pull request #42 from Digital-Threads/feat/complete-token-stats
feat: complete reports tokens spent + saved (0.24.0)
2 parents 2d7171e + 71560a1 commit 1917c82

12 files changed

Lines changed: 330 additions & 44 deletions

File tree

CHANGELOG.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10-
## [0.23.0] - 2026-06-13
10+
## [0.24.0] - 2026-06-13
11+
12+
### Added
13+
- **`complete` reports tokens spent and saved.** Each finalize now prints what
14+
it cost and what it compresses: `complete tj-x: … | spent 1.5k tok ($0.0012) ·
15+
saved ~88.0k→1.5k tok (59×)`. **Spent** is exact, pulled from the backend's own
16+
usage report (the `claude -p` JSON envelope's `usage`/`total_cost_usd`,
17+
Anthropic/OpenAI `usage`), summed across the judge call and any `--enrich`
18+
calls. **Saved** is an estimate of memory compression — the raw transcript
19+
size of the task's sessions vs its compact pack (≈ chars/4). A batch run ends
20+
with a `Totals across N task(s):` line. Backends expose usage via a new
21+
`LlmBackend::complete_usage` method (default: no usage), so custom backends
22+
keep working unchanged.
1123

1224
Finalize, retuned after running `complete` on real 12-session tasks: the fast,
1325
reliable judge-only path is now the default, and the slow session-enrich pass is

Cargo.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ members = [
77
]
88

99
[workspace.package]
10-
version = "0.23.0"
10+
version = "0.24.0"
1111
edition = "2021"
1212
rust-version = "1.88"
1313
license = "MIT"

crates/tj-cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ default = ["embed"]
2323
embed = ["tj-core/embed"]
2424

2525
[dependencies]
26-
tj-core = { package = "task-journal-core", version = "0.23.0", path = "../tj-core", default-features = false }
26+
tj-core = { package = "task-journal-core", version = "0.24.0", path = "../tj-core", default-features = false }
2727
anyhow = { workspace = true }
2828
clap = { workspace = true }
2929
tracing = { workspace = true }

crates/tj-cli/src/main.rs

Lines changed: 154 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4108,6 +4108,89 @@ struct FinalizeOutcome {
41084108
reason: String,
41094109
/// True when no LLM backend was available — nothing was judged or written.
41104110
skipped_no_backend: bool,
4111+
/// Exact token usage spent on this task (judge + any enrich calls).
4112+
spent: tj_core::llm::LlmUsage,
4113+
/// Estimated memory compression: raw session tokens → compact pack tokens.
4114+
saved: Option<Savings>,
4115+
}
4116+
4117+
/// Estimated memory compression achieved by finalizing a task.
///
/// Both figures are rough token estimates (≈ chars / 4), not exact counts.
/// The default value is zero for both, which makes it usable as a running
/// total when summing across several tasks.
#[derive(Clone, Copy, Default)]
struct Savings {
    /// Estimated tokens in the raw session transcripts the task touched.
    raw_tokens: u64,
    /// Estimated tokens in the task's compact pack.
    pack_tokens: u64,
}
4123+
4124+
/// Approximate a token count from a character count.
///
/// Uses the rough "4 characters per token" rule, rounding up so any non-empty
/// input counts as at least one token. This is only meant as an
/// order-of-magnitude signal for how much memory a pack compresses.
fn est_tokens(chars: usize) -> u64 {
    let total = chars as u64;
    let whole_groups = total / 4;
    // A partial trailing group of 1–3 chars still costs one token.
    let has_remainder = total % 4 != 0;
    whole_groups + u64::from(has_remainder)
}
4129+
4130+
/// Estimate how much raw session material a task's compact pack stands in for:
4131+
/// the summed transcript size of the sessions it touched vs the pack size.
4132+
/// `None` when sessions aren't reachable (no project dir).
4133+
fn compute_savings(
4134+
conn: &rusqlite::Connection,
4135+
events_path: &std::path::Path,
4136+
project_dir: Option<&std::path::Path>,
4137+
task_id: &str,
4138+
) -> Option<Savings> {
4139+
let dir = project_dir?;
4140+
let sessions = task_sessions(events_path, dir, task_id).ok()?;
4141+
if sessions.is_empty() {
4142+
return None;
4143+
}
4144+
let raw_chars: usize = sessions.iter().map(|(_, inp)| inp.transcript.len()).sum();
4145+
let pack = tj_core::pack::assemble(conn, task_id, tj_core::pack::PackMode::Compact).ok()?;
4146+
Some(Savings {
4147+
raw_tokens: est_tokens(raw_chars),
4148+
pack_tokens: est_tokens(pack.text.len()),
4149+
})
4150+
}
4151+
4152+
/// Format a token count compactly for the finalize stats line.
///
/// * below 1 000: the plain number (`980` → `"980"`)
/// * 1 000 up to 100 000: thousands with one decimal (`3_240` → `"3.2k"`,
///   `88_000` → `"88.0k"`)
/// * 100 000 and above: whole thousands, truncated (`204_000` → `"204k"`)
///
/// Counts just under 100 000 whose one-decimal rounding carries over (for
/// example `99_999`) are rendered as `"100k"` rather than `"100.0k"`, so the
/// output matches the whole-thousands tier the value rounds into.
fn fmt_tokens(n: u64) -> String {
    if n < 1_000 {
        return n.to_string();
    }
    if n >= 100_000 {
        return format!("{}k", n / 1_000);
    }

    let short = format!("{:.1}k", n as f64 / 1_000.0);
    // Rounding to one decimal can push 99.95+ up to "100.0"; drop the decimal
    // there so the result is identical to what 100_000 itself would print.
    if short == "100.0k" {
        "100k".to_string()
    } else {
        short
    }
}
4162+
4163+
/// Human spent/saved suffix for a finalize line, e.g.
4164+
/// " | spent 3.2k tok ($0.0012) · saved ~88k→1.5k tok (59×)".
4165+
fn stats_suffix(spent: &tj_core::llm::LlmUsage, saved: &Option<Savings>) -> String {
4166+
let mut parts = Vec::new();
4167+
if spent.total_tokens() > 0 {
4168+
let cost = match spent.cost_usd {
4169+
Some(c) if c > 0.0 => format!(" (${c:.4})"),
4170+
_ => String::new(),
4171+
};
4172+
parts.push(format!(
4173+
"spent {} tok{}",
4174+
fmt_tokens(spent.total_tokens()),
4175+
cost
4176+
));
4177+
}
4178+
if let Some(s) = saved {
4179+
if s.pack_tokens > 0 && s.raw_tokens > s.pack_tokens {
4180+
let factor = s.raw_tokens as f64 / s.pack_tokens as f64;
4181+
parts.push(format!(
4182+
"saved ~{}→{} tok ({:.0}×)",
4183+
fmt_tokens(s.raw_tokens),
4184+
fmt_tokens(s.pack_tokens),
4185+
factor
4186+
));
4187+
}
4188+
}
4189+
if parts.is_empty() {
4190+
String::new()
4191+
} else {
4192+
format!(" | {}", parts.join(" · "))
4193+
}
41114194
}
41124195

41134196
/// Per-project handles threaded through the finalize helpers.
@@ -4149,10 +4232,10 @@ fn enrich_task(
41494232
project_dir: &std::path::Path,
41504233
task_id: &str,
41514234
llm: Box<dyn tj_core::llm::LlmBackend>,
4152-
) -> anyhow::Result<usize> {
4235+
) -> anyhow::Result<(usize, tj_core::llm::LlmUsage)> {
41534236
let sessions = task_sessions(events_path, project_dir, task_id)?;
41544237
if sessions.is_empty() {
4155-
return Ok(0);
4238+
return Ok((0, tj_core::llm::LlmUsage::default()));
41564239
}
41574240
// Enrich is the slow part — one (or more, for big transcripts) `claude -p`
41584241
// call per session. Announce it so a multi-minute run doesn't look hung;
@@ -4170,7 +4253,7 @@ fn enrich_task(
41704253
};
41714254
let report =
41724255
tj_core::dream::run_dream(conn, events_path, &opts, &dream_backend, sessions, &run_id)?;
4173-
Ok(report.events_backfilled)
4256+
Ok((report.events_backfilled, dream_backend.usage()))
41744257
}
41754258

41764259
/// Current title for a task ("" if somehow unset).
@@ -4229,7 +4312,10 @@ fn finalize_one_task(
42294312
if enrich && !dry_run {
42304313
if let Some(dir) = ctx.project_dir {
42314314
if let Some(llm) = tj_core::llm::backend_from_env(backend)? {
4232-
out.enriched = enrich_task(conn, events_path, project_hash, dir, task_id, llm)?;
4315+
let (n, enrich_usage) =
4316+
enrich_task(conn, events_path, project_hash, dir, task_id, llm)?;
4317+
out.enriched = n;
4318+
out.spent.add(enrich_usage);
42334319
tj_core::db::ingest_new_events(conn, events_path, project_hash)?;
42344320
}
42354321
}
@@ -4256,7 +4342,8 @@ fn finalize_one_task(
42564342
out.skipped_no_backend = true;
42574343
return Ok(out);
42584344
};
4259-
let j = tj_core::finalize::judge(&title, &lines, judge_backend.as_ref())?;
4345+
let (j, judge_usage) = tj_core::finalize::judge(&title, &lines, judge_backend.as_ref())?;
4346+
out.spent.add(judge_usage);
42604347
out.done = j.done;
42614348
out.reason = j.reason.clone();
42624349

@@ -4302,6 +4389,9 @@ fn finalize_one_task(
43024389

43034390
writer.flush_durable()?;
43044391
tj_core::db::ingest_new_events(conn, events_path, project_hash)?;
4392+
4393+
// 6. Estimate the memory compression this finalize represents.
4394+
out.saved = compute_savings(conn, events_path, ctx.project_dir, task_id);
43054395
Ok(out)
43064396
}
43074397

@@ -4334,7 +4424,11 @@ PATH; or pick one via --backend / TJ_BACKEND: anthropic, openai, ollama (free, l
43344424
if parts.is_empty() {
43354425
parts.push("no change".to_string());
43364426
}
4337-
println!("complete {task_id}: {}", parts.join("; "));
4427+
println!(
4428+
"complete {task_id}: {}{}",
4429+
parts.join("; "),
4430+
stats_suffix(&out.spent, &out.saved)
4431+
);
43384432
}
43394433

43404434
/// `complete <id>` — finalize a single task.
@@ -4482,18 +4576,35 @@ fn run_complete_batch(
44824576
}
44834577

44844578
let mut left_open: Vec<(String, String)> = Vec::new();
4579+
let mut total_spent = tj_core::llm::LlmUsage::default();
4580+
let mut total_saved = Savings::default();
4581+
let mut done_count = 0usize;
44854582
for (id, _) in &targets {
44864583
let out = finalize_one_task(&ctx, id, enrich, false, backend)?;
44874584
print_finalize_outcome(id, &out);
44884585
if out.skipped_no_backend {
44894586
println!("complete: stopping batch — no LLM backend available.");
44904587
return Ok(());
44914588
}
4589+
total_spent.add(out.spent);
4590+
if let Some(s) = out.saved {
4591+
total_saved.raw_tokens += s.raw_tokens;
4592+
total_saved.pack_tokens += s.pack_tokens;
4593+
}
4594+
done_count += 1;
44924595
if !out.closed {
44934596
left_open.push((id.clone(), out.reason.clone()));
44944597
}
44954598
}
44964599

4600+
let totals = stats_suffix(&total_spent, &Some(total_saved));
4601+
if !totals.is_empty() {
4602+
println!(
4603+
"\nTotals across {done_count} task(s): {}",
4604+
totals.trim_start_matches(" | ")
4605+
);
4606+
}
4607+
44974608
if !left_open.is_empty() {
44984609
println!("\nLeft open ({}):", left_open.len());
44994610
for (id, reason) in &left_open {
@@ -5551,6 +5662,43 @@ mod inline_tests {
55515662
// declared before this module begins.
55525663
use super::*;
55535664

5665+
#[test]
fn fmt_tokens_scales_units() {
    // (token count, expected rendering) — one case per formatting tier, plus
    // a round-thousand value to pin the ".0" decimal in the middle tier.
    let cases: [(u64, &str); 4] = [
        (980, "980"),
        (1_500, "1.5k"),
        (88_000, "88.0k"),
        (204_000, "204k"),
    ];
    for (tokens, rendered) in cases {
        assert_eq!(fmt_tokens(tokens), rendered);
    }
}
5672+
5673+
#[test]
fn stats_suffix_shows_spent_and_saved() {
    // 1200 in + 300 out = 1.5k tokens spent, with a reported cost.
    let usage = tj_core::llm::LlmUsage {
        input_tokens: 1200,
        output_tokens: 300,
        cost_usd: Some(0.0012),
    };
    // 90k raw vs 1.5k packed → a 60× compression factor.
    let compression = Savings {
        raw_tokens: 90_000,
        pack_tokens: 1_500,
    };

    let rendered = stats_suffix(&usage, &Some(compression));

    for needle in ["spent 1.5k tok ($0.0012)", "saved ~90.0k→1.5k tok (60×)"] {
        assert!(rendered.contains(needle), "{rendered}");
    }
}
5688+
5689+
#[test]
fn stats_suffix_empty_when_nothing_to_report() {
    // No usage and no savings → nothing is appended at all.
    let nothing = tj_core::llm::LlmUsage::default();
    assert_eq!(stats_suffix(&nothing, &None), "");

    // Tokens without a cost: the count is shown, the dollar figure is omitted.
    let tokens_only = tj_core::llm::LlmUsage {
        input_tokens: 500,
        output_tokens: 0,
        cost_usd: None,
    };
    assert_eq!(stats_suffix(&tokens_only, &None), " | spent 500 tok");
}
5701+
55545702
#[test]
55555703
fn nudge_escalates_only_for_substantial_thin_sessions() {
55565704
// Small session → never escalate, regardless of capture.

crates/tj-cli/tests/cli.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5563,6 +5563,8 @@ fn complete_retitles_and_closes_via_fake_backend() {
55635563
// whose `result` field is the finalize JSON string.
55645564
let envelope = serde_json::json!({
55655565
"is_error": false,
5566+
"usage": {"input_tokens": 1200, "output_tokens": 300},
5567+
"total_cost_usd": 0.0012,
55665568
"result": serde_json::json!({
55675569
"retitle": true,
55685570
"title": "Voucher refund: paid 100% but got 50%",
@@ -5619,6 +5621,7 @@ fn complete_retitles_and_closes_via_fake_backend() {
56195621
.args(["complete", &task_id])
56205622
.assert()
56215623
.success()
5624+
.stdout(contains("spent 1.5k tok ($0.0012)"))
56225625
.stdout(contains("retitled"))
56235626
.stdout(contains("closed"));
56245627

crates/tj-core/src/classifier/agent_sdk.rs

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,9 +236,9 @@ impl ClaudeCliClassifier {
236236
}
237237
}
238238

239-
/// The JSON wrapper emitted by `claude --output-format json`. We only need the
240-
/// error flag and the `result` string (the model's verdict text); the rest of
241-
/// the envelope (usage, cost, timings) is ignored.
239+
/// The JSON wrapper emitted by `claude --output-format json`. We read the error
240+
/// flag, the `result` string (the model's verdict text), and the usage/cost so
241+
/// callers can report what a call actually consumed.
242242
#[derive(serde::Deserialize)]
243243
struct CliEnvelope {
244244
#[serde(default)]
@@ -247,6 +247,22 @@ struct CliEnvelope {
247247
result: Option<String>,
248248
#[serde(default)]
249249
subtype: Option<String>,
250+
#[serde(default)]
251+
usage: Option<EnvelopeUsage>,
252+
#[serde(default)]
253+
total_cost_usd: Option<f64>,
254+
}
255+
256+
#[derive(serde::Deserialize, Default)]
257+
struct EnvelopeUsage {
258+
#[serde(default)]
259+
input_tokens: u64,
260+
#[serde(default)]
261+
output_tokens: u64,
262+
#[serde(default)]
263+
cache_creation_input_tokens: u64,
264+
#[serde(default)]
265+
cache_read_input_tokens: u64,
250266
}
251267

252268
impl Classifier for ClaudeCliClassifier {
@@ -266,6 +282,16 @@ pub fn run_claude_json(
266282
model: &str,
267283
prompt: &str,
268284
) -> anyhow::Result<String> {
285+
run_claude_json_usage(runner, model, prompt).map(|(text, _)| text)
286+
}
287+
288+
/// Like [`run_claude_json`] but also returns the envelope's reported token
289+
/// usage and cost (zeros when the envelope omits them).
290+
pub fn run_claude_json_usage(
291+
runner: &dyn CommandRunner,
292+
model: &str,
293+
prompt: &str,
294+
) -> anyhow::Result<(String, crate::llm::LlmUsage)> {
269295
let stdout = runner.run(model, prompt)?;
270296
let envelope: CliEnvelope = serde_json::from_str(stdout.trim()).with_context(|| {
271297
format!(
@@ -279,9 +305,17 @@ pub fn run_claude_json(
279305
envelope.subtype.as_deref().unwrap_or("unknown")
280306
));
281307
}
282-
envelope
308+
let u = envelope.usage.unwrap_or_default();
309+
let usage = crate::llm::LlmUsage {
310+
// Count cache reads/writes as input so the total reflects real context.
311+
input_tokens: u.input_tokens + u.cache_creation_input_tokens + u.cache_read_input_tokens,
312+
output_tokens: u.output_tokens,
313+
cost_usd: envelope.total_cost_usd,
314+
};
315+
let result = envelope
283316
.result
284-
.ok_or_else(|| anyhow!("claude json wrapper had no `result` field"))
317+
.ok_or_else(|| anyhow!("claude json wrapper had no `result` field"))?;
318+
Ok((result, usage))
285319
}
286320

287321
/// Probe whether `claude` resolves on PATH and runs. Cheap (`--version` does

0 commit comments

Comments
 (0)