Skip to content

Commit f4d4f33

Browse files
Shahinyanm and claude authored
fix(pack): truncate at UTF-8 char boundary, not raw byte index (#20)
pack truncation sliced the rendered text at text[..budget], panicking ("byte index N is not a char boundary") when the cutoff landed inside a multibyte char — Cyrillic/CJK/emoji journals over the pack budget. ASCII stayed safe, so it was latent. Extract truncate_to_budget() that backs up to a char boundary before slicing; add regression tests. semver patch 0.11.0 -> 0.11.1. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent f7dfe77 commit f4d4f33

6 files changed

Lines changed: 60 additions & 9 deletions

File tree

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.11.1] - 2026-06-08
11+
12+
**Fix: `pack` panicked on multibyte UTF-8.** Pack truncation sliced the
13+
rendered text at a raw byte index, panicking ("byte index N is not a char
14+
boundary") whenever the budget cutoff landed inside a multibyte character —
15+
i.e. on Cyrillic/CJK/emoji-heavy journals that exceed the pack budget.
16+
ASCII-only content was unaffected, so it stayed latent. Truncation now cuts
17+
at a UTF-8 char boundary.
18+
19+
### Fixed
20+
- `tj_core::pack` truncation is now char-boundary-safe (`truncate_to_budget`);
21+
packs with non-ASCII text exceeding the budget no longer panic.
22+
1023
## [0.11.0] - 2026-06-08
1124

1225
**Live `session_id` on emitted events (additive, opt-in).** The journal now

Cargo.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ members = [
77
]
88

99
[workspace.package]
10-
version = "0.11.0"
10+
version = "0.11.1"
1111
edition = "2021"
1212
rust-version = "1.88"
1313
license = "MIT"

crates/tj-cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ name = "task-journal"
1616
path = "src/main.rs"
1717

1818
[dependencies]
19-
tj-core = { package = "task-journal-core", version = "0.11.0", path = "../tj-core" }
19+
tj-core = { package = "task-journal-core", version = "0.11.1", path = "../tj-core" }
2020
anyhow = { workspace = true }
2121
clap = { workspace = true }
2222
tracing = { workspace = true }

crates/tj-core/src/pack.rs

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,23 @@ fn render_lifecycle(conn: &Connection, task_id: &str) -> anyhow::Result<String>
164164
Ok(out)
165165
}
166166

167+
/// Shorten `text` in place and append `marker` when it is longer than `budget`
/// bytes; leave it untouched otherwise.
///
/// The kept prefix never exceeds `budget` bytes. The cut point is first moved
/// back to the nearest UTF-8 char boundary at or below `budget` (slicing
/// mid-char, e.g. inside a Cyrillic/CJK/emoji code point, would panic), and
/// then to the last `'\n'` inside that prefix when one exists, so the output
/// ends on a whole line. `marker` is appended after the cut, so the final
/// length can be up to `budget + marker.len()` bytes.
fn truncate_to_budget(text: &mut String, budget: usize, marker: &str) {
    if text.len() > budget {
        // Walk down from `budget` to the first index that starts a char.
        // Offset 0 is always a boundary, so the fallback is never taken.
        let boundary = (0..=budget)
            .rev()
            .find(|&idx| text.is_char_boundary(idx))
            .unwrap_or(0);
        // Prefer ending on a complete line; the newline itself is dropped.
        let keep = match text[..boundary].rfind('\n') {
            Some(newline_at) => newline_at,
            None => boundary,
        };
        text.truncate(keep);
        text.push_str(marker);
    }
}
183+
167184
pub fn assemble(conn: &Connection, task_id: &str, mode: PackMode) -> anyhow::Result<TaskPack> {
168185
let mode_str = match mode {
169186
PackMode::Compact => "compact",
@@ -334,9 +351,7 @@ pub fn assemble(conn: &Connection, task_id: &str, mode: PackMode) -> anyhow::Res
334351
};
335352
let truncated = text.len() > budget;
336353
if truncated {
337-
let cutoff = text[..budget].rfind('\n').unwrap_or(budget);
338-
text.truncate(cutoff);
339-
text.push_str(TRUNC_MARKER);
354+
truncate_to_budget(&mut text, budget, TRUNC_MARKER);
340355
}
341356

342357
let generated_at = chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
@@ -585,6 +600,29 @@ mod tests {
585600
assert!(pack.text.contains("truncated to fit pack budget"));
586601
}
587602

603+
#[test]
604+
fn truncate_to_budget_handles_multibyte_boundary() {
605+
// 1 ASCII byte shifts every 'я' (2 bytes) start to an ODD offset, so an
606+
// EVEN budget lands INSIDE a char — a raw text[..budget] slice would panic.
607+
let marker = "\n[cut]";
608+
let mut s = String::from("x");
609+
s.push_str(&"я".repeat(2000)); // total = 1 + 4000 = 4001 bytes
610+
let budget = 100usize; // even → mid-char given the odd char starts
611+
assert!(!s.is_char_boundary(budget), "precondition: budget must be mid-char");
612+
truncate_to_budget(&mut s, budget, marker); // must NOT panic
613+
assert!(s.ends_with(marker));
614+
assert!(s.len() <= budget + marker.len());
615+
assert!(std::str::from_utf8(s.as_bytes()).is_ok(), "result must be valid UTF-8");
616+
}
617+
618+
#[test]
619+
fn truncate_to_budget_noop_under_budget() {
620+
let mut s = String::from("маленький текст");
621+
let before = s.clone();
622+
truncate_to_budget(&mut s, 10_000, "\n[cut]");
623+
assert_eq!(s, before);
624+
}
625+
588626
#[test]
589627
fn corrected_events_appear_with_correction_event_type() {
590628
use crate::db;

crates/tj-mcp/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ name = "task-journal-mcp"
1616
path = "src/main.rs"
1717

1818
[dependencies]
19-
tj-core = { package = "task-journal-core", version = "0.11.0", path = "../tj-core" }
19+
tj-core = { package = "task-journal-core", version = "0.11.1", path = "../tj-core" }
2020
anyhow = { workspace = true }
2121
tokio = { workspace = true }
2222
tracing = { workspace = true }

0 commit comments

Comments
 (0)