From eb28d2ca04276319113de7a097ab60b44a69849c Mon Sep 17 00:00:00 2001 From: Grey Newell Date: Thu, 9 Apr 2026 14:45:03 -0400 Subject: [PATCH] fix(schema): stepName truncates at byte boundary for multi-byte UTF-8 steps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stepName() used len(step)/step[:77] (byte operations) to enforce the 80-char limit on HowToStep names in JSON-LD. For instruction steps containing multi-byte UTF-8 characters (é, ñ, CJK, emoji), step[:77] could land inside a character's byte sequence, producing invalid UTF-8 that json.Marshal silently replaces with \uFFFD (U+FFFD). The sentence-length guard (idx < 80) also compared a byte offset to 80, incorrectly rejecting short multi-byte sentences that were < 80 runes but > 80 bytes. Fixes: - Compute rune slice once: runes := []rune(step) - Sentence guard: compare len([]rune(step[:idx+1])) < 80 (rune count) - Truncation: string(runes[:77]) + "..." (rune slice, valid UTF-8) Adds TestStepName covering ASCII truncation, sentence extraction, multi-byte truncation (valid UTF-8 + correct rune count), and multi-byte sentence guard. Co-Authored-By: Claude Sonnet 4.6 --- internal/archdocs/pssg/schema/jsonld.go | 16 ++++--- internal/archdocs/pssg/schema/jsonld_test.go | 46 ++++++++++++++++++++ 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/internal/archdocs/pssg/schema/jsonld.go b/internal/archdocs/pssg/schema/jsonld.go index a50f139..eb7f76e 100644 --- a/internal/archdocs/pssg/schema/jsonld.go +++ b/internal/archdocs/pssg/schema/jsonld.go @@ -284,15 +284,19 @@ func MarshalSchemas(schemas ...map[string]interface{}) string { // stepName extracts a short name from an instruction step. func stepName(step string) string { - // Take first sentence + runes := []rune(step) + // Take first sentence if it fits within 80 runes. for _, sep := range []string{". 
", ".\n"} { - if idx := strings.Index(step, sep); idx > 0 && idx < 80 { - return step[:idx+1] + if idx := strings.Index(step, sep); idx > 0 { + // idx is a byte offset; compute rune length of the candidate name. + if len([]rune(step[:idx+1])) < 80 { + return step[:idx+1] + } } } - // Truncate if too long - if len(step) > 80 { - return step[:77] + "..." + // Truncate if too long (rune-aware to avoid splitting multi-byte chars). + if len(runes) > 80 { + return string(runes[:77]) + "..." } return step } diff --git a/internal/archdocs/pssg/schema/jsonld_test.go b/internal/archdocs/pssg/schema/jsonld_test.go index 19cf02c..69ede7a 100644 --- a/internal/archdocs/pssg/schema/jsonld_test.go +++ b/internal/archdocs/pssg/schema/jsonld_test.go @@ -2,8 +2,54 @@ package schema import ( "testing" + "unicode/utf8" ) +func TestStepName(t *testing.T) { + // ASCII truncation: step longer than 80 bytes, no sentence break. + long := "" + for i := 0; i < 85; i++ { + long += "a" + } + got := stepName(long) + if len([]rune(got)) != 80 { // 77 + len("...") = 80 + t.Errorf("ASCII truncation: got %d runes, want 80", len([]rune(got))) + } + + // Short sentence extraction. + got = stepName("Mix ingredients. Then bake for 30 minutes.") + if got != "Mix ingredients." { + t.Errorf("short sentence: got %q, want %q", got, "Mix ingredients.") + } + + // Multi-byte truncation: 85 'é' chars (2 bytes each), no period. + // Byte length > 80 but we must truncate at rune boundary. + multiLong := "" + for i := 0; i < 85; i++ { + multiLong += "é" + } + got = stepName(multiLong) + if !utf8.ValidString(got) { + t.Errorf("multi-byte truncation produced invalid UTF-8: %q", got) + } + if len([]rune(got)) != 80 { // 77 runes + "..." + t.Errorf("multi-byte truncation: got %d runes, want 80", len([]rune(got))) + } + + // Multi-byte sentence: 'é' × 79 chars followed by ". rest" + // Sentence rune count = 80 (79 é + 1 period), which is NOT < 80, so falls through. 
+ // Resulting truncation: 93-rune total (79 'é' + 14 ASCII) → truncate to 77+... + multiSentence := "" + for i := 0; i < 79; i++ { + multiSentence += "é" + } + multiSentence += ". rest of step" + got = stepName(multiSentence) + if !utf8.ValidString(got) { + t.Errorf("multi-byte sentence truncation produced invalid UTF-8: %q", got) + } +} + func TestParseDurationMinutes(t *testing.T) { cases := []struct { input string