From e0ebf72996c1361e293fcad8cf967e0ed8cb68b7 Mon Sep 17 00:00:00 2001
From: Peter Johnson <738069+missinglink@users.noreply.github.com>
Date: Mon, 11 May 2026 13:45:51 +0200
Subject: [PATCH] feat(unicode): ensure 140 char text limit is unicode aware

---
 helper/unicode.js                | 38 ++++++++++++++++++++++++++++++++++++++
 package.json                     |  2 +-
 sanitizer/_text.js               |  6 ++++--
 sanitizer/_text_pelias_parser.js |  5 +++--
 test/unit/sanitizer/_text.js     | 22 ++++++++++++++++++++++
 5 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/helper/unicode.js b/helper/unicode.js
index b8d8cea6d..cc9c3cc8d 100644
--- a/helper/unicode.js
+++ b/helper/unicode.js
@@ -1,6 +1,10 @@
 const _ = require('lodash');
 const regenerate = require('regenerate');
 
+// shared grapheme segmenter (UAX-29)
+// note: locale 'en' only affects word/sentence granularity; grapheme rules are script-agnostic.
+const graphemeSegmenter = new Intl.Segmenter('en', { granularity: 'grapheme' });
+
 // non-printable control characters
 // ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
 const CONTROL_CODES = regenerate()
@@ -96,3 +100,37 @@ function normalize(str) {
 }
 
 module.exports.normalize = normalize;
+
+// unicode aware truncation function
+// counts user-perceived characters (UAX-29 grapheme clusters) via Intl.Segmenter,
+// so surrogate pairs and combining sequences are never split.
+//
+// input: the string to truncate (anything else throws 'invalid string')
+// len:   max graphemes to keep, a non-negative integer (else throws 'invalid length')
+//
+// returns the first `len` graphemes of input when input is longer than that,
+// or null when no truncation is required.
+// note: a grapheme can span several UTF-16 code units, so result.length may exceed len.
+// note: the result is '' when len is 0; compare against null rather than truthiness.
+module.exports.truncate = (input, len) => {
+
+  // sanity checking
+  if (!_.isString(input)) { throw new Error('invalid string'); }
+  if (!Number.isInteger(len) || len < 0) { throw new Error('invalid length'); }
+
+  // short-circuit common case of short strings
+  // every grapheme is at least one UTF-16 code unit, so if input.length <= len
+  // the grapheme count must also be <= len
+  if (input.length <= len) { return null; }
+
+  // segment lazily: stop at the start of grapheme number len+1 instead of
+  // segmenting (and allocating an array for) an arbitrarily long input
+  let count = 0;
+  for (const { index } of graphemeSegmenter.segment(input)) {
+    if (count === len) { return input.slice(0, index); }
+    count++;
+  }
+
+  // input has at most len graphemes, nothing to truncate
+  return null;
+};
diff --git a/package.json b/package.json
index 2cf22df74..314839370 100644
--- a/package.json
+++ b/package.json
@@ -33,7 +33,7 @@
     "url": "https://github.com/pelias/api/issues"
   },
   "engines": {
-    "node": ">=12.0.0"
+    "node": ">=16.0.0"
   },
   "dependencies": {
     "@hapi/joi": "^16.0.1",
diff --git a/sanitizer/_text.js b/sanitizer/_text.js
index 90b977c44..e2c3ef71a 100644
--- a/sanitizer/_text.js
+++ b/sanitizer/_text.js
@@ -21,10 +21,12 @@ function _sanitize( raw, clean ){
   if( !_.isString(text) || _.isEmpty(text) ){
     messages.errors.push(`invalid param 'text': text length, must be >0`);
   } else {
-    if( text.length > MAX_TEXT_LENGTH ){
+    const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
+    if( truncated !== null ){
       messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
-      text = text.substring(0, MAX_TEXT_LENGTH);
+      text = truncated;
     }
+
     clean.text = text;
   }
 
diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index a9e3ff1fc..5483ba42b 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -40,9 +40,10 @@ function _sanitize (raw, clean, req) {
   else {
 
     // truncate text to $MAX_TEXT_LENGTH chars
-    if (text.length > MAX_TEXT_LENGTH) {
+    const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
+    if( truncated !== null ){
       messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
-      text = text.substring(0, MAX_TEXT_LENGTH);
+      text = truncated;
     }
 
     // tokenize text
diff --git a/test/unit/sanitizer/_text.js b/test/unit/sanitizer/_text.js
index 1ce961c0e..4458268d6 100644
--- a/test/unit/sanitizer/_text.js
+++ b/test/unit/sanitizer/_text.js
@@ -165,6 +165,28 @@ it again and again until we reach our destination.` };
     t.deepEquals(messages.warnings, []);
     t.end();
   });
+
+  test('truncate should be unicode aware', (t) => {
+    const grapheme = '\uD842\uDFB7';
+    const raw = { text: grapheme.repeat(200) };
+    const clean = {};
+    const messages = sanitizer.sanitize(raw, clean);
+
+    // sanity: fixture genuinely distinguishes code units from graphemes
+    t.equals(grapheme.length, 2, 'fixture is a surrogate pair (2 code units)');
+    t.equals([...grapheme].length, 1, 'fixture is one code point');
+    t.equals(grapheme.normalize('NFC'), grapheme, 'fixture is NFC-stable');
+
+    // truncated text is 140 graphemes (user-perceived characters)
+    t.equals(clean.text, grapheme.repeat(140), 'truncated correctly');
+
+    // text.length on the truncated result is 280 (140 × 2 code units)
+    t.equals(clean.text.length, 280, 'truncated string is 280 UTF-16 code units');
+
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
+    t.end();
+  });
 };
 
 module.exports.all = (tape, common) => {