Skip to content

Commit e0ebf72

Browse files
committed
feat(unicode): ensure 140 char text limit is unicode aware
1 parent cb66071 commit e0ebf72

5 files changed

Lines changed: 56 additions & 5 deletions

File tree

helper/unicode.js

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
const _ = require('lodash');
22
const regenerate = require('regenerate');
33

4+
// shared grapheme segmenter (UAX-29)
5+
// note: locale 'en' only affects word/sentence granularity; grapheme rules are script-agnostic.
6+
const graphemeSegmenter = new Intl.Segmenter('en', { granularity: 'grapheme' });
7+
const toGraphemes = (str) => Array.from(graphemeSegmenter.segment(str), ({ segment }) => segment);
8+
49
// non-printable control characters
510
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
611
const CONTROL_CODES = regenerate()
@@ -96,3 +101,23 @@ function normalize(str) {
96101
}
97102

98103
module.exports.normalize = normalize;
104+
105+
// unicode aware truncation function
// counts user-perceived characters (UAX-29 grapheme clusters) via Intl.Segmenter.
//
// @param {string} input - text to truncate
// @param {number} len - maximum number of graphemes to keep (expected to be a non-negative integer)
// @returns {string|null} the first `len` graphemes of input when input is longer than that,
//   or null when no truncation was required
// @throws {Error} 'invalid string' when input is not a string
module.exports.truncate = (input, len) => {

  // sanity checking
  if (!_.isString(input)) { throw new Error('invalid string'); }

  // short-circuit common case of short strings
  // every grapheme spans at least one UTF-16 code unit, so unless input.length > len
  // the grapheme count cannot exceed len either
  if (!(input.length > len)) { return null; }

  // walk the graphemes lazily instead of materialising all of them: the input is
  // caller-supplied and may be arbitrarily long, but only the first len + 1 graphemes matter
  let seen = 0;
  for (const { index } of graphemeSegmenter.segment(input)) {
    // reaching grapheme number len + 1 proves the input is too long;
    // its starting code unit offset is exactly where the first len graphemes end
    if (seen === len) { return input.slice(0, index); }
    seen++;
  }

  // input contained len graphemes or fewer
  return null;
};

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
"url": "https://github.com/pelias/api/issues"
3434
},
3535
"engines": {
36-
"node": ">=12.0.0"
36+
"node": ">=16.0.0"
3737
},
3838
"dependencies": {
3939
"@hapi/joi": "^16.0.1",

sanitizer/_text.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ function _sanitize( raw, clean ){
2121
if( !_.isString(text) || _.isEmpty(text) ){
2222
messages.errors.push(`invalid param 'text': text length, must be >0`);
2323
} else {
24-
if( text.length > MAX_TEXT_LENGTH ){
24+
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
25+
if( truncated ){
2526
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
26-
text = text.substring(0, MAX_TEXT_LENGTH);
27+
text = truncated;
2728
}
29+
2830
clean.text = text;
2931
}
3032

sanitizer/_text_pelias_parser.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ function _sanitize (raw, clean, req) {
4040
else {
4141

4242
// truncate text to $MAX_TEXT_LENGTH chars
43-
if (text.length > MAX_TEXT_LENGTH) {
43+
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
44+
if( truncated ){
4445
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
45-
text = text.substring(0, MAX_TEXT_LENGTH);
46+
text = truncated;
4647
}
4748

4849
// tokenize text

test/unit/sanitizer/_text.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const sanitizer = require('../../../sanitizer/_text')();
2+
const unicode = require('../../../helper/unicode');
23

34
module.exports.tests = {};
45

@@ -165,6 +166,28 @@ it again and again until we reach our destination.` };
165166
t.deepEquals(messages.warnings, []);
166167
t.end();
167168
});
169+
170+
test('truncate should be unicode aware', (t) => {
171+
const grapheme = '\uD842\uDFB7';
172+
const raw = { text: grapheme.repeat(200) };
173+
const clean = {};
174+
const messages = sanitizer.sanitize(raw, clean);
175+
176+
// sanity: fixture genuinely distinguishes code units from graphemes
177+
t.equals(grapheme.length, 2, 'fixture is a surrogate pair (2 code units)');
178+
t.equals([...grapheme].length, 1, 'fixture is one code point');
179+
t.equals(grapheme.normalize('NFC'), grapheme, 'fixture is NFC-stable');
180+
181+
// truncated text is 140 graphemes (user-perceived characters)
182+
t.equals(clean.text, grapheme.repeat(140), 'truncated correctly');
183+
184+
// text.length on the truncated result is 280 (140 × 2 code units)
185+
t.equals(clean.text.length, 280, 'truncated string is 280 UTF-16 code units');
186+
187+
t.deepEquals(messages.errors, [], 'no errors');
188+
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
189+
t.end();
190+
});
168191
};
169192

170193
module.exports.all = (tape, common) => {

0 commit comments

Comments
 (0)