Skip to content

Commit 6c21aa8

Browse files
committed
feat(unicode): ensure 140 char text limit is unicode aware
1 parent cb66071 commit 6c21aa8

4 files changed

Lines changed: 55 additions & 5 deletions

File tree

helper/unicode.js

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
const _ = require('lodash');
22
const regenerate = require('regenerate');
33

4+
// shared grapheme segmenter (UAX-29)
5+
// note: locale 'en' only affects word/sentence granularity; grapheme rules are script-agnostic.
6+
const graphemeSegmenter = new Intl.Segmenter('en', { granularity: 'grapheme' });
7+
// split a string into an array of its grapheme clusters, in order,
// using the shared grapheme segmenter.
const toGraphemes = (str) => {
  const clusters = [];
  for (const part of graphemeSegmenter.segment(str)) {
    clusters.push(part.segment);
  }
  return clusters;
};
8+
49
// non-printable control characters
510
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
611
const CONTROL_CODES = regenerate()
@@ -96,3 +101,23 @@ function normalize(str) {
96101
}
97102

98103
module.exports.normalize = normalize;
104+
105+
/**
 * Unicode-aware truncation.
 *
 * Counts user-perceived characters (UAX-29 grapheme clusters) via
 * Intl.Segmenter instead of UTF-16 code units, so the cut always falls
 * on a grapheme boundary.
 *
 * @param {string} input - text to truncate
 * @param {number} len - maximum number of graphemes to keep; fractional
 *   values are rounded toward zero and negative values are treated as 0
 * @returns {string|null} the first `len` graphemes of `input` when it
 *   contains more than `len` graphemes, otherwise null (no truncation needed)
 * @throws {Error} 'invalid string' when `input` is not a string
 */
module.exports.truncate = (input, len) => {

  // sanity checking
  if (!_.isString(input)) { throw new Error('invalid string'); }

  // clamp the limit to a non-negative integer: a negative value must not
  // reach String#slice, where it would be interpreted as an offset from the end.
  const limit = Math.max(0, Math.trunc(len));

  // short-circuit common case of short strings:
  // every grapheme is at least one code unit, so if input.length <= limit
  // the grapheme count must also be <= limit.
  // a non-numeric limit (NaN) never triggers truncation.
  if (Number.isNaN(limit) || input.length <= limit) { return null; }

  // walk the graphemes lazily and stop at the first one beyond the limit,
  // rather than segmenting the whole (potentially very long) input.
  // 'index' is the code unit offset at which that grapheme starts.
  let seen = 0;
  for (const { index } of graphemeSegmenter.segment(input)) {
    if (seen === limit) { return input.slice(0, index); }
    seen++;
  }

  // input has no more than 'limit' graphemes
  return null;
};

sanitizer/_text.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ function _sanitize( raw, clean ){
2121
if( !_.isString(text) || _.isEmpty(text) ){
2222
messages.errors.push(`invalid param 'text': text length, must be >0`);
2323
} else {
24-
if( text.length > MAX_TEXT_LENGTH ){
24+
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
25+
if( truncated ){
2526
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
26-
text = text.substring(0, MAX_TEXT_LENGTH);
27+
text = truncated;
2728
}
29+
2830
clean.text = text;
2931
}
3032

sanitizer/_text_pelias_parser.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ function _sanitize (raw, clean, req) {
4040
else {
4141

4242
// truncate text to $MAX_TEXT_LENGTH chars
43-
if (text.length > MAX_TEXT_LENGTH) {
43+
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
44+
if( truncated ){
4445
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
45-
text = text.substring(0, MAX_TEXT_LENGTH);
46+
text = truncated;
4647
}
4748

4849
// tokenize text

test/unit/sanitizer/_text.js

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const sanitizer = require('../../../sanitizer/_text')();
2+
const unicode = require('../../../helper/unicode');
23

34
module.exports.tests = {};
45

@@ -159,12 +160,33 @@ it again and again until we reach our destination.` };
159160
const raw = { text: 'abc' + '👩‍❤️‍👩'.repeat(200) };
160161
const clean = {};
161162
const messages = sanitizer.sanitize(raw, clean);
162-
163163
t.equals(clean.text, 'abc');
164164
t.deepEquals(messages.errors, [], 'no errors');
165165
t.deepEquals(messages.warnings, []);
166166
t.end();
167167
});
168+
169+
// checks that the sanitizer truncates 'text' by grapheme rather than by
// UTF-16 code unit: 200 copies of a surrogate-pair character are expected
// to be cut to 140 characters (280 code units) with a truncation warning.
test('truncate should be unicode aware', (t) => {
170+
const grapheme = '\uD842\uDFB7';
171+
const raw = { text: grapheme.repeat(200) };
172+
const clean = {};
173+
const messages = sanitizer.sanitize(raw, clean);
174+
175+
// sanity: fixture genuinely distinguishes code units from graphemes
176+
t.equals(grapheme.length, 2, 'fixture is a surrogate pair (2 code units)');
177+
t.equals([...grapheme].length, 1, 'fixture is one code point');
178+
t.equals(grapheme.normalize('NFC'), grapheme, 'fixture is NFC-stable');
179+
180+
// truncated text is 140 graphemes (user-perceived characters)
181+
t.equals(clean.text, grapheme.repeat(140), 'truncated correctly');
182+
183+
// so text.length on the truncated result is 280 (140 × 2 code units)
184+
t.equals(clean.text.length, 280, 'truncated string is 280 UTF-16 code units');
185+
186+
t.deepEquals(messages.errors, [], 'no errors');
187+
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
188+
t.end();
189+
});
168190
};
169191

170192
module.exports.all = (tape, common) => {

0 commit comments

Comments
 (0)