Skip to content

Commit 8be28fa

Browse files
committed
feat(unicode): ensure 140 char text limit is unicode aware
1 parent cb66071 commit 8be28fa

4 files changed

Lines changed: 49 additions & 9 deletions

File tree

helper/unicode.js

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
const _ = require('lodash');
22
const regenerate = require('regenerate');
33

4+
// shared grapheme segmenter (UAX-29); toGraphemes(str) returns an array of the grapheme segments of str, in order
5+
// note: locale 'en' only affects word/sentence granularity; grapheme rules are script-agnostic.
6+
const graphemeSegmenter = new Intl.Segmenter('en', { granularity: 'grapheme' });
7+
const toGraphemes = (str) => Array.from(graphemeSegmenter.segment(str), ({ segment }) => segment);
8+
49
// non-printable control characters
510
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
611
const CONTROL_CODES = regenerate()
@@ -96,3 +101,23 @@ function normalize(str) {
96101
}
97102

98103
module.exports.normalize = normalize;
104+
105+
// unicode aware truncation function: returns input cut to its first 'len' graphemes, or null when no truncation was needed
106+
// counts user-perceived characters (UAX-29 grapheme clusters) via Intl.Segmenter.
107+
module.exports.truncate = (input, len) => {
108+
109+
// sanity checking: non-string input throws Error('invalid string')
110+
if (!_.isString(input)) { throw new Error('invalid string'); }
111+
112+
// short-circuit common case of short strings
113+
// if input.length <= len, grapheme count must also be <= len (every grapheme is at least one UTF-16 code unit)
114+
if (input.length > len) {
115+
// truncate to len graphemes (only when the grapheme count actually exceeds len)
116+
const graphemes = toGraphemes(input);
117+
if (graphemes.length > len) {
118+
return graphemes.slice(0, len).join('');
119+
}
120+
}
121+
122+
return null; // input already fits within len graphemes; nothing was truncated
123+
};

sanitizer/_text.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,13 @@ function _sanitize( raw, clean ){
2121
if( !_.isString(text) || _.isEmpty(text) ){
2222
messages.errors.push(`invalid param 'text': text length, must be >0`);
2323
} else {
24-
if( text.length > MAX_TEXT_LENGTH ){
24+
clean.text = text;
25+
26+
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
27+
if( truncated ){
2528
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
26-
text = text.substring(0, MAX_TEXT_LENGTH);
29+
clean.text = truncated;
2730
}
28-
clean.text = text;
2931
}
3032

3133
return messages;

sanitizer/_text_pelias_parser.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ function _sanitize (raw, clean, req) {
4040
else {
4141

4242
// truncate text to $MAX_TEXT_LENGTH chars
43-
if (text.length > MAX_TEXT_LENGTH) {
43+
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
44+
if( truncated ){
4445
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
45-
text = text.substring(0, MAX_TEXT_LENGTH);
46+
text = truncated;
4647
}
4748

4849
// tokenize text

test/unit/sanitizer/_text.js

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const sanitizer = require('../../../sanitizer/_text')();
2+
const unicode = require('../../../helper/unicode');
23

34
module.exports.tests = {};
45

@@ -155,14 +156,25 @@ it again and again until we reach our destination.` };
155156
t.end();
156157
});
157158

158-
test('strips emoji', (t) => {
159-
const raw = { text: 'abc' + '👩‍❤️‍👩'.repeat(200) };
159+
test('truncate should be unicode aware', (t) => {
160+
const grapheme = '\uD842\uDFB7';
161+
const raw = { text: grapheme.repeat(200) };
160162
const clean = {};
161163
const messages = sanitizer.sanitize(raw, clean);
162164

163-
t.equals(clean.text, 'abc');
165+
// sanity: fixture genuinely distinguishes code units from graphemes
166+
t.equals(grapheme.length, 2, 'fixture is a surrogate pair (2 code units)');
167+
t.equals([...grapheme].length, 1, 'fixture is one code point');
168+
t.equals(grapheme.normalize('NFC'), grapheme, 'fixture is NFC-stable');
169+
170+
// t.equals(unicode.length(clean.text), 140, 'length counts characters, not code units');
171+
t.equals(clean.text, grapheme.repeat(140), 'truncated correctly');
172+
173+
// text.length on the truncated result is 280 (140 × 2 code units), not 140
174+
t.equals(clean.text.length, 280, 'truncated string is 280 UTF-16 code units');
175+
164176
t.deepEquals(messages.errors, [], 'no errors');
165-
t.deepEquals(messages.warnings, []);
177+
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
166178
t.end();
167179
});
168180
};

0 commit comments

Comments
 (0)