Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions helper/unicode.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
const _ = require('lodash');
const regenerate = require('regenerate');

// module-wide grapheme cluster segmenter (UAX-29), constructed once and reused.
// note: the 'en' locale matters for word/sentence granularity only; grapheme rules are script-agnostic.
const graphemeSegmenter = new Intl.Segmenter('en', { granularity: 'grapheme' });

// split a string into an array of its grapheme clusters, in order of appearance.
const toGraphemes = (str) => {
  const clusters = [];
  for (const part of graphemeSegmenter.segment(str)) {
    clusters.push(part.segment);
  }
  return clusters;
};

// non-printable control characters
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
const CONTROL_CODES = regenerate()
Expand Down Expand Up @@ -96,3 +101,23 @@ function normalize(str) {
}

module.exports.normalize = normalize;

/**
 * Unicode-aware truncation.
 *
 * Counts user-perceived characters (UAX-29 grapheme clusters) via
 * Intl.Segmenter instead of UTF-16 code units, so surrogate pairs and
 * combining sequences are never cut in half.
 *
 * @param {string} input - text to truncate
 * @param {number} len - maximum number of grapheme clusters to keep
 * @returns {string|null} the leading `len` grapheme clusters of `input` when
 *   it contains more than `len` of them, or `null` when no truncation is
 *   required. Note: a `len` of 0 or less yields '' for non-empty input, which
 *   is falsy; callers testing the result for truthiness cannot tell that
 *   case apart from "not truncated".
 * @throws {Error} 'invalid string' when `input` is not a string
 */
module.exports.truncate = (input, len) => {

  // sanity checking
  if (!_.isString(input)) { throw new Error('invalid string'); }

  // short-circuit common case of short strings:
  // every grapheme cluster spans at least one UTF-16 code unit, so
  // input.length <= len implies the grapheme count is also <= len.
  // (kept as a negated '>' so a NaN/undefined len still means "no truncation")
  if (!(input.length > len)) { return null; }

  // walk the clusters lazily and stop at the first one past the limit,
  // rather than segmenting and buffering the entire input up front;
  // the input is arbitrary request text and may be far longer than len.
  let kept = 0;
  for (const { index } of graphemeSegmenter.segment(input)) {
    // `index` is the code unit offset where this cluster starts,
    // ie. the end of the `kept` clusters which precede it.
    if (kept + 1 > len) { return input.slice(0, index); }
    kept++;
  }

  // input holds len graphemes or fewer: nothing to truncate
  return null;
};
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"url": "https://github.com/pelias/api/issues"
},
"engines": {
"node": ">=12.0.0"
"node": ">=16.0.0"
},
"dependencies": {
"@hapi/joi": "^16.0.1",
Expand Down
6 changes: 4 additions & 2 deletions sanitizer/_text.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ function _sanitize( raw, clean ){
if( !_.isString(text) || _.isEmpty(text) ){
messages.errors.push(`invalid param 'text': text length, must be >0`);
} else {
if( text.length > MAX_TEXT_LENGTH ){
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
if( truncated ){
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
text = text.substring(0, MAX_TEXT_LENGTH);
text = truncated;
}

clean.text = text;
}

Expand Down
5 changes: 3 additions & 2 deletions sanitizer/_text_pelias_parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ function _sanitize (raw, clean, req) {
else {

// truncate text to $MAX_TEXT_LENGTH chars
if (text.length > MAX_TEXT_LENGTH) {
const truncated = unicode.truncate(text, MAX_TEXT_LENGTH);
if( truncated ){
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
text = text.substring(0, MAX_TEXT_LENGTH);
text = truncated;
}

// tokenize text
Expand Down
23 changes: 23 additions & 0 deletions test/unit/sanitizer/_text.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const sanitizer = require('../../../sanitizer/_text')();
const unicode = require('../../../helper/unicode');

module.exports.tests = {};

Expand Down Expand Up @@ -165,6 +166,28 @@ it again and again until we reach our destination.` };
t.deepEquals(messages.warnings, []);
t.end();
});

test('truncate should be unicode aware', (t) => {
  // U+20BB7: a single code point outside the BMP, stored as a surrogate pair
  const glyph = '\uD842\uDFB7';
  const output = {};
  const report = sanitizer.sanitize({ text: glyph.repeat(200) }, output);

  // guard: the fixture must tell UTF-16 code units apart from graphemes
  t.equals(glyph.length, 2, 'fixture is a surrogate pair (2 code units)');
  t.equals(Array.from(glyph).length, 1, 'fixture is one code point');
  t.equals(glyph.normalize('NFC'), glyph, 'fixture is NFC-stable');

  // 200 graphemes in, 140 graphemes (user-perceived characters) out
  const expected = glyph.repeat(140);
  t.equals(output.text, expected, 'truncated correctly');

  // each grapheme takes two code units, so .length reports 280 rather than 140
  t.equals(output.text.length, 280, 'truncated string is 280 UTF-16 code units');

  t.deepEquals(report.errors, [], 'no errors');
  t.deepEquals(report.warnings, [`param 'text' truncated to 140 characters`]);
  t.end();
});
};

module.exports.all = (tape, common) => {
Expand Down