Skip to content

Commit 1f05f2f

Browse files
committed
[RFC] Add grapheme_limit_codepoints function
Unicode places no limit on the number of code points in a grapheme cluster. However, a single grapheme cluster made up of a very large number of code points may cause crashes. So I would like to set a limit on the number of code points per grapheme cluster.
1 parent f46bc8e commit 1f05f2f

5 files changed

Lines changed: 101 additions & 1 deletion

File tree

ext/intl/grapheme/grapheme.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,6 @@ void grapheme_close_global_iterator( void );
3131
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
3232
#define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
3333
#define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
34+
/* Default value of the limit argument of grapheme_limit_codepoints(). */
#define GRAPHEME_LIMIT_CODEPOINTS 32
3435

3536
#endif // GRAPHEME_GRAPHEME_H

ext/intl/grapheme/grapheme_string.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,4 +1135,68 @@ U_CFUNC PHP_FUNCTION(grapheme_levenshtein)
11351135
efree(ustring1);
11361136
}
11371137

1138+
/**
 * bool grapheme_limit_codepoints(string $string, int $limit = GRAPHEME_LIMIT_CODEPOINTS)
 *
 * Walks $string one grapheme cluster at a time and counts the code points
 * inside each cluster.
 *
 * Returns true when every cluster consists of at most $limit code points
 * (an empty string has no clusters and therefore yields true). Returns false
 * as soon as a cluster with more than $limit code points is found; a negative
 * $limit consequently rejects any non-empty string. Also returns false, with
 * the intl error code and message set, when the break iterator cannot be
 * obtained or the UTF-8 text cannot be attached to it.
 */
U_CFUNC PHP_FUNCTION(grapheme_limit_codepoints)
{
	char *string;
	size_t string_len = 0;
	zend_long limit_codepoint = GRAPHEME_LIMIT_CODEPOINTS;
	UErrorCode status = U_ZERO_ERROR;
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];

	ZEND_PARSE_PARAMETERS_START(1, 2)
		Z_PARAM_STRING(string, string_len)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(limit_codepoint)
	ZEND_PARSE_PARAMETERS_END();

	UText ut = UTEXT_INITIALIZER;
	UBreakIterator *bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );

	if( U_FAILURE(status) ) {
		intl_error_set_code( nullptr, status );

		/* Set error messages. */
		intl_error_set_custom_msg( nullptr, "Error in grapheme_get_break_iterator" );
		RETURN_FALSE;
	}

	/* ICU calls are no-ops once status holds a failure, so one check covers both. */
	utext_openUTF8(&ut, string, string_len, &status);
	ubrk_setUText(bi, &ut, &status);

	if ( U_FAILURE( status ) ) {
		/* Set global error code. */
		intl_error_set_code( nullptr, status );

		/* Set error messages. */
		intl_error_set_custom_msg( nullptr, "Error opening UTF-8 text");

		/* Release the iterator and the text on this path too. */
		ubrk_close(bi);
		utext_close(&ut);
		RETURN_FALSE;
	}

	zend_bool ret = true;
	int32_t cluster_start = 0;
	int32_t cluster_end;

	/* With a UTF-8 UText the boundaries reported by ubrk_next() are byte offsets into string. */
	while (ret && (cluster_end = ubrk_next(bi)) != UBRK_DONE) {
		/* Signed counter: keeps the comparison with limit_codepoint free of unsigned conversion. */
		zend_long codepoints = 0;
		int32_t offset = cluster_start;

		/* U8_FWD_1 always advances by at least one byte, so this loop terminates even on malformed input. */
		while (offset < cluster_end) {
			U8_FWD_1(string, offset, cluster_end);
			if (++codepoints > limit_codepoint) {
				ret = false;
				break;
			}
		}

		cluster_start = cluster_end;
	}

	ubrk_close(bi);
	utext_close(&ut);

	RETURN_BOOL(ret);
}
11381202
/* }}} */

ext/intl/php_intl.stub.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,11 @@
166166
* @cvalue UIDNA_ERROR_CONTEXTJ
167167
*/
168168
const IDNA_ERROR_CONTEXTJ = UNKNOWN;
169+
/**
170+
* @var int
171+
* @cvalue GRAPHEME_LIMIT_CODEPOINTS
172+
*/
173+
const GRAPHEME_LIMIT_CODEPOINTS = UNKNOWN;
169174

170175
class IntlException extends Exception
171176
{
@@ -445,6 +450,8 @@ function grapheme_str_split(string $string, int $length = 1): array|false {}
445450

446451
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, string $locale = ""): int|false {}
447452

453+
// Intended to report whether every grapheme cluster of $string stays within $limit code points.
function grapheme_limit_codepoints(string $string, int $limit = GRAPHEME_LIMIT_CODEPOINTS): bool {}
454+
448455
/** @param int $next */
449456
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
450457

ext/intl/php_intl_arginfo.h

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
--TEST--
2+
grapheme_limit_codepoints() function test
3+
--EXTENSIONS--
4+
intl
5+
--FILE--
6+
<?php
7+
$f = "あい👨‍👨‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦‍👦うえお";
8+
var_dump(grapheme_limit_codepoints($f));
9+
$f = "あいうえお👨‍👨‍👦";
10+
var_dump(grapheme_limit_codepoints($f));
11+
$f = "あいうえおH̵̛͕̞̦̰̜͍̰̥̟͆̏͂̌͑ͅä̷͔̟͓̬̯̟͍̭͉͈̮͙̣̯̬͚̞̭̍̀̾͠m̴̡̧̛̝̯̹̗̹̤̲̺̟̥̈̏͊̔̑̍͆̌̀̚͝͝b̴̢̢̫̝̠̗̼̬̻̮̺̭͔̘͑̆̎̚ư̵̧̡̥̙̭̿̈̀̒̐̊͒͑r̷̡̡̲̼̖͎̫̮̜͇̬͌͘g̷̹͍͎̬͕͓͕̐̃̈́̓̆̚͝ẻ̵̡̼̬̥̹͇̭͔̯̉͛̈́̕r̸̮̖̻̮̣̗͚͖̝̂͌̾̓̀̿̔̀͋̈́͌̈́̋͜👨‍👨‍👦";
12+
var_dump(grapheme_limit_codepoints($f));
13+
$f = "ཧྐྵྨླྺྼྻྂ";
14+
var_dump(grapheme_limit_codepoints($f));
15+
?>
16+
--EXPECT--
17+
bool(false)
18+
bool(true)
19+
bool(true)
20+
bool(true)

0 commit comments

Comments
 (0)