Skip to content

Commit fc9ec56

Browse files
committed
Optimize mb_strtoupper/mb_strtolower for UTF-8 enc and ASCII input
1 parent d058acb commit fc9ec56

File tree

3 files changed

+50
-10
lines changed

3 files changed

+50
-10
lines changed

Zend/zend_operators.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2952,6 +2952,34 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, boo
29522952
}
29532953
/* }}} */
29542954

2955+
/**
 * Check whether a byte buffer consists solely of 7-bit ASCII bytes.
 *
 * @param str    Start of the buffer to inspect. It is not dereferenced when
 *               length is 0.
 * @param length Number of bytes to inspect. Embedded NUL bytes are ordinary
 *               ASCII bytes; the scan is bounded by length only.
 * @return true when every inspected byte is below 0x80 (an empty buffer
 *         yields true), false as soon as a byte >= 0x80 is encountered.
 */
ZEND_API bool ZEND_FASTCALL zend_str_is_utf8_pure_ascii(const char *str, size_t length) /* {{{ */
{
	const unsigned char *p = (const unsigned char *) str;
	const unsigned char *end = p + length;

#ifdef HAVE_BLOCKCONV
	__m128i blconv_80 = _mm_set1_epi8(0x80);

	/* Compare the number of remaining bytes instead of forming
	 * p + BLOCKCONV_STRIDE, which may point beyond one-past-the-end of the
	 * buffer when fewer than a full block is left. */
	while ((size_t) (end - p) >= BLOCKCONV_STRIDE) {
		__m128i blconv_operand = _mm_loadu_si128((const __m128i *) p);
		/* Unsigned max(x, 0x80) equals x only in lanes where x >= 0x80, so the
		 * equality compare flags exactly the non-ASCII bytes of the block.
		 * NOTE(review): the name blconv_mingle is kept because BLOCKCONV_FOUND()
		 * takes no argument and presumably reads this variable - confirm against
		 * the macro definition. */
		__m128i blconv_mingle = _mm_cmpeq_epi8(_mm_max_epu8(blconv_operand, blconv_80), blconv_operand);
		if (BLOCKCONV_FOUND()) {
			return false;
		}
		p += BLOCKCONV_STRIDE;
	}
#endif

	/* Byte-wise scan of whatever the block loop left over (or of the whole
	 * buffer when the block path is compiled out). */
	while (p < end) {
		if (*p >= 0x80) {
			return false;
		}
		p++;
	}

	return true;
}
2981+
/* }}} */
2982+
29552983
ZEND_API int ZEND_FASTCALL zend_binary_strcmp(const char *s1, size_t len1, const char *s2, size_t len2) /* {{{ */
29562984
{
29572985
int retval;

Zend/zend_operators.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup_ex(const char *source,
449449
ZEND_API char* ZEND_FASTCALL zend_str_toupper_dup_ex(const char *source, size_t length);
450450
ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, bool persistent);
451451
ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, bool persistent);
452+
ZEND_API bool ZEND_FASTCALL zend_str_is_utf8_pure_ascii(const char *str, size_t length);
452453

453454
#define zend_string_tolower(str) zend_string_tolower_ex(str, 0)
454455
#define zend_string_toupper(str) zend_string_toupper_ex(str, 0)

ext/mbstring/mbstring.c

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2588,22 +2588,29 @@ PHP_FUNCTION(mb_convert_case)
25882588
/* {{{ Returns a upper cased version of source_string */
25892589
PHP_FUNCTION(mb_strtoupper)
25902590
{
2591+
zend_string *str;
25912592
zend_string *from_encoding = NULL;
2592-
char *str;
2593-
size_t str_len, ret_len;
2593+
const mbfl_encoding *enc;
2594+
char *newstr;
2595+
size_t ret_len;
25942596

25952597
ZEND_PARSE_PARAMETERS_START(1, 2)
2596-
Z_PARAM_STRING(str, str_len)
2598+
Z_PARAM_STR(str)
25972599
Z_PARAM_OPTIONAL
25982600
Z_PARAM_STR_OR_NULL(from_encoding)
25992601
ZEND_PARSE_PARAMETERS_END();
26002602

2601-
const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2603+
enc = php_mb_get_encoding(from_encoding, 2);
26022604
if (!enc) {
26032605
RETURN_THROWS();
26042606
}
26052607

2606-
char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
2608+
// Fast path: when the encoding is UTF-8 and the input contains only 7-bit ASCII bytes, skip mbstring_convert_case() and use zend_string_toupper() directly
2609+
if (enc == &mbfl_encoding_utf8 && zend_str_is_utf8_pure_ascii(ZSTR_VAL(str), ZSTR_LEN(str))) {
2610+
RETURN_STR(zend_string_toupper(str));
2611+
}
2612+
2613+
newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), &ret_len, enc);
26072614
/* If newstr is NULL something went wrong in mbfl and this is a bug */
26082615
ZEND_ASSERT(newstr != NULL);
26092616

@@ -2616,15 +2623,14 @@ PHP_FUNCTION(mb_strtoupper)
26162623
/* {{{ Returns a lower cased version of source_string */
26172624
PHP_FUNCTION(mb_strtolower)
26182625
{
2626+
zend_string *str;
26192627
zend_string *from_encoding = NULL;
2620-
char *str;
2621-
size_t str_len;
2628+
const mbfl_encoding *enc;
26222629
char *newstr;
26232630
size_t ret_len;
2624-
const mbfl_encoding *enc;
26252631

26262632
ZEND_PARSE_PARAMETERS_START(1, 2)
2627-
Z_PARAM_STRING(str, str_len)
2633+
Z_PARAM_STR(str)
26282634
Z_PARAM_OPTIONAL
26292635
Z_PARAM_STR_OR_NULL(from_encoding)
26302636
ZEND_PARSE_PARAMETERS_END();
@@ -2634,7 +2640,12 @@ PHP_FUNCTION(mb_strtolower)
26342640
RETURN_THROWS();
26352641
}
26362642

2637-
newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
2643+
// Fast path: when the encoding is UTF-8 and the input contains only 7-bit ASCII bytes, skip mbstring_convert_case() and use zend_string_tolower() directly
2644+
if (enc == &mbfl_encoding_utf8 && zend_str_is_utf8_pure_ascii(ZSTR_VAL(str), ZSTR_LEN(str))) {
2645+
RETURN_STR(zend_string_tolower(str));
2646+
}
2647+
2648+
newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), &ret_len, enc);
26382649
/* If newstr is NULL something went wrong in mbfl and this is a bug */
26392650
ZEND_ASSERT(newstr != NULL);
26402651

0 commit comments

Comments
 (0)