Skip to content

Commit e3cf0d5

Browse files
authored
[k2] implement f$prepare_search_query (#1461)
1 parent 27bf56f commit e3cf0d5

45 files changed

Lines changed: 463 additions & 281 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

builtin-functions/kphp-light/stdlib/server-functions.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false;
6969

7070
function memory_get_detailed_stats() ::: int[];
7171

72+
function prepare_search_query ($query ::: string) ::: string;
73+
7274
function memory_get_total_usage() ::: int;
7375

7476
function inet_pton ($address ::: string) ::: string | false;
@@ -131,7 +133,3 @@ function flush() ::: void;
131133
define('PHP_QUERY_RFC1738', 1);
132134
define('PHP_QUERY_RFC3986', 2);
133135

134-
135-
/** @kphp-extern-func-info stub generation-required */
136-
function prepare_search_query ($query ::: string) ::: string;
137-

common/unicode/unicode-utils.cpp

Lines changed: 105 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
#include "common/unicode/unicode-utils.h"
66

7+
#include <algorithm>
78
#include <assert.h>
9+
#include <cstddef>
10+
#include <iterator>
811
#include <stdlib.h>
912
#include <string.h>
1013

@@ -13,7 +16,7 @@
1316
#include "common/unicode/utf8-utils.h"
1417

1518
/* Search generated ranges for specified character */
16-
static int binary_search_ranges(const int* ranges, int r, int code) {
19+
static int binary_search_ranges(const int* ranges, int r, int code, void (*assertf)(bool)) {
1720
if ((unsigned int)code > 0x10ffff) {
1821
return 0;
1922
}
@@ -43,152 +46,158 @@ static int binary_search_ranges(const int* ranges, int r, int code) {
4346
case 2:
4447
return ((code - 1) | 1);
4548
default:
46-
assert(0);
47-
exit(1);
49+
if (assertf != nullptr) {
50+
assertf(false);
51+
}
4852
}
53+
return 0;
4954
}
5055

5156
/* Convert character to upper case */
52-
int unicode_toupper(int code) {
57+
int unicode_toupper(int code, void (*assertf)(bool)) {
5358
if ((unsigned int)code < (unsigned int)TABLE_SIZE) {
5459
return to_upper_table[code];
5560
} else {
56-
return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code);
61+
return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, assertf);
5762
}
5863
}
5964

6065
/* Convert character to lower case */
61-
int unicode_tolower(int code) {
66+
int unicode_tolower(int code, void (*assertf)(bool)) {
6267
if ((unsigned int)code < (unsigned int)TABLE_SIZE) {
6368
return to_lower_table[code];
6469
} else {
65-
return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code);
70+
return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, assertf);
6671
}
6772
}
6873

74+
inline constexpr int32_t WHITESPACE_CODE_POINT{static_cast<int32_t>(' ')};
75+
inline constexpr int32_t PLUS_CODE_POINT{static_cast<int32_t>('+')};
76+
6977
/* Prepares unicode 0-terminated string input for search,
7078
leaving only digits and letters with diacritics.
7179
Length of string can decrease.
7280
Returns the length of the result, not counting the terminating 0. */
73-
int prepare_search_string(int* input) {
74-
int i;
75-
int* output = input;
76-
for (i = 0; input[i]; i++) {
77-
int c = input[i], new_c;
78-
if ((unsigned int)c < (unsigned int)TABLE_SIZE) {
79-
new_c = prepare_table[c];
81+
size_t prepare_search_string(int32_t* code_points, void (*assertf)(bool)) noexcept {
82+
size_t output_size{};
83+
for (size_t i{}; code_points[i] != 0; ++i) {
84+
int32_t c{code_points[i]};
85+
int32_t new_c{};
86+
if (static_cast<size_t>(c) < static_cast<size_t>(TABLE_SIZE)) {
87+
new_c = static_cast<int32_t>(prepare_table[c]);
8088
} else {
81-
new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c);
89+
new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c, assertf);
8290
}
83-
if (new_c) {
84-
if (new_c != 0x20 || (output > input && output[-1] != 0x20)) {
85-
*output++ = new_c;
91+
if (new_c != 0) {
92+
// forbid two consecutive whitespaces and a leading whitespace
93+
if (new_c != WHITESPACE_CODE_POINT || (output_size > 0 && code_points[output_size - 1] != WHITESPACE_CODE_POINT)) {
94+
code_points[output_size++] = new_c;
8695
}
8796
}
8897
}
89-
if (output > input && output[-1] == 0x20) {
90-
output--;
98+
if (output_size > 0 && code_points[output_size - 1] == WHITESPACE_CODE_POINT) {
99+
// drop the trailing whitespace
100+
--output_size;
91101
}
92-
*output = 0;
93-
return output - input;
94-
}
95-
96-
#define MAX_NAME_SIZE 65536
97-
static char prep_buf[4 * MAX_NAME_SIZE + 4];
98-
int prep_ibuf[MAX_NAME_SIZE + 4];
99-
static int prep_ibuf_res[MAX_NAME_SIZE + 4];
100-
static int* words_ibuf[MAX_NAME_SIZE + 4];
101-
102-
int stricmp_void(const void* x, const void* y) {
103-
const int* s1 = *(const int**)x;
104-
const int* s2 = *(const int**)y;
105-
while (*s1 == *s2 && *s1 != ' ')
106-
s1++, s2++;
107-
return *s1 - *s2;
102+
code_points[output_size] = 0;
103+
return output_size;
108104
}
109105

110-
int* prepare_str_unicode(const int* x) {
111-
int* v = prep_ibuf;
112-
113-
int n;
114-
if (v != x) {
115-
for (n = 0; x[n]; n++) {
116-
v[n] = x[n];
106+
inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, void (*assertf)(bool)) noexcept {
107+
size_t code_points_length = prepare_search_string(code_points, assertf);
108+
code_points[code_points_length] = WHITESPACE_CODE_POINT;
109+
110+
size_t words_count{};
111+
size_t i{};
112+
// looking for the beginnings of the words
113+
while (i < code_points_length) {
114+
word_start_indices[words_count++] = i;
115+
while (i < code_points_length && code_points[i] != WHITESPACE_CODE_POINT) {
116+
++i;
117117
}
118-
v[n] = 0;
118+
++i;
119119
}
120120

121-
n = prepare_search_string(v);
122-
v[n] = ' ';
123-
124-
int i = 0, k = 0;
125-
while (i < n) {
126-
words_ibuf[k++] = v + i;
127-
while (v[i] && v[i] != ' ') {
128-
i++;
121+
auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool {
122+
while (code_points[x] != WHITESPACE_CODE_POINT && code_points[x] == code_points[y]) {
123+
++x;
124+
++y;
129125
}
130-
i++;
131-
}
126+
if (code_points[x] == WHITESPACE_CODE_POINT) {
127+
return code_points[y] != WHITESPACE_CODE_POINT;
128+
}
129+
if (code_points[y] == WHITESPACE_CODE_POINT) {
130+
return false;
131+
}
132+
return code_points[x] < code_points[y];
133+
}};
132134

133-
qsort(words_ibuf, (size_t)k, sizeof(int*), stricmp_void);
135+
std::sort(word_start_indices, std::next(word_start_indices, words_count), word_less_cmp);
134136

135-
int j = 0;
136-
for (i = 0; i < k; i++) {
137-
if (j == 0 || stricmp_void(&words_ibuf[j - 1], &words_ibuf[i])) {
138-
words_ibuf[j++] = words_ibuf[i];
137+
size_t uniq_words_count{};
138+
for (i = 0; i < words_count; ++i) {
139+
// drop duplicates
140+
if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) {
141+
word_start_indices[uniq_words_count++] = word_start_indices[i];
139142
} else {
140-
words_ibuf[j - 1] = words_ibuf[i];
143+
word_start_indices[uniq_words_count - 1] = word_start_indices[i];
141144
}
142145
}
143-
k = j;
144146

145-
int* res = prep_ibuf_res;
146-
for (i = 0; i < k; i++) {
147-
int* tmp = words_ibuf[i];
148-
while (*tmp != ' ') {
149-
*res++ = *tmp++;
147+
size_t result_size{};
148+
// output words with '+' separator
149+
for (i = 0; i < uniq_words_count; ++i) {
150+
size_t ind{word_start_indices[i]};
151+
while (code_points[ind] != WHITESPACE_CODE_POINT) {
152+
prepared_code_points[result_size++] = code_points[ind++];
150153
}
151-
*res++ = '+';
154+
prepared_code_points[result_size++] = PLUS_CODE_POINT;
152155
}
153-
*res++ = 0;
156+
prepared_code_points[result_size++] = 0;
154157

155-
assert(res - prep_ibuf_res < MAX_NAME_SIZE);
156-
return prep_ibuf_res;
158+
assertf(result_size < MAX_NAME_SIZE);
159+
return result_size;
157160
}
158161

159-
const char* clean_str_unicode(const int* xx) {
160-
assert(xx != NULL);
161-
162-
int* v = prepare_str_unicode(xx);
163-
int l = put_string_utf8(v, prep_buf);
164-
assert(l < sizeof(prep_buf));
165-
166-
char *s = prep_buf, *x = prep_buf;
167-
int skip;
168-
169-
while (*x != 0) {
170-
skip = !strncmp(x, "amp+", 4) || !strncmp(x, "gt+", 3) || !strncmp(x, "lt+", 3) || !strncmp(x, "quot+", 5) || !strncmp(x, "ft+", 3) ||
171-
!strncmp(x, "feat+", 5) ||
172-
(((x[0] == '1' && x[1] == '9') || (x[0] == '2' && x[1] == '0')) && ('0' <= x[2] && x[2] <= '9') && ('0' <= x[3] && x[3] <= '9') && x[4] == '+') ||
173-
!strncmp(x, "092+", 4) || !strncmp(x, "33+", 3) || !strncmp(x, "34+", 3) || !strncmp(x, "36+", 3) || !strncmp(x, "39+", 3) ||
174-
!strncmp(x, "60+", 3) || !strncmp(x, "62+", 3) || !strncmp(x, "8232+", 5) || !strncmp(x, "8233+", 5);
162+
inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result,
163+
void (*assertf)(bool)) noexcept {
164+
prepare_str_unicode(code_points, word_start_indices, prepared_code_points, assertf);
165+
166+
auto length{static_cast<size_t>(put_string_utf8(prepared_code_points, reinterpret_cast<char*>(utf8_result)))};
167+
assertf(length < MAX_NAME_BYTES_SIZE);
168+
169+
size_t i{};
170+
size_t result_size{};
171+
while (i < length) {
172+
char* c{reinterpret_cast<char*>(std::addressof(utf8_result[i]))};
173+
bool skip{!strncmp(c, "amp+", 4) || !strncmp(c, "gt+", 3) || !strncmp(c, "lt+", 3) || !strncmp(c, "quot+", 5) || !strncmp(c, "ft+", 3) ||
174+
!strncmp(c, "feat+", 5) ||
175+
(((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') ||
176+
!strncmp(c, "092+", 4) || !strncmp(c, "33+", 3) || !strncmp(c, "34+", 3) || !strncmp(c, "36+", 3) || !strncmp(c, "39+", 3) ||
177+
!strncmp(c, "60+", 3) || !strncmp(c, "62+", 3) || !strncmp(c, "8232+", 5) || !strncmp(c, "8233+", 5)};
175178
do {
176-
*s = *x;
177179
if (!skip) {
178-
s++;
180+
utf8_result[result_size] = utf8_result[i];
181+
++result_size;
179182
}
180-
} while (*x++ != '+');
183+
} while (utf8_result[i++] != static_cast<std::byte>('+'));
181184
}
182-
*s = 0;
185+
utf8_result[result_size] = static_cast<std::byte>(0);
183186

184-
return prep_buf;
187+
return result_size;
185188
}
186189

187-
const char* clean_str(const char* x) {
188-
if (x == NULL || strlen(x) >= MAX_NAME_SIZE) {
189-
return x;
190+
size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result,
191+
void (*assertf)(bool)) {
192+
size_t x_len{strlen(x)};
193+
if (assertf == nullptr || x == NULL || x_len >= MAX_NAME_SIZE) {
194+
for (size_t i = 0; i < x_len; ++i) {
195+
utf8_result[i] = static_cast<std::byte>(x[i]);
196+
}
197+
utf8_result[x_len] = static_cast<std::byte>(0);
198+
return x_len;
190199
}
191200

192-
html_string_to_utf8(x, prep_ibuf);
193-
return clean_str_unicode(prep_ibuf);
201+
html_string_to_utf8(x, code_points);
202+
return clean_str_unicode(code_points, word_start_indices, prepared_code_points, utf8_result, assertf);
194203
}

common/unicode/unicode-utils.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44

55
#pragma once
66

7-
int unicode_toupper(int code);
8-
int unicode_tolower(int code);
9-
const char* clean_str(const char* x);
7+
#include <cstddef>
8+
#include <cstdint>
9+
10+
inline constexpr size_t MAX_NAME_SIZE = 65536;
11+
inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4;
12+
inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4;
13+
14+
int unicode_toupper(int code, void (*assertf)(bool));
15+
int unicode_tolower(int code, void (*assertf)(bool));
16+
size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, void (*assertf)(bool));

0 commit comments

Comments
 (0)