|
4 | 4 |
|
5 | 5 | #include "common/unicode/unicode-utils.h" |
6 | 6 |
|
| 7 | +#include <algorithm> |
7 | 8 | #include <assert.h> |
| 9 | +#include <cstddef> |
| 10 | +#include <iterator> |
8 | 11 | #include <stdlib.h> |
9 | 12 | #include <string.h> |
10 | 13 |
|
|
13 | 16 | #include "common/unicode/utf8-utils.h" |
14 | 17 |
|
15 | 18 | /* Search generated ranges for specified character */ |
16 | | -static int binary_search_ranges(const int* ranges, int r, int code) { |
| 19 | +static int binary_search_ranges(const int* ranges, int r, int code, void (*assertf)(bool)) { |
17 | 20 | if ((unsigned int)code > 0x10ffff) { |
18 | 21 | return 0; |
19 | 22 | } |
@@ -43,152 +46,158 @@ static int binary_search_ranges(const int* ranges, int r, int code) { |
43 | 46 | case 2: |
44 | 47 | return ((code - 1) | 1); |
45 | 48 | default: |
46 | | - assert(0); |
47 | | - exit(1); |
| 49 | + if (assertf != nullptr) { |
| 50 | + assertf(false); |
| 51 | + } |
48 | 52 | } |
| 53 | + return 0; |
49 | 54 | } |
50 | 55 |
|
51 | 56 | /* Convert character to upper case */ |
52 | | -int unicode_toupper(int code) { |
| 57 | +int unicode_toupper(int code, void (*assertf)(bool)) { |
53 | 58 | if ((unsigned int)code < (unsigned int)TABLE_SIZE) { |
54 | 59 | return to_upper_table[code]; |
55 | 60 | } else { |
56 | | - return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code); |
| 61 | + return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, assertf); |
57 | 62 | } |
58 | 63 | } |
59 | 64 |
|
60 | 65 | /* Convert character to lower case */ |
61 | | -int unicode_tolower(int code) { |
| 66 | +int unicode_tolower(int code, void (*assertf)(bool)) { |
62 | 67 | if ((unsigned int)code < (unsigned int)TABLE_SIZE) { |
63 | 68 | return to_lower_table[code]; |
64 | 69 | } else { |
65 | | - return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code); |
| 70 | + return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, assertf); |
66 | 71 | } |
67 | 72 | } |
68 | 73 |
|
| 74 | +inline constexpr int32_t WHITESPACE_CODE_POINT{static_cast<int32_t>(' ')}; |
| 75 | +inline constexpr int32_t PLUS_CODE_POINT{static_cast<int32_t>('+')}; |
| 76 | + |
69 | 77 | /* Prepares unicode 0-terminated string input for search, |
70 | 78 | leaving only digits and letters with diacritics. |
71 | 79 | Length of string can decrease. |
72 | 80 | Returns length of result. */ |
73 | | -int prepare_search_string(int* input) { |
74 | | - int i; |
75 | | - int* output = input; |
76 | | - for (i = 0; input[i]; i++) { |
77 | | - int c = input[i], new_c; |
78 | | - if ((unsigned int)c < (unsigned int)TABLE_SIZE) { |
79 | | - new_c = prepare_table[c]; |
| 81 | +size_t prepare_search_string(int32_t* code_points, void (*assertf)(bool)) noexcept { |
| 82 | + size_t output_size{}; |
| 83 | + for (size_t i{}; code_points[i] != 0; ++i) { |
| 84 | + int32_t c{code_points[i]}; |
| 85 | + int32_t new_c{}; |
| 86 | + if (static_cast<size_t>(c) < static_cast<size_t>(TABLE_SIZE)) { |
| 87 | + new_c = static_cast<int32_t>(prepare_table[c]); |
80 | 88 | } else { |
81 | | - new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c); |
| 89 | + new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c, assertf); |
82 | 90 | } |
83 | | - if (new_c) { |
84 | | - if (new_c != 0x20 || (output > input && output[-1] != 0x20)) { |
85 | | - *output++ = new_c; |
| 91 | + if (new_c != 0) { |
| 92 | + // we forbid 2 whitespaces after each other and starting whitespace |
| 93 | + if (new_c != WHITESPACE_CODE_POINT || (output_size > 0 && code_points[output_size - 1] != WHITESPACE_CODE_POINT)) { |
| 94 | + code_points[output_size++] = new_c; |
86 | 95 | } |
87 | 96 | } |
88 | 97 | } |
89 | | - if (output > input && output[-1] == 0x20) { |
90 | | - output--; |
| 98 | + if (output_size > 0 && code_points[output_size - 1] == WHITESPACE_CODE_POINT) { |
| 99 | + // throw out terminating whitespace |
| 100 | + --output_size; |
91 | 101 | } |
92 | | - *output = 0; |
93 | | - return output - input; |
94 | | -} |
95 | | - |
96 | | -#define MAX_NAME_SIZE 65536 |
97 | | -static char prep_buf[4 * MAX_NAME_SIZE + 4]; |
98 | | -int prep_ibuf[MAX_NAME_SIZE + 4]; |
99 | | -static int prep_ibuf_res[MAX_NAME_SIZE + 4]; |
100 | | -static int* words_ibuf[MAX_NAME_SIZE + 4]; |
101 | | - |
102 | | -int stricmp_void(const void* x, const void* y) { |
103 | | - const int* s1 = *(const int**)x; |
104 | | - const int* s2 = *(const int**)y; |
105 | | - while (*s1 == *s2 && *s1 != ' ') |
106 | | - s1++, s2++; |
107 | | - return *s1 - *s2; |
| 102 | + code_points[output_size] = 0; |
| 103 | + return output_size; |
108 | 104 | } |
109 | 105 |
|
110 | | -int* prepare_str_unicode(const int* x) { |
111 | | - int* v = prep_ibuf; |
112 | | - |
113 | | - int n; |
114 | | - if (v != x) { |
115 | | - for (n = 0; x[n]; n++) { |
116 | | - v[n] = x[n]; |
| 106 | +inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, void (*assertf)(bool)) noexcept { |
| 107 | + size_t code_points_length = prepare_search_string(code_points, assertf); |
| 108 | + code_points[code_points_length] = WHITESPACE_CODE_POINT; |
| 109 | + |
| 110 | + size_t words_count{}; |
| 111 | + size_t i{}; |
| 112 | + // looking for the beginnings of the words |
| 113 | + while (i < code_points_length) { |
| 114 | + word_start_indices[words_count++] = i; |
| 115 | + while (i < code_points_length && code_points[i] != WHITESPACE_CODE_POINT) { |
| 116 | + ++i; |
117 | 117 | } |
118 | | - v[n] = 0; |
| 118 | + ++i; |
119 | 119 | } |
120 | 120 |
|
121 | | - n = prepare_search_string(v); |
122 | | - v[n] = ' '; |
123 | | - |
124 | | - int i = 0, k = 0; |
125 | | - while (i < n) { |
126 | | - words_ibuf[k++] = v + i; |
127 | | - while (v[i] && v[i] != ' ') { |
128 | | - i++; |
| 121 | + auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { |
| 122 | + while (code_points[x] != WHITESPACE_CODE_POINT && code_points[x] == code_points[y]) { |
| 123 | + ++x; |
| 124 | + ++y; |
129 | 125 | } |
130 | | - i++; |
131 | | - } |
| 126 | + if (code_points[x] == WHITESPACE_CODE_POINT) { |
| 127 | + return code_points[y] != WHITESPACE_CODE_POINT; |
| 128 | + } |
| 129 | + if (code_points[y] == WHITESPACE_CODE_POINT) { |
| 130 | + return false; |
| 131 | + } |
| 132 | + return code_points[x] < code_points[y]; |
| 133 | + }}; |
132 | 134 |
|
133 | | - qsort(words_ibuf, (size_t)k, sizeof(int*), stricmp_void); |
| 135 | + std::sort(word_start_indices, std::next(word_start_indices, words_count), word_less_cmp); |
134 | 136 |
|
135 | | - int j = 0; |
136 | | - for (i = 0; i < k; i++) { |
137 | | - if (j == 0 || stricmp_void(&words_ibuf[j - 1], &words_ibuf[i])) { |
138 | | - words_ibuf[j++] = words_ibuf[i]; |
| 137 | + size_t uniq_words_count{}; |
| 138 | + for (i = 0; i < words_count; ++i) { |
| 139 | + // drop duplicates |
| 140 | + if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { |
| 141 | + word_start_indices[uniq_words_count++] = word_start_indices[i]; |
139 | 142 | } else { |
140 | | - words_ibuf[j - 1] = words_ibuf[i]; |
| 143 | + word_start_indices[uniq_words_count - 1] = word_start_indices[i]; |
141 | 144 | } |
142 | 145 | } |
143 | | - k = j; |
144 | 146 |
|
145 | | - int* res = prep_ibuf_res; |
146 | | - for (i = 0; i < k; i++) { |
147 | | - int* tmp = words_ibuf[i]; |
148 | | - while (*tmp != ' ') { |
149 | | - *res++ = *tmp++; |
| 147 | + size_t result_size{}; |
| 148 | + // output words with '+' separator |
| 149 | + for (i = 0; i < uniq_words_count; ++i) { |
| 150 | + size_t ind{word_start_indices[i]}; |
| 151 | + while (code_points[ind] != WHITESPACE_CODE_POINT) { |
| 152 | + prepared_code_points[result_size++] = code_points[ind++]; |
150 | 153 | } |
151 | | - *res++ = '+'; |
| 154 | + prepared_code_points[result_size++] = PLUS_CODE_POINT; |
152 | 155 | } |
153 | | - *res++ = 0; |
| 156 | + prepared_code_points[result_size++] = 0; |
154 | 157 |
|
155 | | - assert(res - prep_ibuf_res < MAX_NAME_SIZE); |
156 | | - return prep_ibuf_res; |
| 158 | + assertf(result_size < MAX_NAME_SIZE); |
| 159 | + return result_size; |
157 | 160 | } |
158 | 161 |
|
159 | | -const char* clean_str_unicode(const int* xx) { |
160 | | - assert(xx != NULL); |
161 | | - |
162 | | - int* v = prepare_str_unicode(xx); |
163 | | - int l = put_string_utf8(v, prep_buf); |
164 | | - assert(l < sizeof(prep_buf)); |
165 | | - |
166 | | - char *s = prep_buf, *x = prep_buf; |
167 | | - int skip; |
168 | | - |
169 | | - while (*x != 0) { |
170 | | - skip = !strncmp(x, "amp+", 4) || !strncmp(x, "gt+", 3) || !strncmp(x, "lt+", 3) || !strncmp(x, "quot+", 5) || !strncmp(x, "ft+", 3) || |
171 | | - !strncmp(x, "feat+", 5) || |
172 | | - (((x[0] == '1' && x[1] == '9') || (x[0] == '2' && x[1] == '0')) && ('0' <= x[2] && x[2] <= '9') && ('0' <= x[3] && x[3] <= '9') && x[4] == '+') || |
173 | | - !strncmp(x, "092+", 4) || !strncmp(x, "33+", 3) || !strncmp(x, "34+", 3) || !strncmp(x, "36+", 3) || !strncmp(x, "39+", 3) || |
174 | | - !strncmp(x, "60+", 3) || !strncmp(x, "62+", 3) || !strncmp(x, "8232+", 5) || !strncmp(x, "8233+", 5); |
| 162 | +inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, |
| 163 | + void (*assertf)(bool)) noexcept { |
| 164 | + prepare_str_unicode(code_points, word_start_indices, prepared_code_points, assertf); |
| 165 | + |
| 166 | + auto length{static_cast<size_t>(put_string_utf8(prepared_code_points, reinterpret_cast<char*>(utf8_result)))}; |
| 167 | + assertf(length < MAX_NAME_BYTES_SIZE); |
| 168 | + |
| 169 | + size_t i{}; |
| 170 | + size_t result_size{}; |
| 171 | + while (i < length) { |
| 172 | + char* c{reinterpret_cast<char*>(std::addressof(utf8_result[i]))}; |
| 173 | + bool skip{!strncmp(c, "amp+", 4) || !strncmp(c, "gt+", 3) || !strncmp(c, "lt+", 3) || !strncmp(c, "quot+", 5) || !strncmp(c, "ft+", 3) || |
| 174 | + !strncmp(c, "feat+", 5) || |
| 175 | + (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || |
| 176 | + !strncmp(c, "092+", 4) || !strncmp(c, "33+", 3) || !strncmp(c, "34+", 3) || !strncmp(c, "36+", 3) || !strncmp(c, "39+", 3) || |
| 177 | + !strncmp(c, "60+", 3) || !strncmp(c, "62+", 3) || !strncmp(c, "8232+", 5) || !strncmp(c, "8233+", 5)}; |
175 | 178 | do { |
176 | | - *s = *x; |
177 | 179 | if (!skip) { |
178 | | - s++; |
| 180 | + utf8_result[result_size] = utf8_result[i]; |
| 181 | + ++result_size; |
179 | 182 | } |
180 | | - } while (*x++ != '+'); |
| 183 | + } while (utf8_result[i++] != static_cast<std::byte>('+')); |
181 | 184 | } |
182 | | - *s = 0; |
| 185 | + utf8_result[result_size] = static_cast<std::byte>(0); |
183 | 186 |
|
184 | | - return prep_buf; |
| 187 | + return result_size; |
185 | 188 | } |
186 | 189 |
|
187 | | -const char* clean_str(const char* x) { |
188 | | - if (x == NULL || strlen(x) >= MAX_NAME_SIZE) { |
189 | | - return x; |
| 190 | +size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, |
| 191 | + void (*assertf)(bool)) { |
| 192 | + size_t x_len{strlen(x)}; |
| 193 | + if (assertf == nullptr || x == NULL || x_len >= MAX_NAME_SIZE) { |
| 194 | + for (size_t i = 0; i < x_len; ++i) { |
| 195 | + utf8_result[i] = static_cast<std::byte>(x[i]); |
| 196 | + } |
| 197 | + utf8_result[x_len] = static_cast<std::byte>(0); |
| 198 | + return x_len; |
190 | 199 | } |
191 | 200 |
|
192 | | - html_string_to_utf8(x, prep_ibuf); |
193 | | - return clean_str_unicode(prep_ibuf); |
| 201 | + html_string_to_utf8(x, code_points); |
| 202 | + return clean_str_unicode(code_points, word_start_indices, prepared_code_points, utf8_result, assertf); |
194 | 203 | } |
0 commit comments