Skip to content

Commit c8365b0

Browse files
committed
Remove old libicu max collation sentinel hack
For old libicu versions (< 59) on RHEL 6 and 7, we had to add a hack in the NIF driver to handle the max sortable sentinel `<<255,255,255,255>>` [1]. We no longer support those versions, and since version 59 libicu automatically sorts that sentinel as the highest value [2], so we can clean up our collator NIF. [1] #3491 [2] https://www.unicode.org/reports/tr35/tr35-collation.html#tailored-noncharacter-weights > U+FFFF: This code point is tailored to have a primary weight higher than all other characters. This allows the reliable specification of a range, such as “Sch” ≤ X ≤ “Sch\uFFFF”, to include all strings starting with "sch" or equivalent.
1 parent 333ae93 commit c8365b0

2 files changed

Lines changed: 11 additions & 41 deletions

File tree

src/couch/priv/couch_ejson_compare/couch_ejson_compare.c

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@
1818
#include "erl_nif.h"
1919
#include "unicode/ucol.h"
2020
#include "unicode/ucasemap.h"
21+
#include "unicode/uvernum.h"
22+
23+
/* {255,255,255,255} max-key sentinel relies on libicu sorting FFFF byte
24+
* sequence with the highest collation weight. However, that happens starting
25+
* with libicu 59, so we'll explicitly check the version to avoid silently
26+
* mis-collating data */
27+
#if U_ICU_VERSION_MAJOR_NUM < 59
28+
#error "CouchDB requires libicu >= 59 for correct max key collation"
29+
#endif
2130

2231
#define MAX_DEPTH 10
2332

@@ -73,14 +82,8 @@ static __inline int atom_sort_order(ErlNifEnv*, ERL_NIF_TERM);
7382
static __inline int compare_strings(ctx_t*, ErlNifBinary, ErlNifBinary);
7483
static __inline int compare_lists(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
7584
static __inline int compare_props(int, ctx_t*, ERL_NIF_TERM, ERL_NIF_TERM);
76-
static __inline int is_max_utf8_marker(ErlNifBinary);
7785
static __inline UCollator* get_collator(void);
7886

79-
/* Should match the <<255,255,255,255>> in:
80-
* - src/mango/src/mango_idx_view.hrl#L13
81-
* - src/couch_mrview/src/couch_mrview_util.erl#L40 */
82-
static const unsigned char max_utf8_marker[] = {255, 255, 255, 255};
83-
8487

8588
UCollator*
8689
get_collator(void)
@@ -469,47 +472,13 @@ compare_props(int depth, ctx_t* ctx, ERL_NIF_TERM a, ERL_NIF_TERM b)
469472
}
470473

471474

472-
int
473-
is_max_utf8_marker(ErlNifBinary bin)
474-
{
475-
if (bin.size == sizeof(max_utf8_marker)) {
476-
if(memcmp(bin.data, max_utf8_marker, sizeof(max_utf8_marker)) == 0) {
477-
return 1;
478-
}
479-
return 0;
480-
}
481-
return 0;
482-
}
483-
484-
485475
int
486476
compare_strings(ctx_t* ctx, ErlNifBinary a, ErlNifBinary b)
487477
{
488478
UErrorCode status = U_ZERO_ERROR;
489479
UCharIterator iterA, iterB;
490480
int result;
491481

492-
/* libicu versions earlier than 59 (at least) don't consider the
493-
* {255,255,255,255} to be the highest sortable string as CouchDB expects.
494-
* While we are still shipping CentOS 7 packages with libicu 50, we should
495-
* explicitly check for the marker, later on we can remove the max
496-
* logic */
497-
498-
int a_is_max = is_max_utf8_marker(a);
499-
int b_is_max = is_max_utf8_marker(b);
500-
501-
if(a_is_max && b_is_max) {
502-
return 0;
503-
}
504-
505-
if(a_is_max) {
506-
return 1;
507-
}
508-
509-
if(b_is_max) {
510-
return -1;
511-
}
512-
513482
uiter_setUTF8(&iterA, (const char *) a.data, (uint32_t) a.size);
514483
uiter_setUTF8(&iterB, (const char *) b.data, (uint32_t) b.size);
515484

src/couch/test/eunit/couch_ejson_compare_tests.erl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
<<"B">>,
3737
<<"ba">>,
3838
<<"bb">>,
39-
% Highest sorting unicode value. Special case in the nif
39+
% Highest sorting unicode value
40+
% libicu >= 59 sorts it highest natively
4041
?MAX_UNICODE_STRING,
4142
[<<"a">>],
4243
[<<"b">>],

0 commit comments

Comments
 (0)