Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/transform.c
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@ _dispatch_transform_to_utf16(dispatch_data_t data, int32_t byteOrder)
uint32_t wch = 0;
uint8_t byte_size = _dispatch_transform_utf8_length(*src);
size_t next;
// Capture the logical start position of this character before updating i
size_t char_start = offset + i;

if (byte_size == 0) {
return (bool)false;
Expand Down Expand Up @@ -359,7 +361,7 @@ _dispatch_transform_to_utf16(dispatch_data_t data, int32_t byteOrder)
if (os_mul_overflow(size - i, sizeof(uint16_t), &next)) {
return (bool)false;
}
if (wch == 0xfeff && offset + i == 3) {
if (wch == 0xfeff && char_start == 0) {
// skip the BOM if any, as we already inserted one ourselves
} else if (wch >= 0xd800 && wch < 0xdfff) {
// Illegal range (surrogate pair)
Expand Down
49 changes: 49 additions & 0 deletions tests/dispatch_transform.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,55 @@ utf8_to_utf16le_test(void * context)
dispatch_release(utf16_data);
dispatch_release(utf8_data);

dispatch_group_async_f(context, dispatch_get_main_queue(), context, utf8_bom_to_utf16le_test);
}

/*
 * Exercises UTF-8 -> UTF-16LE conversion for input containing the byte
 * sequence EF BB BF (U+FEFF encoded as UTF-8), in two positions:
 *
 *   1. at the very start of the input, where the output is expected to hold
 *      a single 0xFEFF followed by the payload;
 *   2. after another character, where the output is expected to hold a
 *      leading 0xFEFF, the payload, and then a second 0xFEFF.
 *
 * When both checks have run, utf8_to_utf16be_test is submitted to the main
 * queue, with `context` passed as both the group and the function context.
 *
 * NOTE(review): the expected buffers are uint16_t arrays compared against
 * UTF16LE output, so this presumably assumes a little-endian host - confirm.
 */
void
utf8_bom_to_utf16le_test(void * context)
{
	// Case 1: EF BB BF at offset 0, followed by "A".
	{
		static uint8_t leading_bom_input[] = {
			0xef, 0xbb, 0xbf, 0x41,
		};
		// One 0xFEFF only, then "A": the test expects the input's own
		// U+FEFF not to produce a second one.
		static uint16_t leading_bom_output[] = {
			0xfeff, 0x0041,
		};

		dispatch_data_t source = dispatch_data_create(leading_bom_input, sizeof(leading_bom_input), NULL, ^{});
		dispatch_data_t expected = dispatch_data_create(leading_bom_output, sizeof(leading_bom_output), NULL, ^{});

		dispatch_data_t actual = dispatch_data_create_with_transform(source, DISPATCH_DATA_FORMAT_TYPE_UTF8, DISPATCH_DATA_FORMAT_TYPE_UTF16LE);
		test_ptr_notnull("dispatch_data_create_with_transform (UTF8 with BOM -> UTF16LE)", actual);
		test_data_equal("utf8_bom_to_utf16le_test", actual, expected);

		dispatch_release(actual);
		dispatch_release(expected);
		dispatch_release(source);
	}

	// Case 2: "A" first, then EF BB BF. The sequence is no longer at the
	// start of the input, so it is expected to be carried into the output.
	{
		static uint8_t trailing_bom_input[] = {
			0x41, 0xef, 0xbb, 0xbf,
		};
		// Leading 0xFEFF, "A", then the 0xFEFF that came from the input.
		static uint16_t trailing_bom_output[] = {
			0xfeff, 0x0041, 0xfeff,
		};

		dispatch_data_t source = dispatch_data_create(trailing_bom_input, sizeof(trailing_bom_input), NULL, ^{});
		dispatch_data_t expected = dispatch_data_create(trailing_bom_output, sizeof(trailing_bom_output), NULL, ^{});

		dispatch_data_t actual = dispatch_data_create_with_transform(source, DISPATCH_DATA_FORMAT_TYPE_UTF8, DISPATCH_DATA_FORMAT_TYPE_UTF16LE);
		test_ptr_notnull("dispatch_data_create_with_transform (UTF8 with BOM not at start -> UTF16LE)", actual);
		test_data_equal("utf8_bom_not_at_start_test", actual, expected);

		dispatch_release(actual);
		dispatch_release(expected);
		dispatch_release(source);
	}

	// Hand off to the next test in the chain.
	dispatch_group_async_f(context, dispatch_get_main_queue(), context, utf8_to_utf16be_test);
}

Expand Down