Skip to content

Commit bacba96

Browse files
committed
diff: add utf-{16,32} binary detection
When we examine a diff_filespec for binary-ness, we can now also determine whether it's UTF-16 or UTF-32 by looking for a BOM (byte-order mark). Those encodings are still considered binary, but future patches will add callers which handle them specially.

Note that even though we know the endianness of the encoding from the BOM, we do not include it in the returned name. This is because "iconv" (which we'll ultimately feed these names to) does not expect to see a BOM if we tell it the source is "UTF-16LE". We should feed it "UTF-16" with the BOM, which it will interpret itself.

Signed-off-by: Jeff King <peff@peff.net>
1 parent 2d56119 commit bacba96

2 files changed

Lines changed: 25 additions & 1 deletion

File tree

diff.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3685,6 +3685,27 @@ static void emit_binary_diff(struct diff_options *o,
36853685
emit_binary_diff_body(o, two, one);
36863686
}
36873687

3688+
/*
 * Inspect the first bytes of a buffer for a UTF-16 or UTF-32 byte-order
 * mark (BOM).
 *
 * vdata: start of the buffer; it is not dereferenced when size < 2.
 * size:  number of readable bytes at vdata.
 *
 * Returns the string "UTF-32" or "UTF-16" when the buffer begins with the
 * corresponding big- or little-endian BOM, and NULL otherwise. The returned
 * name never carries an endianness suffix. The result is a string literal
 * and must not be freed.
 */
static const char *buffer_has_utf_bom(const void *vdata, size_t size)
{
	const unsigned char *p = vdata;
	int bom16_be, bom16_le;

	/* Every BOM recognized here is at least two bytes long. */
	if (size < 2)
		return NULL;

	bom16_be = (p[0] == 0xfe && p[1] == 0xff);
	bom16_le = (p[0] == 0xff && p[1] == 0xfe);

	/*
	 * UTF-32 has to be tested before UTF-16: the little-endian UTF-32
	 * BOM (ff fe 00 00) begins with the little-endian UTF-16 BOM.
	 */
	if (size >= 4) {
		if (p[0] == 0x00 && p[1] == 0x00 &&
		    p[2] == 0xfe && p[3] == 0xff)
			return "UTF-32";
		if (bom16_le && p[2] == 0x00 && p[3] == 0x00)
			return "UTF-32";
	}

	if (bom16_be || bom16_le)
		return "UTF-16";

	return NULL;
}
3708+
36883709
enum diff_content diff_filespec_content_type(struct repository *r,
36893710
struct diff_filespec *one)
36903711
{
@@ -3702,6 +3723,8 @@ enum diff_content diff_filespec_content_type(struct repository *r,
37023723
if (one->content_type == DIFF_CONTENT_UNKNOWN && one->data) {
37033724
if (!buffer_is_binary(one->data, one->size))
37043725
one->content_type = DIFF_CONTENT_TEXT;
3726+
else if (buffer_has_utf_bom(one->data, one->size))
3727+
one->content_type = DIFF_CONTENT_UTF;
37053728
else
37063729
one->content_type = DIFF_CONTENT_BINARY;
37073730
}

diffcore.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ struct diff_filespec {
7373
enum diff_content {
7474
DIFF_CONTENT_UNKNOWN = -1,
7575
DIFF_CONTENT_TEXT = 0,
76-
DIFF_CONTENT_BINARY = 1
76+
DIFF_CONTENT_BINARY = 1,
77+
DIFF_CONTENT_UTF
7778
} content_type;
7879
struct userdiff_driver *driver;
7980
};

0 commit comments

Comments
 (0)