Skip to content

Commit 5aec6b6

Browse files
committed
diff: add autoencode feature
This feature auto-converts UTF-16 and UTF-32 content into UTF-8 for diffing, as long as:

1. textconv is enabled
2. the autoencode feature is enabled (which defaults to off for now)
3. the file does not have a configured textconv filter

TODO:
- documentation
- tests
- address the FIXME

Signed-off-by: Jeff King <peff@peff.net>
1 parent bacba96 commit 5aec6b6

2 files changed

Lines changed: 43 additions & 1 deletion

File tree

diff.c

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ static struct diff_options default_diff_options;
7878
static long diff_algorithm;
7979
static unsigned ws_error_highlight_default = WSEH_NEW;
8080

81+
static struct userdiff_textconv autoencode_textconv = { "autoencode" };
82+
8183
static char diff_colors[][COLOR_MAXLEN] = {
8284
GIT_COLOR_RESET,
8385
GIT_COLOR_NORMAL, /* CONTEXT */
@@ -3770,14 +3772,22 @@ struct userdiff_textconv *diff_get_textconv(struct repository *r,
37703772
struct diff_options *opt,
37713773
struct diff_filespec *one)
37723774
{
3775+
struct userdiff_textconv *textconv;
3776+
37733777
if (!opt->flags.allow_textconv)
37743778
return NULL;
37753779

37763780
if (!DIFF_FILE_VALID(one))
37773781
return NULL;
37783782

37793783
diff_filespec_load_driver(one, r->index);
3780-
return userdiff_get_textconv(r, one->driver);
3784+
textconv = userdiff_get_textconv(r, one->driver);
3785+
3786+
if (!textconv && opt->flags.allow_autoencode &&
3787+
diff_filespec_content_type(r, one) == DIFF_CONTENT_UTF)
3788+
textconv = &autoencode_textconv;
3789+
3790+
return textconv;
37813791
}
37823792

37833793
static struct string_list *additional_headers(struct diff_options *o,
@@ -6266,6 +6276,8 @@ struct option *add_diff_options(const struct option *opts,
62666276
OPT_CALLBACK_F(0, "textconv", options, NULL,
62676277
N_("run external text conversion filters when comparing binary files"),
62686278
PARSE_OPT_NOARG, diff_opt_textconv),
6279+
OPT_BOOL(0, "autoencode", &options->flags.allow_autoencode,
6280+
N_("allow automatic encoding conversion")),
62696281
OPT_CALLBACK_F(0, "ignore-submodules", options, N_("<when>"),
62706282
N_("ignore changes to submodules in the diff generation"),
62716283
PARSE_OPT_NONEG | PARSE_OPT_OPTARG,
@@ -7806,6 +7818,35 @@ size_t fill_textconv(struct repository *r,
78067818
return 0;
78077819
}
78087820

7821+
if (textconv == &autoencode_textconv) {
7822+
size_t outsize;
7823+
const char *from_encoding;
7824+
7825+
if (diff_populate_filespec(r, df, NULL))
7826+
die("unable to read files to diff");
7827+
7828+
from_encoding = buffer_has_utf_bom(df->data, df->size);
7829+
if (!from_encoding)
7830+
BUG("autoencode triggered for non-utf content");
7831+
7832+
*outbuf = reencode_string_len(df->data, df->size,
7833+
"UTF-8", from_encoding,
7834+
&outsize);
7835+
7836+
/*
7837+
* FIXME Our encoding guess failed. It's too late to return
7838+
* the original content, since the caller has already decided
7839+
* not to treat the contents as binary. But we could perhaps
7840+
* give some munged text form (e.g., by escaping high-bit
7841+
* characters and NULs).
7842+
*/
7843+
if (!*outbuf)
7844+
die_errno("unable to reencode from %s for path '%s'",
7845+
from_encoding, df->path);
7846+
7847+
return outsize;
7848+
}
7849+
78097850
if (!textconv) {
78107851
if (diff_populate_filespec(r, df, NULL))
78117852
die("unable to read files to diff");

diff.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ struct diff_flags {
194194
unsigned dirstat_by_file;
195195
unsigned allow_textconv;
196196
unsigned textconv_set_via_cmdline;
197+
unsigned allow_autoencode;
197198
unsigned diff_from_contents;
198199
unsigned dirty_submodules;
199200
unsigned ignore_untracked_in_submodules;

0 commit comments

Comments
 (0)