Skip to content

Commit b5f3611

Browse files
committed
fix: encoding detection
1 parent 4456f87 commit b5f3611

File tree

1 file changed

+36
-1
lines changed

1 file changed

+36
-1
lines changed

src/utils/encodings.js

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,40 @@ function detectBOM(bytes) {
5555
return null;
5656
}
5757

58+
/**
 * Checks whether a byte sequence is well-formed UTF-8.
 *
 * Beyond matching lead/continuation bit patterns, this rejects the sequences
 * that the Unicode Standard defines as ill-formed:
 *   - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by 0x80..0x9F,
 *     0xF0 followed by 0x80..0x8F),
 *   - UTF-16 surrogate code points (0xED followed by 0xA0..0xBF),
 *   - code points above U+10FFFF (0xF4 followed by 0x90..0xBF, and lead
 *     bytes 0xF5..0xFF).
 *
 * A multi-byte sequence that is cut off at the end of the input is treated
 * as invalid. An empty input is considered valid.
 *
 * @param {ArrayLike<number>} bytes - Byte values (0..255), e.g. a Uint8Array.
 * @returns {boolean} True when every byte belongs to a well-formed sequence.
 */
function isValidUTF8(bytes) {
  let i = 0;

  while (i < bytes.length) {
    const lead = bytes[i];

    // Single-byte (ASCII) code point.
    if (lead < 0x80) {
      i += 1;
      continue;
    }

    // Number of continuation bytes, plus the allowed range for the first one.
    // Only the first continuation byte ever has a narrowed range.
    let extra;
    let secondMin = 0x80;
    let secondMax = 0xbf;

    if (lead >= 0xc2 && lead <= 0xdf) {
      // 0xC0 and 0xC1 could only produce overlong encodings of ASCII.
      extra = 1;
    } else if (lead >= 0xe0 && lead <= 0xef) {
      extra = 2;
      if (lead === 0xe0) {
        secondMin = 0xa0; // below this the sequence is overlong
      } else if (lead === 0xed) {
        secondMax = 0x9f; // above this is the surrogate range U+D800..U+DFFF
      }
    } else if (lead >= 0xf0 && lead <= 0xf4) {
      extra = 3;
      if (lead === 0xf0) {
        secondMin = 0x90; // below this the sequence is overlong
      } else if (lead === 0xf4) {
        secondMax = 0x8f; // above this exceeds U+10FFFF
      }
    } else {
      // Stray continuation byte (0x80..0xBF) or a lead byte that is never valid.
      return false;
    }

    // The whole sequence must fit inside the input.
    if (i + extra >= bytes.length) {
      return false;
    }

    const second = bytes[i + 1];
    if (second < secondMin || second > secondMax) {
      return false;
    }

    // Remaining continuation bytes must look like 10xxxxxx.
    for (let k = 2; k <= extra; k += 1) {
      if ((bytes[i + k] & 0xc0) !== 0x80) {
        return false;
      }
    }

    i += extra + 1;
  }

  return true;
}
}
91+
5892
export async function detectEncoding(buffer) {
5993
if (!buffer || buffer.byteLength === 0) {
6094
return settings.value.defaultFileEncoding || "UTF-8";
@@ -74,9 +108,10 @@ export async function detectEncoding(buffer) {
74108
else if (byte < 0x80) ascii++;
75109
}
76110

77-
if (ascii / sample.length > 0.95) return "UTF-8";
78111
if (nulls > sample.length * 0.3) return "UTF-16LE";
79112

113+
if (isValidUTF8(sample)) return "UTF-8";
114+
80115
const encodings = [
81116
...new Set([
82117
"UTF-8",

0 commit comments

Comments
 (0)