Skip to content

Commit 6511b5d

Browse files
committed
feat: basic auto encoding detection (#787)
1 parent 37c3fac commit 6511b5d

File tree

2 files changed

+89
-8
lines changed

2 files changed

+89
-8
lines changed

src/lib/openFile.js

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import alert from "dialogs/alert";
44
import confirm from "dialogs/confirm";
55
import loader from "dialogs/loader";
66
import { reopenWithNewEncoding } from "palettes/changeEncoding";
7-
import { decode } from "utils/encodings";
7+
import { decode, detectEncoding } from "utils/encodings";
88
import helpers from "utils/helpers";
99
import EditorFile from "./editorFile";
1010
import fileTypeHandler from "./fileTypeHandler";
@@ -84,7 +84,7 @@ export default async function openFile(file, options = {}) {
8484
const fileInfo = await fs.stat();
8585
const name = fileInfo.name || file.filename || uri;
8686
const readOnly = fileInfo.canWrite ? false : true;
87-
const createEditor = (isUnsaved, text) => {
87+
const createEditor = (isUnsaved, text, detectedEncoding) => {
8888
new EditorFile(name, {
8989
uri,
9090
text,
@@ -93,7 +93,7 @@ export default async function openFile(file, options = {}) {
9393
render,
9494
onsave,
9595
readOnly,
96-
encoding,
96+
encoding: detectedEncoding || encoding,
9797
SAFMode: mode,
9898
});
9999
};
@@ -385,12 +385,21 @@ export default async function openFile(file, options = {}) {
385385
}
386386

387387
const binData = await fs.readFile();
388-
const fileContent = await decode(
389-
binData,
390-
file.encoding || appSettings.value.defaultFileEncoding,
391-
);
392388

393-
createEditor(false, fileContent);
389+
// Detect encoding if not explicitly provided
390+
let detectedEncoding = file.encoding || encoding;
391+
if (!detectedEncoding) {
392+
try {
393+
detectedEncoding = await detectEncoding(binData);
394+
} catch (error) {
395+
console.warn("Encoding detection failed, using default:", error);
396+
detectedEncoding = appSettings.value.defaultFileEncoding;
397+
}
398+
}
399+
400+
const fileContent = await decode(binData, detectedEncoding);
401+
402+
createEditor(false, fileContent, detectedEncoding);
394403
if (mode !== "single") recents.addFile(uri);
395404
return;
396405
} catch (error) {

src/utils/encodings.js

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,78 @@ export function getEncoding(charset) {
4040
return encodings["UTF-8"];
4141
}
4242

43+
/**
 * Identifies a Unicode byte-order mark at the start of a byte sequence.
 *
 * Only the UTF-8, UTF-16LE and UTF-16BE marks are recognized.
 *
 * @param {Uint8Array} bytes Raw bytes to inspect.
 * @returns {string|null} Encoding name implied by the mark, or `null` when
 * the bytes do not start with a recognized mark.
 */
function detectBOM(bytes) {
  // Each entry pairs an encoding name with the exact prefix that marks it.
  const signatures = [
    ["UTF-8", [0xef, 0xbb, 0xbf]],
    ["UTF-16LE", [0xff, 0xfe]],
    ["UTF-16BE", [0xfe, 0xff]],
  ];

  for (const [label, prefix] of signatures) {
    const startsWithPrefix =
      bytes.length >= prefix.length &&
      prefix.every((value, index) => bytes[index] === value);
    if (startsWithPrefix) return label;
  }

  return null;
}
57+
58+
/**
 * Drops a trailing UTF-8 multi-byte sequence that was cut off by sampling.
 *
 * A sample taken from the head of a larger file can end in the middle of a
 * character; decoding it would yield U+FFFD even though the file is valid
 * UTF-8. Only the final (at most 3-byte) partial sequence is removed.
 *
 * @param {Uint8Array} bytes Sampled bytes.
 * @returns {Uint8Array} `bytes` itself, or a shorter view onto the same
 * memory that ends on a character boundary.
 */
function trimIncompleteUtf8Tail(bytes) {
  // Walk back over continuation bytes (10xxxxxx) to reach the lead byte.
  let leadIndex = bytes.length - 1;
  let continuationCount = 0;
  while (
    leadIndex >= 0 &&
    continuationCount < 3 &&
    (bytes[leadIndex] & 0xc0) === 0x80
  ) {
    leadIndex--;
    continuationCount++;
  }
  if (leadIndex < 0) return bytes;

  const lead = bytes[leadIndex];
  let expectedLength = 0; // 0: not a multi-byte lead, nothing to trim
  if (lead >= 0xc0 && lead <= 0xdf) expectedLength = 2;
  else if (lead >= 0xe0 && lead <= 0xef) expectedLength = 3;
  else if (lead >= 0xf0 && lead <= 0xf7) expectedLength = 4;

  const availableLength = bytes.length - leadIndex;
  return availableLength < expectedLength
    ? bytes.subarray(0, leadIndex)
    : bytes;
}

/**
 * Guesses the text encoding of a binary buffer.
 *
 * Strategy, in order:
 *  1. Empty input: the configured default encoding (or "UTF-8").
 *  2. A byte-order mark decides immediately.
 *  3. Byte statistics over the first 2048 bytes: a sample with no bytes
 *     >= 0x80 that is more than 95% non-null ASCII is reported as UTF-8; a
 *     sample with more than 30% null bytes is reported as UTF-16, with the
 *     byte order inferred from whether the nulls sit at even or odd offsets.
 *  4. Trial decoding of the first 512 bytes with UTF-8, the configured
 *     default, windows-1252 and ISO-8859-1; the first candidate producing no
 *     replacement or control characters wins.
 *  5. Otherwise the configured default encoding (or "UTF-8").
 *
 * @param {ArrayBuffer} buffer File contents.
 * @returns {Promise<string>} Name of the detected encoding.
 */
export async function detectEncoding(buffer) {
  const fallbackEncoding = settings.value.defaultFileEncoding || "UTF-8";
  if (!buffer || buffer.byteLength === 0) return fallbackEncoding;

  const bytes = new Uint8Array(buffer);

  const bomEncoding = detectBOM(bytes);
  if (bomEncoding) return bomEncoding;

  const sample = bytes.subarray(0, 2048);
  let evenNulls = 0;
  let oddNulls = 0;
  let ascii = 0;
  let highBytes = 0;

  for (let index = 0; index < sample.length; index++) {
    const byte = sample[index];
    if (byte === 0) {
      if (index % 2 === 0) evenNulls++;
      else oddNulls++;
    } else if (byte < 0x80) {
      ascii++;
    } else {
      highBytes++;
    }
  }

  // The ASCII shortcut is only safe when no byte >= 0x80 is present;
  // otherwise a mostly-English Latin-1 file would be mislabelled as UTF-8
  // without ever being validated.
  if (highBytes === 0 && ascii / sample.length > 0.95) return "UTF-8";

  if (evenNulls + oddNulls > sample.length * 0.3) {
    // ASCII-range text in UTF-16 has its zero byte first in big-endian
    // (even offsets) and second in little-endian (odd offsets).
    return evenNulls > oddNulls ? "UTF-16BE" : "UTF-16LE";
  }

  const candidates = [
    ...new Set(["UTF-8", fallbackEncoding, "windows-1252", "ISO-8859-1"]),
  ];

  const head = sample.subarray(0, 512);
  // Only a truncated sample can end mid-character; a complete file that ends
  // in a stray lead byte must still be rejected as UTF-8.
  const testSample =
    bytes.length > head.length ? trimIncompleteUtf8Tail(head) : head;
  const testBuffer = testSample.buffer.slice(
    testSample.byteOffset,
    testSample.byteOffset + testSample.byteLength,
  );

  const controlChars = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/;

  for (const candidate of candidates) {
    try {
      const encodingObj = getEncoding(candidate);
      if (!encodingObj) continue;

      const text = await execDecode(testBuffer, encodingObj.name);
      if (!text.includes("\uFFFD") && !controlChars.test(text)) {
        return candidate;
      }
    } catch (error) {
      // Best effort: a candidate the decoder cannot handle is simply skipped.
      continue;
    }
  }

  return fallbackEncoding;
}
114+
43115
/**
44116
* Decodes arrayBuffer to String according given encoding type
45117
* @param {ArrayBuffer} buffer

0 commit comments

Comments
 (0)