Skip to content

Commit a3abaf0

Browse files
CodFrmCopilotcyfung1031
authored
🐛 处理脚本编码问题 #1115 (#1138)
* 🐛 处理脚本编码问题 #1115 * 优化脚本安装编码检测性能并添加测试覆盖 (#1139) * Initial plan * 优化编码检测性能并添加完整测试覆盖 Co-authored-by: CodFrm <22783163+CodFrm@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: CodFrm <22783163+CodFrm@users.noreply.github.com> * 修复lint问题 * data.subarray --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: cyfung1031 <44498510+cyfung1031@users.noreply.github.com>
1 parent f92d50f commit a3abaf0

5 files changed

Lines changed: 226 additions & 7 deletions

File tree

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
"@dnd-kit/modifiers": "^9.0.0",
3131
"@dnd-kit/sortable": "^10.0.0",
3232
"@dnd-kit/utilities": "^3.2.2",
33+
"chardet": "^2.1.1",
3334
"cron": "^3.2.1",
3435
"crypto-js": "^4.2.0",
3536
"dayjs": "^1.11.13",

pnpm-lock.yaml

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/pages/install/App.tsx

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import { CACHE_KEY_SCRIPT_INFO } from "@App/app/cache_key";
3333
import { cacheInstance } from "@App/app/cache";
3434
import { formatBytes, prettyUrl } from "@App/pkg/utils/utils";
3535
import { ScriptIcons } from "../options/routes/utils";
36+
import { detectEncoding } from "@App/pkg/utils/encoding";
3637

3738
const backgroundPromptShownKey = "background_prompt_shown";
3839

@@ -102,11 +103,6 @@ const fetchScriptBody = async (url: string, { onProgress }: { [key: string]: any
102103
onProgress?.({ receivedLength });
103104
}
104105

105-
// 检查 Content-Type 中的 charset
106-
const contentType = response.headers.get("content-type") || "";
107-
const charsetMatch = contentType.match(/charset=([^;]+)/i);
108-
const charset = charsetMatch ? charsetMatch[1].toLowerCase() : "utf-8";
109-
110106
// 合并分片(chunks)
111107
const chunksAll = new Uint8Array(receivedLength);
112108
let position = 0;
@@ -115,12 +111,18 @@ const fetchScriptBody = async (url: string, { onProgress }: { [key: string]: any
115111
position += chunk.length;
116112
}
117113

114+
// 检测编码:优先使用 Content-Type,回退到 chardet(仅检测前16KB)
115+
const contentType = response.headers.get("content-type");
116+
const encode = detectEncoding(chunksAll, contentType);
117+
118118
// 使用检测到的 charset 解码
119119
let code;
120120
try {
121-
code = new TextDecoder(charset).decode(chunksAll);
121+
code = new TextDecoder(encode).decode(chunksAll);
122122
} catch (e: any) {
123-
throw new Error(`Failed to decode response with charset ${charset}: ${e.message}`);
123+
console.warn(`Failed to decode response with charset ${encode}: ${e.message}`);
124+
// 回退到 UTF-8
125+
code = new TextDecoder("utf-8").decode(chunksAll);
124126
}
125127

126128
const metadata = parseMetadata(code);

src/pkg/utils/encoding.test.ts

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import { describe, it, expect, vi } from "vitest";
2+
import { parseCharsetFromContentType, detectEncoding } from "./encoding";
3+
4+
// Test suite for the encoding helpers: parseCharsetFromContentType (header parsing)
// and detectEncoding (header charset first, then chardet, then "utf-8").
describe("encoding detection", () => {
5+
// Header parsing: the charset value is expected lowercased, with quotes removed,
// and null when no charset parameter is present.
describe("parseCharsetFromContentType", () => {
6+
it("should extract charset from valid Content-Type header", () => {
7+
expect(parseCharsetFromContentType("text/javascript; charset=utf-8")).toBe("utf-8");
8+
expect(parseCharsetFromContentType("text/plain; charset=GBK")).toBe("gbk");
9+
expect(parseCharsetFromContentType("application/javascript; charset=ISO-8859-1")).toBe("iso-8859-1");
10+
});
11+
12+
it("should handle charset with quotes", () => {
13+
expect(parseCharsetFromContentType('text/javascript; charset="utf-8"')).toBe("utf-8");
14+
expect(parseCharsetFromContentType("text/javascript; charset='gbk'")).toBe("gbk");
15+
});
16+
17+
it("should handle case-insensitive charset parameter", () => {
18+
expect(parseCharsetFromContentType("text/javascript; CHARSET=UTF-8")).toBe("utf-8");
19+
expect(parseCharsetFromContentType("text/javascript; Charset=GBK")).toBe("gbk");
20+
});
21+
22+
it("should return null for missing charset", () => {
23+
expect(parseCharsetFromContentType("text/javascript")).toBe(null);
24+
expect(parseCharsetFromContentType("text/plain; boundary=something")).toBe(null);
25+
});
26+
27+
it("should return null for null or empty input", () => {
28+
expect(parseCharsetFromContentType(null)).toBe(null);
29+
expect(parseCharsetFromContentType("")).toBe(null);
30+
});
31+
32+
it("should handle charset with additional parameters", () => {
33+
expect(parseCharsetFromContentType("text/javascript; charset=utf-8; boundary=xxx")).toBe("utf-8");
34+
});
35+
});
36+
37+
// Detection: several cases below only assert that the result is truthy and/or
// accepted by TextDecoder, because the exact chardet output is not pinned.
describe("detectEncoding", () => {
38+
it("should prioritize valid charset from Content-Type header", () => {
39+
const utf8Data = new TextEncoder().encode("hello world");
40+
expect(detectEncoding(utf8Data, "text/javascript; charset=utf-8")).toBe("utf-8");
41+
});
42+
43+
it("should fallback to chardet when Content-Type header is missing", () => {
44+
// Chinese text encoded as UTF-8
45+
const utf8Data = new TextEncoder().encode("你好世界");
46+
const encoding = detectEncoding(utf8Data, null);
47+
expect(encoding).toBe("utf-8");
48+
});
49+
50+
it("should fallback to chardet when Content-Type charset is invalid", () => {
51+
const utf8Data = new TextEncoder().encode("hello world");
52+
const encoding = detectEncoding(utf8Data, "text/javascript; charset=invalid-encoding");
53+
// chardet may report utf-8 or ascii; either is acceptable
54+
expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
55+
});
56+
57+
// NOTE(review): chardet is not mocked here, so this does not force a null result and
// does not assert "utf-8"; it only checks that the returned label is usable.
it("should fallback to utf-8 when chardet returns null", () => {
58+
// Approximates the "chardet returns null" case by passing empty data
59+
const emptyData = new Uint8Array(0);
60+
const encoding = detectEncoding(emptyData, null);
61+
// For empty data chardet may report ascii or another encoding, but the result must be valid
62+
expect(encoding).toBeTruthy();
63+
expect(() => new TextDecoder(encoding)).not.toThrow();
64+
});
65+
66+
// NOTE(review): nothing here observes what is passed to chardet, so the 16KB sampling
// itself is not verified; the test only checks detection on a 20KB input.
it("should only use first 16KB for chardet detection", () => {
67+
// Build a payload larger than 16KB
68+
const largeData = new Uint8Array(20 * 1024);
69+
// Fill it with UTF-8 encoded data
70+
const text = "a".repeat(20 * 1024);
71+
const textBytes = new TextEncoder().encode(text);
72+
largeData.set(textBytes.slice(0, largeData.length));
73+
74+
const encoding = detectEncoding(largeData, null);
75+
// Detection is expected to succeed (original author's intent: this indicates sampling was used)
76+
expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
77+
});
78+
79+
it("should handle GBK encoded data", () => {
80+
// "你好" encoded as GBK (a simplified test; real GBK content is more complex)
81+
// Note: in a browser environment GBK may be identified as another compatible encoding
82+
const gbkLikeData = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]); // "你好" in GBK
83+
const encoding = detectEncoding(gbkLikeData, null);
84+
// chardet may identify this as GBK, Shift_JIS, or a related East Asian encoding
85+
expect(encoding).toBeTruthy();
86+
expect(() => new TextDecoder(encoding)).not.toThrow();
87+
});
88+
89+
it("should handle ISO-8859-1 encoded data", () => {
90+
// Characters specific to ISO-8859-1 (extended ASCII)
91+
const iso88591Data = new Uint8Array([0xe9, 0xe8, 0xe0, 0xe7]); // é è à ç
92+
const encoding = detectEncoding(iso88591Data, null);
93+
expect(encoding).toBeTruthy();
94+
});
95+
96+
it("should validate detected encoding is supported by TextDecoder", () => {
97+
const utf8Data = new TextEncoder().encode("test");
98+
const encoding = detectEncoding(utf8Data, null);
99+
100+
// Ensure the returned encoding can be used by TextDecoder
101+
expect(() => new TextDecoder(encoding)).not.toThrow();
102+
});
103+
104+
it("should prefer Content-Type charset over chardet detection", () => {
105+
// Even if the data looks like GBK, UTF-8 must be used when Content-Type specifies UTF-8
106+
const data = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]);
107+
const encoding = detectEncoding(data, "text/javascript; charset=utf-8");
108+
expect(encoding).toBe("utf-8");
109+
});
110+
111+
it("should handle charset with different cases from Content-Type", () => {
112+
const data = new TextEncoder().encode("test");
113+
expect(detectEncoding(data, "text/javascript; charset=UTF-8")).toBe("utf-8");
114+
expect(detectEncoding(data, "text/javascript; charset=Utf-8")).toBe("utf-8");
115+
expect(detectEncoding(data, "text/javascript; charset=GBK")).toBe("gbk");
116+
});
117+
118+
it("should handle Windows-1252 encoded data", () => {
119+
// Characters specific to Windows-1252
120+
const win1252Data = new Uint8Array([0x80, 0x82, 0x83, 0x84]); // € ‚ ƒ „
121+
const encoding = detectEncoding(win1252Data, null);
122+
expect(encoding).toBeTruthy();
123+
// chardet should either detect an encoding or fall back to a valid one
124+
// Shift_JIS is also a valid encoding and chardet may report it
125+
expect(["utf-8", "windows-1252", "iso-8859-1", "shift_jis", "ascii"]).toContain(encoding);
126+
});
127+
128+
// NOTE(review): console.warn is stubbed but never asserted on, and chardet is not forced
// to return an unsupported label, so the fallback path named in the title is not exercised.
it("should fallback to utf-8 when chardet detects invalid encoding", () => {
129+
// Use vi.spyOn to stub console.warn
130+
const consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
131+
132+
const data = new TextEncoder().encode("test");
133+
const encoding = detectEncoding(data, null);
134+
135+
// A valid encoding should be returned successfully
136+
expect(encoding).toBeTruthy();
137+
expect(() => new TextDecoder(encoding)).not.toThrow();
138+
139+
consoleWarnSpy.mockRestore();
140+
});
141+
});
142+
});

src/pkg/utils/encoding.ts

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import chardet from "chardet";
2+
3+
/**
 * Extracts the `charset` parameter from a Content-Type header value.
 *
 * The parameter name is matched case-insensitively and only at a parameter
 * boundary (start of the string, after `;`, or after whitespace), so a
 * parameter whose name merely ends in "charset" (e.g. `x-charset=gbk`) is ignored.
 * Optional whitespace around `=` is tolerated. Single and double quotes are
 * removed from the value, which is then trimmed and lowercased.
 *
 * @param contentType - Raw Content-Type header value, or null when the header is absent.
 * @returns The lowercased charset label, or null when the header is missing/empty,
 *   has no charset parameter, or the charset value is empty after cleanup.
 */
export const parseCharsetFromContentType = (contentType: string | null): string | null => {
  if (!contentType) return null;

  // The value runs up to the next ";" (or the end of the header).
  const match = /(?:^|[;\s])charset\s*=\s*([^;]*)/i.exec(contentType);
  const rawValue = match?.[1];
  if (rawValue === undefined) return null;

  // Strip quotes before trimming so whitespace inside the quotes is removed as well.
  const charset = rawValue.replace(/['"]/g, "").trim().toLowerCase();

  // An empty value (e.g. `charset=""`) carries no information; report it as missing.
  return charset === "" ? null : charset;
};
15+
16+
/**
17+
* 检测字节数组的编码
18+
* 优先使用 Content-Type header,失败时使用 chardet(仅对前16KB检测以提升性能)
19+
*/
20+
export const detectEncoding = (data: Uint8Array, contentType: string | null): string => {
21+
// 优先尝试使用 Content-Type header 中的 charset
22+
const headerCharset = parseCharsetFromContentType(contentType);
23+
if (headerCharset) {
24+
try {
25+
// 验证 charset 是否有效
26+
new TextDecoder(headerCharset);
27+
return headerCharset;
28+
} catch (e: any) {
29+
console.warn(`Invalid charset from Content-Type header: ${headerCharset}, error: ${e.message}`);
30+
}
31+
}
32+
33+
// 使用 chardet 检测编码,仅检测前16KB以提升性能
34+
const sampleSize = Math.min(data.length, 16 * 1024);
35+
const sample = data.subarray(0, sampleSize);
36+
const detected = chardet.detect(sample);
37+
38+
if (detected) {
39+
const encoding = detected.toLowerCase();
40+
try {
41+
// 验证检测到的编码是否有效
42+
new TextDecoder(encoding);
43+
return encoding;
44+
} catch (e: any) {
45+
console.warn(`Invalid charset detected by chardet: ${encoding}, error: ${e.message}`);
46+
}
47+
}
48+
49+
// 回退到 UTF-8
50+
return "utf-8";
51+
};

0 commit comments

Comments
 (0)