🐛 处理脚本编码问题 #1115 (#1138)

CodFrm · Copilot · cyfung1031 · web-flow · commit a3abaf063424 · 2026-01-09T11:52:31.000+08:00
* 🐛 处理脚本编码问题 #1115 * 优化脚本安装编码检测性能并添加测试覆盖 (#1139) * Initial plan * 优化编码检测性能并添加完整测试覆盖 Co-authored-by: CodFrm <22783163+CodFrm@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: CodFrm <22783163+CodFrm@users.noreply.github.com> * 修复lint问题 * data.subarray --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: cyfung1031 <44498510+cyfung1031@users.noreply.github.com>
diff --git a/package.json b/package.json
@@ -30,6 +30,7 @@
     "@dnd-kit/modifiers": "^9.0.0",
     "@dnd-kit/sortable": "^10.0.0",
     "@dnd-kit/utilities": "^3.2.2",
+    "chardet": "^2.1.1",
     "cron": "^3.2.1",
     "crypto-js": "^4.2.0",
     "dayjs": "^1.11.13",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/src/pages/install/App.tsx b/src/pages/install/App.tsx
@@ -33,6 +33,7 @@ import { CACHE_KEY_SCRIPT_INFO } from "@App/app/cache_key";
 import { cacheInstance } from "@App/app/cache";
 import { formatBytes, prettyUrl } from "@App/pkg/utils/utils";
 import { ScriptIcons } from "../options/routes/utils";
+import { detectEncoding } from "@App/pkg/utils/encoding";
 
 const backgroundPromptShownKey = "background_prompt_shown";
 
@@ -102,11 +103,6 @@ const fetchScriptBody = async (url: string, { onProgress }: { [key: string]: any
     onProgress?.({ receivedLength });
   }
 
-  // 检查 Content-Type 中的 charset
-  const contentType = response.headers.get("content-type") || "";
-  const charsetMatch = contentType.match(/charset=([^;]+)/i);
-  const charset = charsetMatch ? charsetMatch[1].toLowerCase() : "utf-8";
-
   // 合并分片（chunks）
   const chunksAll = new Uint8Array(receivedLength);
   let position = 0;
@@ -115,12 +111,18 @@ const fetchScriptBody = async (url: string, { onProgress }: { [key: string]: any
     position += chunk.length;
   }
 
+  // 检测编码：优先使用 Content-Type，回退到 chardet（仅检测前16KB）
+  const contentType = response.headers.get("content-type");
+  const encode = detectEncoding(chunksAll, contentType);
+
   // 使用检测到的 charset 解码
   let code;
   try {
-    code = new TextDecoder(charset).decode(chunksAll);
+    code = new TextDecoder(encode).decode(chunksAll);
   } catch (e: any) {
-    throw new Error(`Failed to decode response with charset ${charset}: ${e.message}`);
+    console.warn(`Failed to decode response with charset ${encode}: ${e.message}`);
+    // 回退到 UTF-8
+    code = new TextDecoder("utf-8").decode(chunksAll);
   }
 
   const metadata = parseMetadata(code);
diff --git a/src/pkg/utils/encoding.test.ts b/src/pkg/utils/encoding.test.ts
@@ -0,0 +1,142 @@
+import { describe, it, expect, vi } from "vitest";
+import { parseCharsetFromContentType, detectEncoding } from "./encoding";
+
+describe("encoding detection", () => {
+  describe("parseCharsetFromContentType", () => {
+    it("should extract charset from valid Content-Type header", () => {
+      expect(parseCharsetFromContentType("text/javascript; charset=utf-8")).toBe("utf-8");
+      expect(parseCharsetFromContentType("text/plain; charset=GBK")).toBe("gbk");
+      expect(parseCharsetFromContentType("application/javascript; charset=ISO-8859-1")).toBe("iso-8859-1");
+    });
+
+    it("should handle charset with quotes", () => {
+      expect(parseCharsetFromContentType('text/javascript; charset="utf-8"')).toBe("utf-8");
+      expect(parseCharsetFromContentType("text/javascript; charset='gbk'")).toBe("gbk");
+    });
+
+    it("should handle case-insensitive charset parameter", () => {
+      expect(parseCharsetFromContentType("text/javascript; CHARSET=UTF-8")).toBe("utf-8");
+      expect(parseCharsetFromContentType("text/javascript; Charset=GBK")).toBe("gbk");
+    });
+
+    it("should return null for missing charset", () => {
+      expect(parseCharsetFromContentType("text/javascript")).toBe(null);
+      expect(parseCharsetFromContentType("text/plain; boundary=something")).toBe(null);
+    });
+
+    it("should return null for null or empty input", () => {
+      expect(parseCharsetFromContentType(null)).toBe(null);
+      expect(parseCharsetFromContentType("")).toBe(null);
+    });
+
+    it("should handle charset with additional parameters", () => {
+      expect(parseCharsetFromContentType("text/javascript; charset=utf-8; boundary=xxx")).toBe("utf-8");
+    });
+  });
+
+  describe("detectEncoding", () => {
+    it("should prioritize valid charset from Content-Type header", () => {
+      const utf8Data = new TextEncoder().encode("hello world");
+      expect(detectEncoding(utf8Data, "text/javascript; charset=utf-8")).toBe("utf-8");
+    });
+
+    it("should fallback to chardet when Content-Type header is missing", () => {
+      // UTF-8 编码的中文
+      const utf8Data = new TextEncoder().encode("你好世界");
+      const encoding = detectEncoding(utf8Data, null);
+      expect(encoding).toBe("utf-8");
+    });
+
+    it("should fallback to chardet when Content-Type charset is invalid", () => {
+      const utf8Data = new TextEncoder().encode("hello world");
+      const encoding = detectEncoding(utf8Data, "text/javascript; charset=invalid-encoding");
+      // chardet 可能检测为 utf-8 或 ascii，都是合理的
+      expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
+    });
+
+    it("should fallback to utf-8 when chardet returns null", () => {
+      // 模拟 chardet 返回 null 的情况（空数据）
+      const emptyData = new Uint8Array(0);
+      const encoding = detectEncoding(emptyData, null);
+      // 空数据时，chardet 可能返回 ascii 或其他编码，但都应该是有效的
+      expect(encoding).toBeTruthy();
+      expect(() => new TextDecoder(encoding)).not.toThrow();
+    });
+
+    it("should only use first 16KB for chardet detection", () => {
+      // 创建一个大于 16KB 的数据
+      const largeData = new Uint8Array(20 * 1024);
+      // 填充 UTF-8 编码的数据
+      const text = "a".repeat(20 * 1024);
+      const textBytes = new TextEncoder().encode(text);
+      largeData.set(textBytes.slice(0, largeData.length));
+
+      const encoding = detectEncoding(largeData, null);
+      // 应该成功检测，说明使用了采样
+      expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
+    });
+
+    it("should handle GBK encoded data", () => {
+      // GBK 编码的 "你好" (这是一个简化的测试，实际 GBK 编码更复杂)
+      // 注意：在浏览器环境中，GBK 编码可能被识别为其他兼容编码
+      const gbkLikeData = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]); // "你好" in GBK
+      const encoding = detectEncoding(gbkLikeData, null);
+      // chardet 可能识别为 GBK、Shift_JIS 或相关的东亚编码
+      expect(encoding).toBeTruthy();
+      expect(() => new TextDecoder(encoding)).not.toThrow();
+    });
+
+    it("should handle ISO-8859-1 encoded data", () => {
+      // ISO-8859-1 特有字符（扩展 ASCII）
+      const iso88591Data = new Uint8Array([0xe9, 0xe8, 0xe0, 0xe7]); // é è à ç
+      const encoding = detectEncoding(iso88591Data, null);
+      expect(encoding).toBeTruthy();
+    });
+
+    it("should validate detected encoding is supported by TextDecoder", () => {
+      const utf8Data = new TextEncoder().encode("test");
+      const encoding = detectEncoding(utf8Data, null);
+
+      // 确保返回的编码可以被 TextDecoder 使用
+      expect(() => new TextDecoder(encoding)).not.toThrow();
+    });
+
+    it("should prefer Content-Type charset over chardet detection", () => {
+      // 即使数据看起来像 GBK，如果 Content-Type 指定了 UTF-8，应该使用 UTF-8
+      const data = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]);
+      const encoding = detectEncoding(data, "text/javascript; charset=utf-8");
+      expect(encoding).toBe("utf-8");
+    });
+
+    it("should handle charset with different cases from Content-Type", () => {
+      const data = new TextEncoder().encode("test");
+      expect(detectEncoding(data, "text/javascript; charset=UTF-8")).toBe("utf-8");
+      expect(detectEncoding(data, "text/javascript; charset=Utf-8")).toBe("utf-8");
+      expect(detectEncoding(data, "text/javascript; charset=GBK")).toBe("gbk");
+    });
+
+    it("should handle Windows-1252 encoded data", () => {
+      // Windows-1252 特有字符
+      const win1252Data = new Uint8Array([0x80, 0x82, 0x83, 0x84]); // € ‚ ƒ „
+      const encoding = detectEncoding(win1252Data, null);
+      expect(encoding).toBeTruthy();
+      // chardet 应该能检测出编码或回退到有效的编码
+      // Shift_JIS 也是一个有效的编码，chardet 可能会识别为它
+      expect(["utf-8", "windows-1252", "iso-8859-1", "shift_jis", "ascii"]).toContain(encoding);
+    });
+
+    it("should fallback to utf-8 when chardet detects invalid encoding", () => {
+      // 使用 vi.spyOn 来模拟 console.warn
+      const consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
+
+      const data = new TextEncoder().encode("test");
+      const encoding = detectEncoding(data, null);
+
+      // 应该成功返回一个有效的编码
+      expect(encoding).toBeTruthy();
+      expect(() => new TextDecoder(encoding)).not.toThrow();
+
+      consoleWarnSpy.mockRestore();
+    });
+  });
+});
diff --git a/src/pkg/utils/encoding.ts b/src/pkg/utils/encoding.ts
@@ -0,0 +1,51 @@
+import chardet from "chardet";
+
+/**
+ * 从 Content-Type header 中解析 charset
+ */
+export const parseCharsetFromContentType = (contentType: string | null): string | null => {
+  if (!contentType) return null;
+
+  const match = contentType.match(/charset=([^;]+)/i);
+  if (match && match[1]) {
+    return match[1].trim().toLowerCase().replace(/['"]/g, "");
+  }
+  return null;
+};
+
+/**
+ * 检测字节数组的编码
+ * 优先使用 Content-Type header，失败时使用 chardet（仅对前16KB检测以提升性能）
+ */
+export const detectEncoding = (data: Uint8Array, contentType: string | null): string => {
+  // 优先尝试使用 Content-Type header 中的 charset
+  const headerCharset = parseCharsetFromContentType(contentType);
+  if (headerCharset) {
+    try {
+      // 验证 charset 是否有效
+      new TextDecoder(headerCharset);
+      return headerCharset;
+    } catch (e: any) {
+      console.warn(`Invalid charset from Content-Type header: ${headerCharset}, error: ${e.message}`);
+    }
+  }
+
+  // 使用 chardet 检测编码，仅检测前16KB以提升性能
+  const sampleSize = Math.min(data.length, 16 * 1024);
+  const sample = data.subarray(0, sampleSize);
+  const detected = chardet.detect(sample);
+
+  if (detected) {
+    const encoding = detected.toLowerCase();
+    try {
+      // 验证检测到的编码是否有效
+      new TextDecoder(encoding);
+      return encoding;
+    } catch (e: any) {
+      console.warn(`Invalid charset detected by chardet: ${encoding}, error: ${e.message}`);
+    }
+  }
+
+  // 回退到 UTF-8
+  return "utf-8";
+};