|
| 1 | +import { describe, it, expect, vi } from "vitest"; |
| 2 | +import { parseCharsetFromContentType, detectEncoding } from "./encoding"; |
| 3 | + |
// Test suite for the encoding helpers imported from "./encoding":
//   - parseCharsetFromContentType: extracts the charset parameter from a
//     Content-Type header value.
//   - detectEncoding: picks an encoding for a byte buffer, given an optional
//     Content-Type header value.
describe("encoding detection", () => {
  describe("parseCharsetFromContentType", () => {
    it("should extract charset from valid Content-Type header", () => {
      // The expected values are lower-cased regardless of the header's casing.
      expect(parseCharsetFromContentType("text/javascript; charset=utf-8")).toBe("utf-8");
      expect(parseCharsetFromContentType("text/plain; charset=GBK")).toBe("gbk");
      expect(parseCharsetFromContentType("application/javascript; charset=ISO-8859-1")).toBe("iso-8859-1");
    });

    it("should handle charset with quotes", () => {
      // Both double- and single-quoted values are expected to be unwrapped.
      expect(parseCharsetFromContentType('text/javascript; charset="utf-8"')).toBe("utf-8");
      expect(parseCharsetFromContentType("text/javascript; charset='gbk'")).toBe("gbk");
    });

    it("should handle case-insensitive charset parameter", () => {
      expect(parseCharsetFromContentType("text/javascript; CHARSET=UTF-8")).toBe("utf-8");
      expect(parseCharsetFromContentType("text/javascript; Charset=GBK")).toBe("gbk");
    });

    it("should return null for missing charset", () => {
      expect(parseCharsetFromContentType("text/javascript")).toBe(null);
      // Other parameters must not be mistaken for a charset.
      expect(parseCharsetFromContentType("text/plain; boundary=something")).toBe(null);
    });

    it("should return null for null or empty input", () => {
      expect(parseCharsetFromContentType(null)).toBe(null);
      expect(parseCharsetFromContentType("")).toBe(null);
    });

    it("should handle charset with additional parameters", () => {
      // Trailing parameters after the charset must not leak into the result.
      expect(parseCharsetFromContentType("text/javascript; charset=utf-8; boundary=xxx")).toBe("utf-8");
    });
  });

  describe("detectEncoding", () => {
    it("should prioritize valid charset from Content-Type header", () => {
      const utf8Data = new TextEncoder().encode("hello world");
      expect(detectEncoding(utf8Data, "text/javascript; charset=utf-8")).toBe("utf-8");
    });

    it("should fallback to chardet when Content-Type header is missing", () => {
      // Chinese text encoded as UTF-8.
      const utf8Data = new TextEncoder().encode("你好世界");
      const encoding = detectEncoding(utf8Data, null);
      expect(encoding).toBe("utf-8");
    });

    it("should fallback to chardet when Content-Type charset is invalid", () => {
      const utf8Data = new TextEncoder().encode("hello world");
      const encoding = detectEncoding(utf8Data, "text/javascript; charset=invalid-encoding");
      // For pure ASCII bytes the detector may report utf-8, ascii or
      // windows-1252; any of them is an acceptable result here.
      expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
    });

    it("should fallback to utf-8 when chardet returns null", () => {
      // Empty input is used as a stand-in for "detector has nothing to report".
      // NOTE(review): chardet is not mocked, so this does not prove the
      // null -> utf-8 path is taken; it only checks that the result is usable.
      const emptyData = new Uint8Array(0);
      const encoding = detectEncoding(emptyData, null);
      expect(encoding).toBeTruthy();
      expect(() => new TextDecoder(encoding)).not.toThrow();
    });

    it("should only use first 16KB for chardet detection", () => {
      // 20 KB of ASCII "a" bytes, i.e. larger than the 16 KB named in the title.
      const largeData = new TextEncoder().encode("a".repeat(20 * 1024));

      const encoding = detectEncoding(largeData, null);
      // NOTE(review): this only confirms detection succeeds on input larger
      // than 16 KB; the sampling itself is not observable from this assertion.
      expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
    });

    it("should handle GBK encoded data", () => {
      // The bytes of "你好" in GBK. With so little input the detector may
      // report GBK or another multi-byte encoding, so only validity of the
      // result is asserted.
      const gbkLikeData = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]); // "你好" in GBK
      const encoding = detectEncoding(gbkLikeData, null);
      expect(encoding).toBeTruthy();
      expect(() => new TextDecoder(encoding)).not.toThrow();
    });

    it("should handle ISO-8859-1 encoded data", () => {
      // Bytes above 0x7F that map to accented Latin letters in ISO-8859-1.
      const iso88591Data = new Uint8Array([0xe9, 0xe8, 0xe0, 0xe7]); // é è à ç
      const encoding = detectEncoding(iso88591Data, null);
      expect(encoding).toBeTruthy();
    });

    it("should validate detected encoding is supported by TextDecoder", () => {
      const utf8Data = new TextEncoder().encode("test");
      const encoding = detectEncoding(utf8Data, null);

      // Whatever label comes back must be accepted by TextDecoder.
      expect(() => new TextDecoder(encoding)).not.toThrow();
    });

    it("should prefer Content-Type charset over chardet detection", () => {
      // Even though the bytes look like GBK, an explicit UTF-8 charset in the
      // header is expected to win.
      const data = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]);
      const encoding = detectEncoding(data, "text/javascript; charset=utf-8");
      expect(encoding).toBe("utf-8");
    });

    it("should handle charset with different cases from Content-Type", () => {
      const data = new TextEncoder().encode("test");
      expect(detectEncoding(data, "text/javascript; charset=UTF-8")).toBe("utf-8");
      expect(detectEncoding(data, "text/javascript; charset=Utf-8")).toBe("utf-8");
      expect(detectEncoding(data, "text/javascript; charset=GBK")).toBe("gbk");
    });

    it("should handle Windows-1252 encoded data", () => {
      // Bytes in the 0x80-0x9F range, which Windows-1252 maps to printable
      // characters.
      const win1252Data = new Uint8Array([0x80, 0x82, 0x83, 0x84]); // € ‚ ƒ „
      const encoding = detectEncoding(win1252Data, null);
      expect(encoding).toBeTruthy();
      // The detector may either recognise the encoding or fall back to another
      // valid one; shift_jis is accepted because chardet may report it for
      // these bytes.
      expect(["utf-8", "windows-1252", "iso-8859-1", "shift_jis", "ascii"]).toContain(encoding);
    });

    it("should fallback to utf-8 when chardet detects invalid encoding", () => {
      // Stub console.warn so any warning emitted during detection stays out of
      // the test output.
      // NOTE(review): chardet is not mocked and the spy is never asserted on,
      // so this only checks that a usable encoding is returned.
      const consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});

      try {
        const data = new TextEncoder().encode("test");
        const encoding = detectEncoding(data, null);

        // A valid, decodable encoding label must come back.
        expect(encoding).toBeTruthy();
        expect(() => new TextDecoder(encoding)).not.toThrow();
      } finally {
        // Restore even if an assertion above throws, so the stubbed
        // console.warn cannot leak into later tests.
        consoleWarnSpy.mockRestore();
      }
    });
  });
});
0 commit comments