Skip to content

Commit 0243902

Browse files
committed
fix: binary file detection
1 parent 65c051a commit 0243902

5 files changed

Lines changed: 285 additions & 90 deletions

File tree

src/electron/shared/binary.ts

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Shared binary detection (web + desktop workers + UI).
2+
// Strategy:
3+
// 1) Extension hints (cheap).
4+
// 2) Magic-byte signatures.
5+
// 3) NUL/control-byte heuristic on a small sample.
6+
// 4) SVG special-case (XML-ish text).
7+
//
8+
// Keep fast and dependency-free.
9+
10+
export const SNIFF_BYTES = 8192
11+
12+
// Pragmatic denylist; not authoritative, just an early-out.
// Each row groups one family of formats. Names are written without the leading
// dot, which is prepended when the set is built.
const BINARY_EXTS = new Set(
  [
    'png jpg jpeg gif bmp webp ico',
    'pdf zip rar 7z tar gz tgz',
    'mp3 wav flac',
    'mp4 mov avi mkv webm',
    'exe dll bin dmg pkg iso',
    'woff woff2 ttf otf',
    'so dylib class jar',
    'psd ai sketch',
    'wasm',
    // SVG is special-cased below; keep here so we early-out unless the content proves XML-ish
    'svg',
  ]
    .join(' ')
    .split(' ')
    .map((name) => `.${name}`),
)
26+
27+
// Extension-only check: true when the text from the last '.' in `path` to its
// end, lower-cased, is a member of BINARY_EXTS. A path without any '.' is
// never flagged.
export function isBinaryPath(path: string): boolean {
  const dot = path.lastIndexOf('.')
  return dot >= 0 && BINARY_EXTS.has(path.slice(dot).toLowerCase())
}
33+
34+
// True when the char codes of `ascii` appear in `bytes` beginning at `offset`.
// Returns false when the pattern would run past the end of the sample.
function startsWith(bytes: Uint8Array, ascii: string, offset = 0): boolean {
  const patternLength = ascii.length
  if (bytes.length < offset + patternLength) return false
  // Compare back to front; the order does not affect the outcome.
  let k = patternLength
  while (k-- > 0) {
    if (ascii.charCodeAt(k) !== bytes[offset + k]) return false
  }
  return true
}
41+
42+
// Spot common binary formats by signature (magic bytes).
// Returns true as soon as one known signature is found in the sample and false
// when none match. A sample shorter than a signature never matches it.
export function hasBinaryMagic(bytes: Uint8Array): boolean {
  const size = bytes.length
  // True when the sample begins with exactly these byte values.
  const leads = (...sig: number[]): boolean =>
    size >= sig.length && sig.every((value, k) => bytes[k] === value)
  // ZIP: "PK" followed by one of 03/05/07 and then one of 04/06/08.
  const zipLike = (): boolean => {
    if (size < 4 || !leads(0x50, 0x4B)) return false
    const third = bytes[2]
    const fourth = bytes[3]
    return (
      (third === 0x03 || third === 0x05 || third === 0x07) &&
      (fourth === 0x04 || fourth === 0x06 || fourth === 0x08)
    )
  }

  return (
    // PNG
    leads(0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A) ||
    // JPEG
    leads(0xFF, 0xD8, 0xFF) ||
    // GIF
    startsWith(bytes, 'GIF87a') ||
    startsWith(bytes, 'GIF89a') ||
    // PDF
    startsWith(bytes, '%PDF-') ||
    // ZIP (also covers many Office docs, apk, jar)
    zipLike() ||
    // GZIP
    leads(0x1F, 0x8B, 0x08) ||
    // MP3 (ID3)
    startsWith(bytes, 'ID3') ||
    // MP4/ISO BMFF: "ftyp" at offset 4, sample of at least 12 bytes
    (size >= 12 && startsWith(bytes, 'ftyp', 4)) ||
    // OGG
    startsWith(bytes, 'OggS') ||
    // Matroska/WebM
    leads(0x1A, 0x45, 0xDF, 0xA3) ||
    // WOFF/WOFF2
    startsWith(bytes, 'wOFF') ||
    startsWith(bytes, 'wOF2') ||
    // TTF/OTF
    leads(0x00, 0x01, 0x00, 0x00) ||
    startsWith(bytes, 'OTTO') ||
    // Windows MZ / ELF
    startsWith(bytes, 'MZ') ||
    leads(0x7F, 0x45, 0x4C, 0x46)
  )
}
81+
82+
// SVG often lives in repos as text; detect text-y SVG even if extension is .svg.
// Returns true when the sample, after one optional UTF-8 BOM and any leading
// ASCII whitespace, opens with "<?xml", "<svg" or "<!DOCTYPE svg".
function isXmlSvgText(bytes: Uint8Array): boolean {
  let i = 0
  // A UTF-8 BOM only counts as the exact sequence EF BB BF at offset 0; stray
  // EF/BB/BF bytes elsewhere are not skipped.
  if (bytes.length >= 3 && bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) i = 3
  // Skip ASCII whitespace only (0x09-0x0D and space). NUL and other control
  // bytes point to binary content, so they must not be skipped.
  while (i < bytes.length && (bytes[i] === 0x20 || (bytes[i] >= 0x09 && bytes[i] <= 0x0D))) i++
  if (i >= bytes.length || bytes[i] !== 0x3C /* '<' */) return false
  // Look for "<?xml", "<svg" or "<!DOCTYPE svg"
  return startsWith(bytes, '?xml', i + 1) || startsWith(bytes, 'svg', i + 1) || startsWith(bytes, '!DOCTYPE svg', i + 1)
}
91+
92+
// Heuristic: decide from raw bytes whether a sample looks binary.
// Inspects at most SNIFF_BYTES bytes. Any NUL byte marks the sample as binary
// (the same rule Git applies); otherwise the sample is binary when more than
// 30% of the inspected bytes are control characters outside the common
// whitespace range. An empty sample is treated as text.
export function looksBinaryHeuristic(bytes: Uint8Array): boolean {
  const n = Math.min(bytes.length, SNIFF_BYTES)
  if (n === 0) return false
  let suspicious = 0
  for (let i = 0; i < n; i++) {
    const c = bytes[i]
    // Compressed or random data has only ~10% control bytes, which is below the
    // ratio threshold, but it almost always contains a NUL within the sample.
    if (c === 0) return true
    // control chars outside common whitespace range
    if (c < 7 || (c > 13 && c < 32)) suspicious++
  }
  return suspicious / n > 0.30
}
105+
106+
// Classify a byte sample as binary (true) or text (false), optionally guided
// by the file path.
// Order of checks: extension denylist (with an SVG-as-XML escape hatch), then
// magic bytes, then the XML/SVG text check, then the control-byte heuristic.
export function detectBinaryByContent(sample: Uint8Array, path?: string): boolean {
  if (path && isBinaryPath(path)) {
    // A denylisted extension wins unless this is an .svg whose content looks like XML.
    const svgByName = path.toLowerCase().endsWith('.svg')
    return !(svgByName && isXmlSvgText(sample))
  }
  if (hasBinaryMagic(sample)) return true
  // No magic: XML-looking content is text, everything else goes to the heuristic.
  return isXmlSvgText(sample) ? false : looksBinaryHeuristic(sample)
}
118+
119+
// Convenience: decide with or without sample.
// With a sample, defer to detectBinaryByContent; without one, fall back to the
// extension check alone.
export function shouldTreatAsBinary(path: string, sample?: Uint8Array): boolean {
  return sample ? detectBinaryByContent(sample, path) : isBinaryPath(path)
}
124+
125+
export { BINARY_EXTS }

src/electron/workers/nodeGitWorker.ts

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import * as fs from 'fs'
33
import * as path from 'path'
44
import * as git from 'isomorphic-git'
55
import { LRUCache } from 'lru-cache'
6+
import { isBinaryPath, detectBinaryByContent, SNIFF_BYTES } from '../shared/binary'
67

78
type Msg =
89
| { id: number; type: 'loadRepo'; repoPath: string }
@@ -17,19 +18,6 @@ const blobCache = new LRUCache<string, { binary: boolean; text: string | null }>
1718
let blobCacheHits = 0
1819
const gitCache: Record<string, any> = Object.create(null)
1920
const WORKDIR = '__WORKDIR__'
20-
const BINARY_EXTS = [
21-
'.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico',
22-
'.pdf','.zip','.rar','.7z','.tar','.gz','.tgz',
23-
'.mp3','.wav','.flac',
24-
'.mp4','.mov','.avi','.mkv','.webm',
25-
'.exe','.dll','.bin','.dmg','.pkg','.iso',
26-
'.woff','.woff2','.ttf','.otf',
27-
'.svg'
28-
]
29-
function isBinaryPathLocal(p: string): boolean {
30-
const lower = p.toLowerCase()
31-
return BINARY_EXTS.some(ext => lower.endsWith(ext))
32-
}
3321

3422
// Helper function to parse packed-refs
3523
async function parsePackedRefs(repoPath: string): Promise<string[]> {
@@ -58,14 +46,7 @@ function ok(id: number, data?: any) { send({ id, type: 'ok', data }) }
5846
function err(id: number, error: string) { send({ id, type: 'error', error }) }
5947
function progress(id: number, message: string) { send({ id, type: 'progress', message }) }
6048

61-
function looksBinary(buf: Buffer): boolean {
62-
const len = Math.min(buf.length, 8000)
63-
for (let i = 0; i < len; i++) {
64-
const c = buf[i]
65-
if (c === 0) return true
66-
}
67-
return false
68-
}
49+
// (content sniffing comes from shared helper)
6950

7051
parentPort?.on('message', async (m: Msg) => {
7152
try {
@@ -141,7 +122,7 @@ parentPort?.on('message', async (m: Msg) => {
141122
}
142123
case 'readFile': {
143124
// Fast path: known-binary extension => no content read
144-
if (isBinaryPathLocal(m.filepath)) {
125+
if (isBinaryPath(m.filepath)) {
145126
if (m.ref !== WORKDIR) {
146127
ok(m.id, { binary: true, text: null, notFound: false }); return
147128
}
@@ -168,17 +149,32 @@ parentPort?.on('message', async (m: Msg) => {
168149
const res = await git.readBlob({ fs, dir: repoPath, oid: commitOid, filepath: m.filepath }).catch(() => null)
169150
if (!res) { ok(m.id, { binary: false, text: null, notFound: true }); return }
170151
const buf = Buffer.from(res.blob)
171-
const binary = looksBinary(buf)
152+
const sample = buf.subarray(0, SNIFF_BYTES)
153+
const binary = detectBinaryByContent(sample, m.filepath)
172154
const value = { binary, text: binary ? null : buf.toString('utf8') }
173155
blobCache.set(cacheKey, value)
174156
ok(m.id, { ...value, notFound: false })
175157
return
176158
}
159+
// Partial read to sniff type without pulling whole large files
177160
const fileAbs = path.join(repoPath, m.filepath)
178-
const buf = await fs.promises.readFile(fileAbs).catch(() => null as any)
179-
if (!buf) { ok(m.id, { binary: false, text: null, notFound: true }); return }
180-
const binary = looksBinary(buf as Buffer)
181-
ok(m.id, { binary, text: binary ? null : (buf as Buffer).toString('utf8') })
161+
const fd = await fs.promises.open(fileAbs, 'r').catch(() => null as any)
162+
if (!fd) { ok(m.id, { binary: false, text: null, notFound: true }); return }
163+
try {
164+
const probe = Buffer.allocUnsafe(SNIFF_BYTES)
165+
const { bytesRead } = await fd.read(probe, 0, SNIFF_BYTES, 0)
166+
const sample = probe.subarray(0, bytesRead)
167+
const binary = detectBinaryByContent(sample, m.filepath)
168+
if (binary) {
169+
ok(m.id, { binary: true, text: null, notFound: false })
170+
} else {
171+
// Now read full text if needed
172+
const full = await fs.promises.readFile(fileAbs, 'utf8')
173+
ok(m.id, { binary: false, text: full, notFound: false })
174+
}
175+
} finally {
176+
await fd.close().catch(() => {})
177+
}
182178
return
183179
}
184180
case 'listFiles': {

src/web/src/shared/binary.ts

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Shared binary detection (web + desktop workers + UI).
2+
// Strategy:
3+
// 1) Extension hints (cheap).
4+
// 2) Magic-byte signatures.
5+
// 3) NUL/control-byte heuristic on a small sample.
6+
// 4) SVG special-case (XML-ish text).
7+
//
8+
// Keep fast and dependency-free.
9+
10+
export const SNIFF_BYTES = 8192
11+
12+
// Pragmatic denylist; not authoritative, just an early-out.
// Each row groups one family of formats. Names are written without the leading
// dot, which is prepended when the set is built.
const BINARY_EXTS = new Set(
  [
    'png jpg jpeg gif bmp webp ico',
    'pdf zip rar 7z tar gz tgz',
    'mp3 wav flac',
    'mp4 mov avi mkv webm',
    'exe dll bin dmg pkg iso',
    'woff woff2 ttf otf',
    'so dylib class jar',
    'psd ai sketch',
    'wasm',
    // SVG is special-cased below; keep here so we early-out unless the content proves XML-ish
    'svg',
  ]
    .join(' ')
    .split(' ')
    .map((name) => `.${name}`),
)
26+
27+
// Extension-only check: true when the text from the last '.' in `path` to its
// end, lower-cased, is a member of BINARY_EXTS. A path without any '.' is
// never flagged.
export function isBinaryPath(path: string): boolean {
  const dot = path.lastIndexOf('.')
  return dot >= 0 && BINARY_EXTS.has(path.slice(dot).toLowerCase())
}
33+
34+
// True when the char codes of `ascii` appear in `bytes` beginning at `offset`.
// Returns false when the pattern would run past the end of the sample.
function startsWith(bytes: Uint8Array, ascii: string, offset = 0): boolean {
  const patternLength = ascii.length
  if (bytes.length < offset + patternLength) return false
  // Compare back to front; the order does not affect the outcome.
  let k = patternLength
  while (k-- > 0) {
    if (ascii.charCodeAt(k) !== bytes[offset + k]) return false
  }
  return true
}
41+
42+
// Spot common binary formats by signature (magic bytes).
// Returns true as soon as one known signature is found in the sample and false
// when none match. A sample shorter than a signature never matches it.
export function hasBinaryMagic(bytes: Uint8Array): boolean {
  const size = bytes.length
  // True when the sample begins with exactly these byte values.
  const leads = (...sig: number[]): boolean =>
    size >= sig.length && sig.every((value, k) => bytes[k] === value)
  // ZIP: "PK" followed by one of 03/05/07 and then one of 04/06/08.
  const zipLike = (): boolean => {
    if (size < 4 || !leads(0x50, 0x4B)) return false
    const third = bytes[2]
    const fourth = bytes[3]
    return (
      (third === 0x03 || third === 0x05 || third === 0x07) &&
      (fourth === 0x04 || fourth === 0x06 || fourth === 0x08)
    )
  }

  return (
    // PNG
    leads(0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A) ||
    // JPEG
    leads(0xFF, 0xD8, 0xFF) ||
    // GIF
    startsWith(bytes, 'GIF87a') ||
    startsWith(bytes, 'GIF89a') ||
    // PDF
    startsWith(bytes, '%PDF-') ||
    // ZIP (also covers many Office docs, apk, jar)
    zipLike() ||
    // GZIP
    leads(0x1F, 0x8B, 0x08) ||
    // MP3 (ID3)
    startsWith(bytes, 'ID3') ||
    // MP4/ISO BMFF: "ftyp" at offset 4, sample of at least 12 bytes
    (size >= 12 && startsWith(bytes, 'ftyp', 4)) ||
    // OGG
    startsWith(bytes, 'OggS') ||
    // Matroska/WebM
    leads(0x1A, 0x45, 0xDF, 0xA3) ||
    // WOFF/WOFF2
    startsWith(bytes, 'wOFF') ||
    startsWith(bytes, 'wOF2') ||
    // TTF/OTF
    leads(0x00, 0x01, 0x00, 0x00) ||
    startsWith(bytes, 'OTTO') ||
    // Windows MZ / ELF
    startsWith(bytes, 'MZ') ||
    leads(0x7F, 0x45, 0x4C, 0x46)
  )
}
81+
82+
// SVG often lives in repos as text; detect text-y SVG even if extension is .svg.
// Returns true when the sample, after one optional UTF-8 BOM and any leading
// ASCII whitespace, opens with "<?xml", "<svg" or "<!DOCTYPE svg".
function isXmlSvgText(bytes: Uint8Array): boolean {
  let i = 0
  // A UTF-8 BOM only counts as the exact sequence EF BB BF at offset 0; stray
  // EF/BB/BF bytes elsewhere are not skipped.
  if (bytes.length >= 3 && bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) i = 3
  // Skip ASCII whitespace only (0x09-0x0D and space). NUL and other control
  // bytes point to binary content, so they must not be skipped.
  while (i < bytes.length && (bytes[i] === 0x20 || (bytes[i] >= 0x09 && bytes[i] <= 0x0D))) i++
  if (i >= bytes.length || bytes[i] !== 0x3C /* '<' */) return false
  // Look for "<?xml", "<svg" or "<!DOCTYPE svg"
  return startsWith(bytes, '?xml', i + 1) || startsWith(bytes, 'svg', i + 1) || startsWith(bytes, '!DOCTYPE svg', i + 1)
}
91+
92+
// Heuristic: decide from raw bytes whether a sample looks binary.
// Inspects at most SNIFF_BYTES bytes. Any NUL byte marks the sample as binary
// (the same rule Git applies); otherwise the sample is binary when more than
// 30% of the inspected bytes are control characters outside the common
// whitespace range. An empty sample is treated as text.
export function looksBinaryHeuristic(bytes: Uint8Array): boolean {
  const n = Math.min(bytes.length, SNIFF_BYTES)
  if (n === 0) return false
  let suspicious = 0
  for (let i = 0; i < n; i++) {
    const c = bytes[i]
    // Compressed or random data has only ~10% control bytes, which is below the
    // ratio threshold, but it almost always contains a NUL within the sample.
    if (c === 0) return true
    // control chars outside common whitespace range
    if (c < 7 || (c > 13 && c < 32)) suspicious++
  }
  return suspicious / n > 0.30
}
105+
106+
// Classify a byte sample as binary (true) or text (false), optionally guided
// by the file path.
// Order of checks: extension denylist (with an SVG-as-XML escape hatch), then
// magic bytes, then the XML/SVG text check, then the control-byte heuristic.
export function detectBinaryByContent(sample: Uint8Array, path?: string): boolean {
  if (path && isBinaryPath(path)) {
    // A denylisted extension wins unless this is an .svg whose content looks like XML.
    const svgByName = path.toLowerCase().endsWith('.svg')
    return !(svgByName && isXmlSvgText(sample))
  }
  if (hasBinaryMagic(sample)) return true
  // No magic: XML-looking content is text, everything else goes to the heuristic.
  return isXmlSvgText(sample) ? false : looksBinaryHeuristic(sample)
}
118+
119+
// Convenience: decide with or without sample.
// With a sample, defer to detectBinaryByContent; without one, fall back to the
// extension check alone.
export function shouldTreatAsBinary(path: string, sample?: Uint8Array): boolean {
  return sample ? detectBinaryByContent(sample, path) : isBinaryPath(path)
}
124+
125+
export { BINARY_EXTS }

src/web/src/utils/binary.ts

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,2 @@
1-
// Centralized binary-file heuristics to keep UI, workers, and counters in sync.
2-
// Note: We treat SVG as binary here for safety/perf (often very large).
3-
const BINARY_EXTS = [
4-
'.png','.jpg','.jpeg','.gif','.bmp','.webp','.ico',
5-
'.pdf','.zip','.rar','.7z','.tar','.gz','.tgz',
6-
'.mp3','.wav','.flac',
7-
'.mp4','.mov','.avi','.mkv','.webm',
8-
'.exe','.dll','.bin','.dmg','.pkg','.iso',
9-
'.woff','.woff2','.ttf','.otf',
10-
'.svg'
11-
]
12-
13-
export function isBinaryPath(p: string): boolean {
14-
const lower = p.toLowerCase()
15-
return BINARY_EXTS.some(ext => lower.endsWith(ext))
16-
}
17-
18-
export { BINARY_EXTS }
1+
// Re-export shared detector for web UI code paths.
2+
export * from '../shared/binary'

0 commit comments

Comments
 (0)