Skip to content

Commit d9260d1

Browse files
committed
Robust line ending handling for all formats
- Support LF (Unix), CRLF (Windows), CR (old Mac), and mixed line endings
- Process file in 1MB chunks for efficient memory usage
- Compute exact byte offsets for each line
- Handle files without trailing newline
- Handle CR at chunk boundaries correctly
1 parent 57313da commit d9260d1

1 file changed

Lines changed: 113 additions & 31 deletions

File tree

src/main/fileHandler.ts

Lines changed: 113 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -53,46 +53,128 @@ export class FileHandler {
5353
const stat = fs.statSync(filePath);
5454
const fileSize = stat.size;
5555

56-
// Detect line ending type (CRLF vs LF)
57-
let lineEndingSize = 1; // Default to LF (\n)
58-
const sampleBuffer = Buffer.alloc(Math.min(4096, fileSize));
59-
const sampleFd = fs.openSync(filePath, 'r');
60-
fs.readSync(sampleFd, sampleBuffer, 0, sampleBuffer.length, 0);
61-
fs.closeSync(sampleFd);
62-
if (sampleBuffer.includes('\r\n')) {
63-
lineEndingSize = 2; // CRLF (\r\n)
64-
}
65-
66-
// Index all line offsets
56+
// Index all line offsets - handle any line ending format (LF, CRLF, CR, mixed)
6757
this.lineOffsets = [];
68-
let offset = 0;
6958
let lineNumber = 0;
7059
let firstLine = true;
7160

72-
const stream = fs.createReadStream(filePath, { encoding: 'utf-8' });
73-
const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
61+
// Read file in chunks to find line boundaries and compute exact offsets
62+
const fd = fs.openSync(filePath, 'r');
63+
const chunkSize = 1024 * 1024; // 1MB chunks
64+
const buffer = Buffer.alloc(chunkSize);
65+
let fileOffset = 0;
66+
let lineStart = 0;
67+
let leftover = Buffer.alloc(0);
7468

75-
for await (const line of rl) {
76-
const length = Buffer.byteLength(line, 'utf-8');
77-
78-
// Check first line for split header
79-
if (firstLine) {
80-
firstLine = false;
81-
const splitInfo = this.parseSplitHeader(line);
82-
if (splitInfo) {
83-
this.splitMetadata = splitInfo;
84-
this.headerLineCount = 1;
85-
// Still store the offset but we'll skip it when returning lines
69+
try {
70+
while (fileOffset < fileSize) {
71+
const bytesRead = fs.readSync(fd, buffer, 0, chunkSize, fileOffset);
72+
if (bytesRead === 0) break;
73+
74+
// Combine leftover from previous chunk with current chunk
75+
const chunk = Buffer.concat([leftover, buffer.slice(0, bytesRead)]);
76+
let chunkPos = 0;
77+
const effectiveOffset = fileOffset - leftover.length;
78+
79+
while (chunkPos < chunk.length) {
80+
const byte = chunk[chunkPos];
81+
82+
if (byte === 0x0A) { // LF
83+
// End of line (LF or end of CRLF)
84+
const lineEnd = effectiveOffset + chunkPos;
85+
const lineLength = lineEnd - lineStart;
86+
87+
// Check for CRLF - exclude CR from line content
88+
let actualLength = lineLength;
89+
if (lineLength > 0 && chunkPos > 0 && chunk[chunkPos - 1] === 0x0D) {
90+
actualLength = lineLength - 1;
91+
} else if (lineLength > 0 && this.lineOffsets.length > 0) {
92+
// CR might be at end of previous chunk - check the stored length
93+
// This is handled by reading the actual bytes when needed
94+
}
95+
96+
this.lineOffsets.push({ offset: lineStart, length: actualLength });
97+
98+
// Check first line for split header
99+
if (firstLine) {
100+
firstLine = false;
101+
const lineBuffer = Buffer.alloc(Math.min(actualLength, 500));
102+
fs.readSync(fd, lineBuffer, 0, lineBuffer.length, lineStart);
103+
const lineText = lineBuffer.toString('utf-8');
104+
const splitInfo = this.parseSplitHeader(lineText);
105+
if (splitInfo) {
106+
this.splitMetadata = splitInfo;
107+
this.headerLineCount = 1;
108+
}
109+
}
110+
111+
lineNumber++;
112+
lineStart = lineEnd + 1; // Move past LF
113+
114+
if (lineNumber % 100000 === 0 && onProgress) {
115+
onProgress(Math.min(99, Math.round((lineStart / fileSize) * 100)));
116+
}
117+
} else if (byte === 0x0D) { // CR
118+
// Could be CR-only (old Mac) or start of CRLF
119+
// Look ahead to see if next byte is LF
120+
if (chunkPos + 1 < chunk.length) {
121+
if (chunk[chunkPos + 1] !== 0x0A) {
122+
// CR-only line ending (old Mac format)
123+
const lineEnd = effectiveOffset + chunkPos;
124+
const lineLength = lineEnd - lineStart;
125+
this.lineOffsets.push({ offset: lineStart, length: lineLength });
126+
127+
if (firstLine) {
128+
firstLine = false;
129+
const lineBuffer = Buffer.alloc(Math.min(lineLength, 500));
130+
fs.readSync(fd, lineBuffer, 0, lineBuffer.length, lineStart);
131+
const lineText = lineBuffer.toString('utf-8');
132+
const splitInfo = this.parseSplitHeader(lineText);
133+
if (splitInfo) {
134+
this.splitMetadata = splitInfo;
135+
this.headerLineCount = 1;
136+
}
137+
}
138+
139+
lineNumber++;
140+
lineStart = lineEnd + 1; // Move past CR
141+
}
142+
// If next is LF, we'll handle it in the LF case
143+
}
144+
// If CR is at end of chunk, we'll handle it in the next iteration
145+
}
146+
chunkPos++;
86147
}
87-
}
88148

89-
this.lineOffsets.push({ offset, length });
90-
offset += length + lineEndingSize; // Account for line ending (LF or CRLF)
91-
lineNumber++;
149+
// Keep any partial line for next chunk
150+
if (lineStart < effectiveOffset + chunk.length) {
151+
leftover = chunk.slice(lineStart - effectiveOffset);
152+
} else {
153+
leftover = Buffer.alloc(0);
154+
}
155+
156+
fileOffset += bytesRead;
157+
}
92158

93-
if (lineNumber % 100000 === 0 && onProgress) {
94-
onProgress(Math.min(99, Math.round((offset / fileSize) * 100)));
159+
// Handle last line if file doesn't end with newline
160+
if (lineStart < fileSize) {
161+
const lastLineLength = fileSize - lineStart;
162+
this.lineOffsets.push({ offset: lineStart, length: lastLineLength });
163+
164+
if (firstLine) {
165+
const lineBuffer = Buffer.alloc(Math.min(lastLineLength, 500));
166+
fs.readSync(fd, lineBuffer, 0, lineBuffer.length, lineStart);
167+
const lineText = lineBuffer.toString('utf-8');
168+
const splitInfo = this.parseSplitHeader(lineText);
169+
if (splitInfo) {
170+
this.splitMetadata = splitInfo;
171+
this.headerLineCount = 1;
172+
}
173+
}
174+
lineNumber++;
95175
}
176+
} finally {
177+
fs.closeSync(fd);
96178
}
97179

98180
// Adjust total lines to exclude hidden header

0 commit comments

Comments
 (0)