@@ -53,46 +53,128 @@ export class FileHandler {
5353 const stat = fs . statSync ( filePath ) ;
5454 const fileSize = stat . size ;
5555
56- // Detect line ending type (CRLF vs LF)
57- let lineEndingSize = 1 ; // Default to LF (\n)
58- const sampleBuffer = Buffer . alloc ( Math . min ( 4096 , fileSize ) ) ;
59- const sampleFd = fs . openSync ( filePath , 'r' ) ;
60- fs . readSync ( sampleFd , sampleBuffer , 0 , sampleBuffer . length , 0 ) ;
61- fs . closeSync ( sampleFd ) ;
62- if ( sampleBuffer . includes ( '\r\n' ) ) {
63- lineEndingSize = 2 ; // CRLF (\r\n)
64- }
65-
66- // Index all line offsets
56+ // Index all line offsets - handle any line ending format (LF, CRLF, CR, mixed)
6757 this . lineOffsets = [ ] ;
68- let offset = 0 ;
6958 let lineNumber = 0 ;
7059 let firstLine = true ;
7160
72- const stream = fs . createReadStream ( filePath , { encoding : 'utf-8' } ) ;
73- const rl = readline . createInterface ( { input : stream , crlfDelay : Infinity } ) ;
61+ // Read file in chunks to find line boundaries and compute exact offsets
62+ const fd = fs . openSync ( filePath , 'r' ) ;
63+ const chunkSize = 1024 * 1024 ; // 1MB chunks
64+ const buffer = Buffer . alloc ( chunkSize ) ;
65+ let fileOffset = 0 ;
66+ let lineStart = 0 ;
67+ let leftover = Buffer . alloc ( 0 ) ;
7468
75- for await ( const line of rl ) {
76- const length = Buffer . byteLength ( line , 'utf-8' ) ;
77-
78- // Check first line for split header
79- if ( firstLine ) {
80- firstLine = false ;
81- const splitInfo = this . parseSplitHeader ( line ) ;
82- if ( splitInfo ) {
83- this . splitMetadata = splitInfo ;
84- this . headerLineCount = 1 ;
85- // Still store the offset but we'll skip it when returning lines
69+ try {
70+ while ( fileOffset < fileSize ) {
71+ const bytesRead = fs . readSync ( fd , buffer , 0 , chunkSize , fileOffset ) ;
72+ if ( bytesRead === 0 ) break ;
73+
74+ // Combine leftover from previous chunk with current chunk
75+ const chunk = Buffer . concat ( [ leftover , buffer . slice ( 0 , bytesRead ) ] ) ;
76+ let chunkPos = 0 ;
77+ const effectiveOffset = fileOffset - leftover . length ;
78+
79+ while ( chunkPos < chunk . length ) {
80+ const byte = chunk [ chunkPos ] ;
81+
82+ if ( byte === 0x0A ) { // LF
83+ // End of line (LF or end of CRLF)
84+ const lineEnd = effectiveOffset + chunkPos ;
85+ const lineLength = lineEnd - lineStart ;
86+
87+ // Check for CRLF - exclude CR from line content
88+ let actualLength = lineLength ;
89+ if ( lineLength > 0 && chunkPos > 0 && chunk [ chunkPos - 1 ] === 0x0D ) {
90+ actualLength = lineLength - 1 ;
91+ } else if ( lineLength > 0 && this . lineOffsets . length > 0 ) {
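92+ // No extra handling is needed here: leftover bytes are prepended to every chunk, so the chunk always starts at lineStart
93+ // and a CR that precedes this LF is always at chunk [ chunkPos - 1 ] , which the check above already covers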
94+ }
95+
96+ this . lineOffsets . push ( { offset : lineStart , length : actualLength } ) ;
97+
98+ // Check first line for split header
99+ if ( firstLine ) {
100+ firstLine = false ;
101+ const lineBuffer = Buffer . alloc ( Math . min ( actualLength , 500 ) ) ;
102+ fs . readSync ( fd , lineBuffer , 0 , lineBuffer . length , lineStart ) ;
103+ const lineText = lineBuffer . toString ( 'utf-8' ) ;
104+ const splitInfo = this . parseSplitHeader ( lineText ) ;
105+ if ( splitInfo ) {
106+ this . splitMetadata = splitInfo ;
107+ this . headerLineCount = 1 ;
108+ }
109+ }
110+
111+ lineNumber ++ ;
112+ lineStart = lineEnd + 1 ; // Move past LF
113+
114+ if ( lineNumber % 100000 === 0 && onProgress ) {
115+ onProgress ( Math . min ( 99 , Math . round ( ( lineStart / fileSize ) * 100 ) ) ) ;
116+ }
117+ } else if ( byte === 0x0D ) { // CR
118+ // Could be CR-only (old Mac) or start of CRLF
119+ // Look ahead to see if next byte is LF
120+ if ( chunkPos + 1 < chunk . length ) {
121+ if ( chunk [ chunkPos + 1 ] !== 0x0A ) {
122+ // CR-only line ending (old Mac format)
123+ const lineEnd = effectiveOffset + chunkPos ;
124+ const lineLength = lineEnd - lineStart ;
125+ this . lineOffsets . push ( { offset : lineStart , length : lineLength } ) ;
126+
127+ if ( firstLine ) {
128+ firstLine = false ;
129+ const lineBuffer = Buffer . alloc ( Math . min ( lineLength , 500 ) ) ;
130+ fs . readSync ( fd , lineBuffer , 0 , lineBuffer . length , lineStart ) ;
131+ const lineText = lineBuffer . toString ( 'utf-8' ) ;
132+ const splitInfo = this . parseSplitHeader ( lineText ) ;
133+ if ( splitInfo ) {
134+ this . splitMetadata = splitInfo ;
135+ this . headerLineCount = 1 ;
136+ }
137+ }
138+
139+ lineNumber ++ ;
140+ lineStart = lineEnd + 1 ; // Move past CR
141+ }
142+ // If next is LF, we'll handle it in the LF case
143+ }
144+ // If CR is at end of chunk, we'll handle it in the next iteration
145+ }
146+ chunkPos ++ ;
86147 }
87- }
88148
89- this . lineOffsets . push ( { offset, length } ) ;
90- offset += length + lineEndingSize ; // Account for line ending (LF or CRLF)
91- lineNumber ++ ;
149+ // Keep any partial line for next chunk
150+ if ( lineStart < effectiveOffset + chunk . length ) {
151+ leftover = chunk . slice ( lineStart - effectiveOffset ) ;
152+ } else {
153+ leftover = Buffer . alloc ( 0 ) ;
154+ }
155+
156+ fileOffset += bytesRead ;
157+ }
92158
93- if ( lineNumber % 100000 === 0 && onProgress ) {
94- onProgress ( Math . min ( 99 , Math . round ( ( offset / fileSize ) * 100 ) ) ) ;
159+ // Handle last line if file doesn't end with newline
160+ if ( lineStart < fileSize ) {
161+ const lastLineLength = fileSize - lineStart ;
162+ this . lineOffsets . push ( { offset : lineStart , length : lastLineLength } ) ;
163+
164+ if ( firstLine ) {
165+ const lineBuffer = Buffer . alloc ( Math . min ( lastLineLength , 500 ) ) ;
166+ fs . readSync ( fd , lineBuffer , 0 , lineBuffer . length , lineStart ) ;
167+ const lineText = lineBuffer . toString ( 'utf-8' ) ;
168+ const splitInfo = this . parseSplitHeader ( lineText ) ;
169+ if ( splitInfo ) {
170+ this . splitMetadata = splitInfo ;
171+ this . headerLineCount = 1 ;
172+ }
173+ }
174+ lineNumber ++ ;
95175 }
176+ } finally {
177+ fs . closeSync ( fd ) ;
96178 }
97179
98180 // Adjust total lines to exclude hidden header
0 commit comments