55
66//! Utilities for reading files as chunks.
77
8+ // spell-checker:ignore ELEMS
89#![ allow( dead_code) ]
910// Suppresses the unused-code warning for `borrow_buffer` in `Chunk`
1011
1112use std:: {
1213 io:: { ErrorKind , Read } ,
14+ ops:: Range ,
1315 sync:: mpsc:: SyncSender ,
1416} ;
1517
1618use memchr:: memchr_iter;
1719use self_cell:: self_cell;
1820use uucore:: error:: { UResult , USimpleError } ;
1921
20- use crate :: { GeneralBigDecimalParseResult , GlobalSettings , Line , numeric_str_cmp:: NumInfo } ;
22+ use crate :: {
23+ GeneralBigDecimalParseResult , GlobalSettings , Line , SortMode , numeric_str_cmp:: NumInfo ,
24+ } ;
25+
// Upper bound, in bytes, on the token-buffer capacity that is kept between chunks (4 MiB).
26+ const MAX_TOKEN_BUFFER_BYTES : usize = 4 * 1024 * 1024 ;
// The same bound expressed as a number of `Range<usize>` elements. `parse_lines` shrinks
// the token buffer back to this capacity whenever it has grown beyond it.
27+ const MAX_TOKEN_BUFFER_ELEMS : usize = MAX_TOKEN_BUFFER_BYTES / std:: mem:: size_of :: < Range < usize > > ( ) ;
2128
2229self_cell ! (
2330 /// The chunk that is passed around between threads.
@@ -35,6 +42,8 @@ self_cell!(
/// Parsed contents of a chunk, together with the scratch state that is carried along
/// so it can be reused when the chunk is recycled.
3542pub struct ChunkContents < ' a > {
/// The lines produced by `parse_lines` for this chunk.
3643 pub lines : Vec < Line < ' a > > ,
/// Per-line auxiliary data (`selections`, `num_infos`, `parsed_floats`, `line_num_floats`).
3744 pub line_data : LineData < ' a > ,
/// Scratch buffer handed to `Line::create`; stored here so that its allocation is
/// moved into the `RecycledChunk` instead of being dropped after every chunk.
45+ pub token_buffer : Vec < Range < usize > > ,
/// Number of lines `parse_lines` recorded for this chunk; it is passed on through
/// recycling and used to pre-size the vectors the next time the chunk is filled.
46+ pub line_count_hint : usize ,
3847}
3948
4049#[ derive( Debug ) ]
@@ -54,6 +63,7 @@ impl Chunk {
5463 contents. line_data . num_infos . clear ( ) ;
5564 contents. line_data . parsed_floats . clear ( ) ;
5665 contents. line_data . line_num_floats . clear ( ) ;
66+ contents. token_buffer . clear ( ) ;
5767 let lines = unsafe {
5868 // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
5969 // because the vector is empty.
@@ -76,6 +86,8 @@ impl Chunk {
7686 std:: mem:: take ( & mut contents. line_data . num_infos ) ,
7787 std:: mem:: take ( & mut contents. line_data . parsed_floats ) ,
7888 std:: mem:: take ( & mut contents. line_data . line_num_floats ) ,
89+ std:: mem:: take ( & mut contents. token_buffer ) ,
90+ contents. line_count_hint ,
7991 )
8092 } ) ;
8193 RecycledChunk {
@@ -84,6 +96,8 @@ impl Chunk {
8496 num_infos : recycled_contents. 2 ,
8597 parsed_floats : recycled_contents. 3 ,
8698 line_num_floats : recycled_contents. 4 ,
99+ token_buffer : recycled_contents. 5 ,
100+ line_count_hint : recycled_contents. 6 ,
87101 buffer : self . into_owner ( ) ,
88102 }
89103 }
@@ -103,6 +117,8 @@ pub struct RecycledChunk {
103117 num_infos : Vec < NumInfo > ,
104118 parsed_floats : Vec < GeneralBigDecimalParseResult > ,
105119 line_num_floats : Vec < Option < f64 > > ,
120+ token_buffer : Vec < Range < usize > > ,
121+ line_count_hint : usize ,
106122 buffer : Vec < u8 > ,
107123}
108124
@@ -114,6 +130,8 @@ impl RecycledChunk {
114130 num_infos : Vec :: new ( ) ,
115131 parsed_floats : Vec :: new ( ) ,
116132 line_num_floats : Vec :: new ( ) ,
133+ token_buffer : Vec :: new ( ) ,
134+ line_count_hint : 0 ,
117135 buffer : vec ! [ 0 ; capacity] ,
118136 }
119137 }
@@ -157,6 +175,8 @@ pub fn read<T: Read>(
157175 num_infos,
158176 parsed_floats,
159177 line_num_floats,
178+ mut token_buffer,
179+ mut line_count_hint,
160180 mut buffer,
161181 } = recycled_chunk;
162182 if buffer. len ( ) < carry_over. len ( ) {
@@ -193,8 +213,21 @@ pub fn read<T: Read>(
193213 parsed_floats,
194214 line_num_floats,
195215 } ;
196- parse_lines ( read, & mut lines, & mut line_data, separator, settings) ;
197- Ok ( ChunkContents { lines, line_data } )
216+ parse_lines (
217+ read,
218+ & mut lines,
219+ & mut line_data,
220+ & mut token_buffer,
221+ & mut line_count_hint,
222+ separator,
223+ settings,
224+ ) ;
225+ Ok ( ChunkContents {
226+ lines,
227+ line_data,
228+ token_buffer,
229+ line_count_hint,
230+ } )
198231 } ) ;
199232 sender. send ( payload?) . unwrap ( ) ;
200233 }
// parse_lines: splits `read` at every `separator` byte and pushes one `Line` per segment
// into `lines`, filling `line_data` along the way.
//
// * `token_buffer`    - reusable scratch buffer forwarded to `Line::create`.
// * `line_count_hint` - in: line count recorded for the previous use of this chunk
//                       (0 = no hint); out: number of lines pushed by this call.
//
// NOTE(review): a few lines between the signature and the first visible assertion are
// elided in this view; the comments below only describe the visible statements.
@@ -206,6 +239,8 @@ fn parse_lines<'a>(
206239 read : & ' a [ u8 ] ,
207240 lines : & mut Vec < Line < ' a > > ,
208241 line_data : & mut LineData < ' a > ,
242+ token_buffer : & mut Vec < Range < usize > > ,
243+ line_count_hint : & mut usize ,
209244 separator : u8 ,
210245 settings : & GlobalSettings ,
211246) {
@@ -216,12 +251,55 @@ fn parse_lines<'a>(
// The per-line data vectors must arrive empty; anything left over would be mixed
// into this chunk's data.
216251 assert ! ( line_data. num_infos. is_empty( ) ) ;
217252 assert ! ( line_data. parsed_floats. is_empty( ) ) ;
218253 assert ! ( line_data. line_num_floats. is_empty( ) ) ;
219- let mut token_buffer = vec ! [ ] ;
220- lines. extend (
221- read. split ( |& c| c == separator)
222- . enumerate ( )
223- . map ( |( index, line) | Line :: create ( line, index, line_data, & mut token_buffer, settings) ) ,
224- ) ;
// Reuse the caller's token buffer, but cap the capacity that is carried over.
254+ token_buffer. clear ( ) ;
255+ if token_buffer. capacity ( ) > MAX_TOKEN_BUFFER_ELEMS {
256+ token_buffer. shrink_to ( MAX_TOKEN_BUFFER_ELEMS ) ;
257+ }
// Chunks up to this size are always counted exactly instead of relying on the hint.
258+ const SMALL_CHUNK_BYTES : usize = 64 * 1024 ;
// Start from the previous line count, clamped to at least one line.
259+ let mut estimated = ( * line_count_hint) . max ( 1 ) ;
260+ let mut exact_line_count = None ;
// Without a hint, or for a small chunk, count the separators up front:
// N separators delimit N + 1 segments (an empty input is a single empty segment).
261+ if * line_count_hint == 0 || read. len ( ) <= SMALL_CHUNK_BYTES {
262+ let count = if read. is_empty ( ) {
263+ 1
264+ } else {
265+ memchr_iter ( separator, read) . count ( ) + 1
266+ } ;
267+ exact_line_count = Some ( count) ;
268+ estimated = count;
// Reached only for a large chunk whose hint was exactly 1: fall back to a guess
// of one line per `LINE_LEN_HINT` bytes.
269+ } else if estimated == 1 {
270+ const LINE_LEN_HINT : usize = 32 ;
271+ estimated = ( read. len ( ) / LINE_LEN_HINT ) . max ( 1 ) ;
272+ }
// Pre-size the output vectors from the estimate; `saturating_mul` keeps the
// requested capacity from overflowing `usize`.
273+ lines. reserve ( estimated) ;
274+ if settings. precomputed . selections_per_line > 0 {
275+ line_data
276+ . selections
277+ . reserve ( estimated. saturating_mul ( settings. precomputed . selections_per_line ) ) ;
278+ }
279+ if settings. precomputed . num_infos_per_line > 0 {
280+ line_data
281+ . num_infos
282+ . reserve ( estimated. saturating_mul ( settings. precomputed . num_infos_per_line ) ) ;
283+ }
284+ if settings. precomputed . floats_per_line > 0 {
285+ line_data
286+ . parsed_floats
287+ . reserve ( estimated. saturating_mul ( settings. precomputed . floats_per_line ) ) ;
288+ }
// `line_num_floats` is only pre-sized when the global sort mode is numeric.
289+ if settings. mode == SortMode :: Numeric {
290+ line_data. line_num_floats . reserve ( estimated) ;
291+ }
// `start` is the offset of the current segment, `index` its zero-based line number.
292+ let mut start = 0usize ;
293+ let mut index = 0usize ;
294+ for sep_idx in memchr_iter ( separator, read) {
// The segment excludes the separator byte itself.
295+ let line = & read[ start..sep_idx] ;
296+ lines. push ( Line :: create ( line, index, line_data, token_buffer, settings) ) ;
297+ index += 1 ;
298+ start = sep_idx + 1 ;
299+ }
// Whatever follows the last separator (possibly nothing) is always pushed as the final line.
300+ let line = & read[ start..] ;
301+ lines. push ( Line :: create ( line, index, line_data, token_buffer, settings) ) ;
// Record the line count for the next use of this chunk. When the exact count was
// computed above it equals `index + 1`, the number of lines pushed.
302+ * line_count_hint = exact_line_count. unwrap_or ( index + 1 ) ;
225303}
226304
227305/// Read from `file` into `buffer`.
0 commit comments