@@ -53,17 +53,36 @@ pub struct LineData<'a> {
5353 pub num_infos : Vec < NumInfo > ,
5454 pub parsed_floats : Vec < GeneralBigDecimalParseResult > ,
5555 pub line_num_floats : Vec < Option < f64 > > ,
56+ /// Arena buffer holding all collation sort keys concatenated.
57+ pub collation_key_buffer : Vec < u8 > ,
58+ /// End offsets into `collation_key_buffer` for each line's sort key.
59+ pub collation_key_ends : Vec < usize > ,
60+ }
61+
62+ impl LineData < ' _ > {
63+ /// Get the collation sort key for a line at the given index.
64+ pub fn collation_key ( & self , index : usize ) -> & [ u8 ] {
65+ let start = if index == 0 {
66+ 0
67+ } else {
68+ self . collation_key_ends [ index - 1 ]
69+ } ;
70+ let end = self . collation_key_ends [ index] ;
71+ & self . collation_key_buffer [ start..end]
72+ }
5673}
5774
5875impl Chunk {
5976 /// Destroy this chunk and return its components to be reused.
6077 pub fn recycle ( mut self ) -> RecycledChunk {
61- let recycled_contents = self . with_dependent_mut ( |_, contents| {
78+ let mut recycled_contents = self . with_dependent_mut ( |_, contents| {
6279 contents. lines . clear ( ) ;
6380 contents. line_data . selections . clear ( ) ;
6481 contents. line_data . num_infos . clear ( ) ;
6582 contents. line_data . parsed_floats . clear ( ) ;
6683 contents. line_data . line_num_floats . clear ( ) ;
84+ contents. line_data . collation_key_buffer . clear ( ) ;
85+ contents. line_data . collation_key_ends . clear ( ) ;
6786 contents. token_buffer . clear ( ) ;
6887 let lines = unsafe {
6988 // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
@@ -81,26 +100,22 @@ impl Chunk {
81100 & mut contents. line_data . selections ,
82101 ) )
83102 } ;
84- (
103+ RecycledChunk {
85104 lines,
86105 selections,
87- std:: mem:: take ( & mut contents. line_data . num_infos ) ,
88- std:: mem:: take ( & mut contents. line_data . parsed_floats ) ,
89- std:: mem:: take ( & mut contents. line_data . line_num_floats ) ,
90- std:: mem:: take ( & mut contents. token_buffer ) ,
91- contents. line_count_hint ,
92- )
106+ num_infos : std:: mem:: take ( & mut contents. line_data . num_infos ) ,
107+ parsed_floats : std:: mem:: take ( & mut contents. line_data . parsed_floats ) ,
108+ line_num_floats : std:: mem:: take ( & mut contents. line_data . line_num_floats ) ,
109+ collation_key_buffer : std:: mem:: take ( & mut contents. line_data . collation_key_buffer ) ,
110+ collation_key_ends : std:: mem:: take ( & mut contents. line_data . collation_key_ends ) ,
111+ token_buffer : std:: mem:: take ( & mut contents. token_buffer ) ,
112+ line_count_hint : contents. line_count_hint ,
113+ // buffer is set below after we consume `self`
114+ buffer : Vec :: new ( ) ,
115+ }
93116 } ) ;
94- RecycledChunk {
95- lines : recycled_contents. 0 ,
96- selections : recycled_contents. 1 ,
97- num_infos : recycled_contents. 2 ,
98- parsed_floats : recycled_contents. 3 ,
99- line_num_floats : recycled_contents. 4 ,
100- token_buffer : recycled_contents. 5 ,
101- line_count_hint : recycled_contents. 6 ,
102- buffer : self . into_owner ( ) ,
103- }
117+ recycled_contents. buffer = self . into_owner ( ) ;
118+ recycled_contents
104119 }
105120
106121 pub fn lines ( & self ) -> & Vec < Line < ' _ > > {
@@ -118,6 +133,8 @@ pub struct RecycledChunk {
118133 num_infos : Vec < NumInfo > ,
119134 parsed_floats : Vec < GeneralBigDecimalParseResult > ,
120135 line_num_floats : Vec < Option < f64 > > ,
136+ collation_key_buffer : Vec < u8 > ,
137+ collation_key_ends : Vec < usize > ,
121138 token_buffer : Vec < Range < usize > > ,
122139 line_count_hint : usize ,
123140 buffer : Vec < u8 > ,
@@ -131,6 +148,8 @@ impl RecycledChunk {
131148 num_infos : Vec :: new ( ) ,
132149 parsed_floats : Vec :: new ( ) ,
133150 line_num_floats : Vec :: new ( ) ,
151+ collation_key_buffer : Vec :: new ( ) ,
152+ collation_key_ends : Vec :: new ( ) ,
134153 token_buffer : Vec :: new ( ) ,
135154 line_count_hint : 0 ,
136155 buffer : vec ! [ 0 ; capacity] ,
@@ -176,6 +195,8 @@ pub fn read<T: Read>(
176195 num_infos,
177196 parsed_floats,
178197 line_num_floats,
198+ collation_key_buffer,
199+ collation_key_ends,
179200 mut token_buffer,
180201 mut line_count_hint,
181202 mut buffer,
@@ -214,6 +235,8 @@ pub fn read<T: Read>(
214235 num_infos,
215236 parsed_floats,
216237 line_num_floats,
238+ collation_key_buffer,
239+ collation_key_ends,
217240 } ;
218241 parse_lines (
219242 read,
@@ -253,6 +276,8 @@ fn parse_lines<'a>(
253276 assert ! ( line_data. num_infos. is_empty( ) ) ;
254277 assert ! ( line_data. parsed_floats. is_empty( ) ) ;
255278 assert ! ( line_data. line_num_floats. is_empty( ) ) ;
279+ assert ! ( line_data. collation_key_buffer. is_empty( ) ) ;
280+ assert ! ( line_data. collation_key_ends. is_empty( ) ) ;
256281 token_buffer. clear ( ) ;
257282 const SMALL_CHUNK_BYTES : usize = 64 * 1024 ;
258283 let mut estimated = ( * line_count_hint) . max ( 1 ) ;
0 commit comments