@@ -5,7 +5,12 @@ use crate::{
55 ColumnDictionaryConfig , ColumnEncodingConfig , DEFAULT_BLOOM_FILTER_FPP , DataFormat ,
66 DictionaryMode , ListOutputsFormat , ParquetCompression , ParquetEncoding , ParquetStatistics ,
77 ParquetWriterVersion , PartitionStrategy , SortSpec , TransformCommand , default_thread_budget,
8- io_strategies:: { OutputFileInfo , output_strategy:: SinkFactory , path_template:: PathTemplate } ,
8+ io_strategies:: {
9+ OutputFileInfo ,
10+ input_strategy:: InputStrategy ,
11+ output_strategy:: SinkFactory ,
12+ path_template:: PathTemplate ,
13+ } ,
914 operations:: { query:: QueryOperation , sort:: SortOperation } ,
1015 pipeline:: Pipeline ,
1116 sinks:: {
@@ -18,6 +23,7 @@ use crate::{
1823 arrow:: ArrowDataSource , data_source:: DataSource , parquet:: ParquetDataSource ,
1924 vortex:: VortexDataSource ,
2025 } ,
26+ utils:: memory:: { estimate_sort_spill_reservation, sample_avg_row_bytes} ,
2127} ;
2228use anyhow:: { Result , anyhow} ;
2329use arrow:: datatypes:: SchemaRef ;
@@ -178,70 +184,84 @@ pub async fn run(args: TransformCommand) -> Result<()> {
178184 ( from_many, true )
179185 } ;
180186
181- let setup_result: Result < ( ) > = {
182- if !should_glob && input_paths. len ( ) == 1 {
183- let input_path = & input_paths[ 0 ] ;
184- let detected_input_format = detect_format ( input_path, input_format) ?;
185-
186- let source: Box < dyn DataSource > = match detected_input_format {
187- DataFormat :: Arrow => Box :: new ( ArrowDataSource :: new ( input_path. clone ( ) ) ) ,
188- DataFormat :: Parquet => Box :: new ( ParquetDataSource :: new ( input_path. clone ( ) ) ) ,
189- DataFormat :: Vortex => Box :: new ( VortexDataSource :: new ( input_path. clone ( ) ) ) ,
190- } ;
191-
192- pipeline = pipeline. with_input_strategy_with_single_source ( source) ;
193- Ok ( ( ) )
194- } else {
195- let mut expanded_paths = Vec :: new ( ) ;
196-
197- for pattern in & input_paths {
198- for entry in glob ( pattern)
199- . map_err ( |e| anyhow ! ( "Error expanding glob pattern {}: {}" , pattern, e) ) ?
200- {
201- expanded_paths. push (
202- entry
203- . map_err ( |e| anyhow ! ( "Error decoding file path: {}" , e) ) ?
204- . to_string_lossy ( )
205- . to_string ( ) ,
206- ) ;
207- }
187+ // resolve input paths (glob-expand if needed), build sources, and create InputStrategy
188+ let resolved_paths: Vec < String > ;
189+ let input_strategy = if !should_glob && input_paths. len ( ) == 1 {
190+ let input_path = & input_paths[ 0 ] ;
191+ let source = make_source ( input_path, input_format) ?;
192+ resolved_paths = vec ! [ input_path. clone( ) ] ;
193+ InputStrategy :: Single ( source)
194+ } else {
195+ let mut expanded_paths = Vec :: new ( ) ;
196+
197+ for pattern in & input_paths {
198+ for entry in glob ( pattern)
199+ . map_err ( |e| anyhow ! ( "Error expanding glob pattern {}: {}" , pattern, e) ) ?
200+ {
201+ expanded_paths. push (
202+ entry
203+ . map_err ( |e| anyhow ! ( "Error decoding file path: {}" , e) ) ?
204+ . to_string_lossy ( )
205+ . to_string ( ) ,
206+ ) ;
208207 }
208+ }
209209
210- expanded_paths. sort ( ) ;
211- expanded_paths. dedup ( ) ;
210+ expanded_paths. sort ( ) ;
211+ expanded_paths. dedup ( ) ;
212212
213- if expanded_paths. is_empty ( ) {
214- anyhow:: bail!( "No input files found matching patterns: {:?}" , input_paths) ;
215- }
213+ if expanded_paths. is_empty ( ) {
214+ anyhow:: bail!( "No input files found matching patterns: {:?}" , input_paths) ;
215+ }
216216
217- let mut sources: Vec < Box < dyn DataSource > > = Vec :: new ( ) ;
218- let mut schema: Option < SchemaRef > = None ;
219- for input_path in expanded_paths {
220- let detected_input_format = detect_format ( & input_path, input_format) ?;
221- let source: Box < dyn DataSource > = match detected_input_format {
222- DataFormat :: Arrow => Box :: new ( ArrowDataSource :: new ( input_path. clone ( ) ) ) ,
223- DataFormat :: Parquet => Box :: new ( ParquetDataSource :: new ( input_path. clone ( ) ) ) ,
224- DataFormat :: Vortex => Box :: new ( VortexDataSource :: new ( input_path. clone ( ) ) ) ,
225- } ;
226- if let Some ( ref schema) = schema {
227- let source_schema = source. schema ( ) ?;
228- if * schema != source_schema {
229- anyhow:: bail!(
230- "Schema mismatch for input file {} (does not match other file(s))" ,
231- & input_path
232- ) ;
233- }
234- } else {
235- schema = Some ( source. schema ( ) ?) ;
217+ let mut sources: Vec < Box < dyn DataSource > > = Vec :: new ( ) ;
218+ let mut schema: Option < SchemaRef > = None ;
219+ for input_path in & expanded_paths {
220+ let source = make_source ( input_path, input_format) ?;
221+ if let Some ( ref schema) = schema {
222+ let source_schema = source. schema ( ) ?;
223+ if * schema != source_schema {
224+ anyhow:: bail!(
225+ "Schema mismatch for input file {} (does not match other file(s))" ,
226+ input_path
227+ ) ;
236228 }
237- sources. push ( source) ;
229+ } else {
230+ schema = Some ( source. schema ( ) ?) ;
238231 }
239- pipeline = pipeline. with_input_strategy_with_multiple_sources ( sources) ;
240- Ok ( ( ) )
232+ sources. push ( source) ;
241233 }
234+ resolved_paths = expanded_paths;
235+ InputStrategy :: Multiple ( sources)
242236 } ;
243237
244- setup_result?;
238+ // sample rows to estimate sort spill reservation before handing strategy to pipeline
239+ if has_sort {
240+ let avg_row_bytes = sample_avg_row_bytes ( & input_strategy, 100_000 ) . await ?;
241+
242+ if avg_row_bytes > 0 {
243+ let total_input_bytes: u64 = resolved_paths
244+ . iter ( )
245+ . filter_map ( |p| std:: fs:: metadata ( p) . ok ( ) )
246+ . map ( |m| m. len ( ) )
247+ . sum ( ) ;
248+
249+ let memory_limit = effective_memory_limit. unwrap_or ( total_budget * 60 / 100 ) ;
250+ let partitions = effective_target_partitions. unwrap_or ( three_quarter_cpus) ;
251+ let memory_per_partition = memory_limit / partitions. max ( 1 ) ;
252+
253+ let reservation = estimate_sort_spill_reservation (
254+ avg_row_bytes,
255+ total_input_bytes,
256+ memory_per_partition,
257+ 8192 , // DataFusion default batch size
258+ ) ;
259+
260+ pipeline = pipeline. with_sort_spill_reservation_bytes ( Some ( reservation) ) ;
261+ }
262+ }
263+
264+ pipeline = pipeline. with_input_strategy ( input_strategy) ;
245265
246266 let list_outputs_format = list_outputs;
247267
@@ -483,6 +503,18 @@ fn to_title_case(s: &str) -> String {
483503 . join ( " " )
484504}
485505
506+ fn make_source (
507+ path : & str ,
508+ input_format : Option < DataFormat > ,
509+ ) -> Result < Box < dyn DataSource > > {
510+ let format = detect_format ( path, input_format) ?;
511+ Ok ( match format {
512+ DataFormat :: Arrow => Box :: new ( ArrowDataSource :: new ( path. to_string ( ) ) ) ,
513+ DataFormat :: Parquet => Box :: new ( ParquetDataSource :: new ( path. to_string ( ) ) ) ,
514+ DataFormat :: Vortex => Box :: new ( VortexDataSource :: new ( path. to_string ( ) ) ) ,
515+ } )
516+ }
517+
486518fn detect_format ( path : & str , explicit_format : Option < DataFormat > ) -> Result < DataFormat > {
487519 if let Some ( format) = explicit_format {
488520 return Ok ( format) ;
0 commit comments