@@ -38,6 +38,7 @@ pub fn unpack_key(key: u64) -> (usize, usize, usize) {
3838///
3939/// Computed once at startup from parquet file footers (reads only the last few
4040/// KB of each file). Not persisted to disk.
41+ #[ derive( Debug ) ]
4142pub struct DatasetLayout {
4243 /// Object-store path (or S3 key) for each file, indexed by `file_idx`.
4344 pub file_keys : Vec < String > ,
@@ -92,7 +93,19 @@ impl DatasetLayout {
9293
9394 let mut running_total = 0u64 ;
9495 for & path in local_paths {
95- let file_name = Path :: new ( path)
96+ let raw = Path :: new ( path) ;
97+
98+ // Reject paths containing '..' components to prevent directory
99+ // traversal when paths originate from untrusted input.
100+ for component in raw. components ( ) {
101+ if matches ! ( component, std:: path:: Component :: ParentDir ) {
102+ return Err ( DataFusionError :: Execution ( format ! (
103+ "path traversal not allowed: {path}"
104+ ) ) ) ;
105+ }
106+ }
107+
108+ let file_name = raw
96109 . file_name ( )
97110 . and_then ( |n| n. to_str ( ) )
98111 . ok_or_else ( || DataFusionError :: Execution ( format ! ( "invalid path: {path}" ) ) ) ?;
@@ -161,4 +174,70 @@ mod tests {
161174 fn test_pack_key_file_idx_overflow_panics ( ) {
162175 pack_key ( 1 << 16 , 0 , 0 ) ;
163176 }
177+
178+ #[ cfg( any( feature = "parquet-provider" , feature = "sqlite-provider" ) ) ]
179+ mod from_files_tests {
180+ use super :: super :: DatasetLayout ;
181+ use arrow_array:: { RecordBatch , StringArray } ;
182+ use arrow_schema:: { DataType , Field , Schema } ;
183+ use parquet:: arrow:: ArrowWriter ;
184+ use std:: sync:: Arc ;
185+
186+ /// Write a minimal 1-row parquet file at the given path.
187+ fn write_parquet ( path : & std:: path:: Path ) {
188+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new( "x" , DataType :: Utf8 , true ) ] ) ) ;
189+ let batch = RecordBatch :: try_new (
190+ schema. clone ( ) ,
191+ vec ! [ Arc :: new( StringArray :: from( vec![ Some ( "a" ) ] ) ) ] ,
192+ )
193+ . unwrap ( ) ;
194+ let file = std:: fs:: File :: create ( path) . unwrap ( ) ;
195+ let mut w = ArrowWriter :: try_new ( file, schema, None ) . unwrap ( ) ;
196+ w. write ( & batch) . unwrap ( ) ;
197+ w. close ( ) . unwrap ( ) ;
198+ }
199+
200+ #[ test]
201+ fn rejects_parent_dir_traversal ( ) {
202+ let dir = tempfile:: tempdir ( ) . unwrap ( ) ;
203+ let sub = dir. path ( ) . join ( "sub" ) ;
204+ std:: fs:: create_dir ( & sub) . unwrap ( ) ;
205+ let file = sub. join ( "test.parquet" ) ;
206+ write_parquet ( & file) ;
207+
208+ // A path using '..' to escape the expected directory must be rejected.
209+ let traversal = format ! ( "{}/sub/../sub/test.parquet" , dir. path( ) . display( ) ) ;
210+ let result = DatasetLayout :: from_files ( & [ & traversal] , "" ) ;
211+ assert ! ( result. is_err( ) ) ;
212+ let msg = result. unwrap_err ( ) . to_string ( ) ;
213+ assert ! (
214+ msg. contains( "path traversal" ) ,
215+ "expected 'path traversal' error, got: {msg}"
216+ ) ;
217+ }
218+
219+ #[ test]
220+ fn rejects_dotdot_prefix_traversal ( ) {
221+ // Even relative paths with '..' must be rejected.
222+ let result = DatasetLayout :: from_files ( & [ "../../../etc/passwd" ] , "" ) ;
223+ assert ! ( result. is_err( ) ) ;
224+ let msg = result. unwrap_err ( ) . to_string ( ) ;
225+ assert ! (
226+ msg. contains( "path traversal" ) ,
227+ "expected 'path traversal' error, got: {msg}"
228+ ) ;
229+ }
230+
231+ #[ test]
232+ fn accepts_valid_absolute_path ( ) {
233+ let dir = tempfile:: tempdir ( ) . unwrap ( ) ;
234+ let file = dir. path ( ) . join ( "valid.parquet" ) ;
235+ write_parquet ( & file) ;
236+
237+ let path_str = file. to_str ( ) . unwrap ( ) ;
238+ let layout = DatasetLayout :: from_files ( & [ path_str] , "" ) . unwrap ( ) ;
239+ assert_eq ! ( layout. file_keys, vec![ "valid.parquet" ] ) ;
240+ assert_eq ! ( layout. file_cum_rows, vec![ 0 , 1 ] ) ;
241+ }
242+ }
164243}
0 commit comments