Skip to content

Commit ca92c15

Browse files
authored
Merge pull request #17 from hotdata-dev/fix/path-traversal-keys
fix(keys): reject path traversal in from_files
2 parents 6028e79 + 8bd49d8 commit ca92c15

1 file changed

Lines changed: 80 additions & 1 deletion

File tree

src/keys.rs

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ pub fn unpack_key(key: u64) -> (usize, usize, usize) {
3838
///
3939
/// Computed once at startup from parquet file footers (reads only the last few
4040
/// KB of each file). Not persisted to disk.
41+
#[derive(Debug)]
4142
pub struct DatasetLayout {
4243
/// Object-store path (or S3 key) for each file, indexed by `file_idx`.
4344
pub file_keys: Vec<String>,
@@ -92,7 +93,19 @@ impl DatasetLayout {
9293

9394
let mut running_total = 0u64;
9495
for &path in local_paths {
95-
let file_name = Path::new(path)
96+
let raw = Path::new(path);
97+
98+
// Reject paths containing '..' components to prevent directory
99+
// traversal when paths originate from untrusted input.
100+
for component in raw.components() {
101+
if matches!(component, std::path::Component::ParentDir) {
102+
return Err(DataFusionError::Execution(format!(
103+
"path traversal not allowed: {path}"
104+
)));
105+
}
106+
}
107+
108+
let file_name = raw
96109
.file_name()
97110
.and_then(|n| n.to_str())
98111
.ok_or_else(|| DataFusionError::Execution(format!("invalid path: {path}")))?;
@@ -161,4 +174,70 @@ mod tests {
161174
fn test_pack_key_file_idx_overflow_panics() {
162175
pack_key(1 << 16, 0, 0);
163176
}
177+
178+
#[cfg(any(feature = "parquet-provider", feature = "sqlite-provider"))]
179+
mod from_files_tests {
180+
use super::super::DatasetLayout;
181+
use arrow_array::{RecordBatch, StringArray};
182+
use arrow_schema::{DataType, Field, Schema};
183+
use parquet::arrow::ArrowWriter;
184+
use std::sync::Arc;
185+
186+
/// Write a minimal 1-row parquet file at the given path.
187+
fn write_parquet(path: &std::path::Path) {
188+
let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Utf8, true)]));
189+
let batch = RecordBatch::try_new(
190+
schema.clone(),
191+
vec![Arc::new(StringArray::from(vec![Some("a")]))],
192+
)
193+
.unwrap();
194+
let file = std::fs::File::create(path).unwrap();
195+
let mut w = ArrowWriter::try_new(file, schema, None).unwrap();
196+
w.write(&batch).unwrap();
197+
w.close().unwrap();
198+
}
199+
200+
#[test]
201+
fn rejects_parent_dir_traversal() {
202+
let dir = tempfile::tempdir().unwrap();
203+
let sub = dir.path().join("sub");
204+
std::fs::create_dir(&sub).unwrap();
205+
let file = sub.join("test.parquet");
206+
write_parquet(&file);
207+
208+
// A path using '..' to escape the expected directory must be rejected.
209+
let traversal = format!("{}/sub/../sub/test.parquet", dir.path().display());
210+
let result = DatasetLayout::from_files(&[&traversal], "");
211+
assert!(result.is_err());
212+
let msg = result.unwrap_err().to_string();
213+
assert!(
214+
msg.contains("path traversal"),
215+
"expected 'path traversal' error, got: {msg}"
216+
);
217+
}
218+
219+
#[test]
220+
fn rejects_dotdot_prefix_traversal() {
221+
// Even relative paths with '..' must be rejected.
222+
let result = DatasetLayout::from_files(&["../../../etc/passwd"], "");
223+
assert!(result.is_err());
224+
let msg = result.unwrap_err().to_string();
225+
assert!(
226+
msg.contains("path traversal"),
227+
"expected 'path traversal' error, got: {msg}"
228+
);
229+
}
230+
231+
#[test]
232+
fn accepts_valid_absolute_path() {
233+
let dir = tempfile::tempdir().unwrap();
234+
let file = dir.path().join("valid.parquet");
235+
write_parquet(&file);
236+
237+
let path_str = file.to_str().unwrap();
238+
let layout = DatasetLayout::from_files(&[path_str], "").unwrap();
239+
assert_eq!(layout.file_keys, vec!["valid.parquet"]);
240+
assert_eq!(layout.file_cum_rows, vec![0, 1]);
241+
}
242+
}
164243
}

0 commit comments

Comments
 (0)