Skip to content

Commit a038963

Browse files
robert3005a10y
andauthored
fix issue with FileSystem path handling (#8248)
fix #6599 this is a revival of #7516 that needed some extra love In this pr we make sure to handle all the edge cases of paths that work differently locally vs over http. This is in addition to #7516 which fixed glob handling of non existent files --------- Signed-off-by: Andrew Duffy <andrew@a10y.dev> Signed-off-by: Robert Kruszewski <github@robertk.io> Co-authored-by: Andrew Duffy <andrew@a10y.dev>
1 parent efd3e9b commit a038963

7 files changed

Lines changed: 338 additions & 22 deletions

File tree

vortex-duckdb/src/filesystem.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,22 @@ impl FileSystem for DuckDbFileSystem {
147147
.boxed()
148148
}
149149

150+
async fn head(&self, path: &str) -> VortexResult<Option<FileListing>> {
151+
// DuckDB's filesystem exposes no stat/exists call, so probe the path by opening it and
152+
// reading its size. DuckDB does not distinguish "not found" from other open failures, so
153+
// any open error is treated as a missing file (logged for diagnosability).
154+
match self.open_read(path).await {
155+
Ok(reader) => Ok(Some(FileListing {
156+
path: path.to_string(),
157+
size: Some(reader.size().await?),
158+
})),
159+
Err(e) => {
160+
tracing::debug!("head({path}): treating open error as not-found: {e}");
161+
Ok(None)
162+
}
163+
}
164+
}
165+
150166
async fn open_read(&self, path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
151167
let mut url = self.base_url.clone();
152168
url.set_path(path);

vortex-ffi/test/scan.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ TEST_CASE("Creating datasources", "[datasource]") {
209209
ds = vx_data_source_new(session, &opts, &error);
210210
REQUIRE(ds == nullptr);
211211
REQUIRE(error != nullptr);
212-
REQUIRE_THAT(to_string(error), ContainsSubstring("No such file or directory"));
212+
REQUIRE_THAT(to_string(error), ContainsSubstring("No files matched the glob pattern"));
213213
vx_error_free(error);
214214

215215
opts.paths = "/tmp2/*.vortex";

vortex-io/src/compat/filesystem.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ impl<F: FileSystem> FileSystem for Compat<F> {
2121
Compat::new(self.inner().list(prefix)).boxed()
2222
}
2323

24+
async fn head(&self, path: &str) -> VortexResult<Option<FileListing>> {
25+
Compat::new(self.inner().head(path)).await
26+
}
27+
2428
async fn open_read(&self, path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
2529
let read_at = Compat::new(self.inner().open_read(path)).await?;
2630
Ok(Arc::new(Compat::new(read_at)))

vortex-io/src/filesystem/glob.rs

Lines changed: 78 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
use futures::StreamExt;
55
use futures::TryStreamExt;
6+
use futures::stream;
67
use futures::stream::BoxStream;
78
use vortex_error::VortexResult;
89
use vortex_error::vortex_bail;
@@ -22,14 +23,18 @@ impl dyn FileSystem + '_ {
2223
pub fn glob(&self, pattern: &str) -> VortexResult<BoxStream<'_, VortexResult<FileListing>>> {
2324
validate_glob(pattern)?;
2425

25-
// If there are no glob characters, the pattern is an exact file path.
26-
// Return it directly without listing the filesystem.
26+
// If there are no glob characters, the pattern is an exact file path. `list` enumerates
27+
// entries *under* a prefix on a path-segment basis and never yields the prefix itself, so
28+
// listing an exact path would report an existing file as missing (and could surface prefix
29+
// collisions such as `foo.vortex.backup` when the caller asked for `foo.vortex`). Use
30+
// `head` to confirm the file exists and capture its size, yielding a single-element stream
31+
// when it does and an empty stream when it does not.
2732
if !pattern.contains(['*', '?', '[']) {
28-
let listing = FileListing {
29-
path: pattern.to_string(),
30-
size: None,
31-
};
32-
return Ok(futures::stream::once(async { Ok(listing) }).boxed());
33+
let pattern = pattern.to_string();
34+
let stream = stream::once(async move { self.head(&pattern).await })
35+
.try_filter_map(|listing| async move { Ok(listing) })
36+
.boxed();
37+
return Ok(stream);
3338
}
3439

3540
let glob_pattern = glob::Pattern::new(pattern)
@@ -93,14 +98,41 @@ mod tests {
9398
use crate::VortexReadAt;
9499
use crate::filesystem::FileSystem;
95100

96-
/// A mock filesystem that panics if `list` is called.
101+
/// A mock filesystem that resolves exact paths through [`head`](FileSystem::head) and
102+
/// panics if [`list`](FileSystem::list) is called. This encodes the invariant the fix
103+
/// depends on: the exact-path glob branch must never list, because an object store's `list`
104+
/// does not return the exact path of a file.
97105
#[derive(Debug)]
98-
struct NoListFileSystem;
106+
struct HeadFileSystem {
107+
files: Vec<FileListing>,
108+
}
109+
110+
impl HeadFileSystem {
111+
fn new(files: &[(&str, u64)]) -> Self {
112+
Self {
113+
files: files
114+
.iter()
115+
.map(|&(path, size)| FileListing {
116+
path: path.to_string(),
117+
size: Some(size),
118+
})
119+
.collect(),
120+
}
121+
}
122+
}
99123

100124
#[async_trait]
101-
impl FileSystem for NoListFileSystem {
125+
impl FileSystem for HeadFileSystem {
102126
fn list(&self, _prefix: &str) -> BoxStream<'_, VortexResult<FileListing>> {
103-
vortex_panic!("list() should not be called for exact paths")
127+
vortex_panic!("list() must not be called for an exact path; glob should use head()")
128+
}
129+
130+
async fn head(&self, path: &str) -> VortexResult<Option<FileListing>> {
131+
Ok(self
132+
.files
133+
.iter()
134+
.find(|listing| listing.path == path)
135+
.cloned())
104136
}
105137

106138
async fn open_read(&self, _path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
@@ -113,12 +145,43 @@ mod tests {
113145
}
114146

115147
#[tokio::test]
116-
async fn test_glob_exact_path_skips_list() -> VortexResult<()> {
117-
let fs: &dyn FileSystem = &NoListFileSystem;
118-
let results: Vec<FileListing> = fs.glob("data/file.vortex")?.try_collect().await?;
148+
async fn test_glob_exact_path_existing_returns_listing_with_size() -> VortexResult<()> {
149+
let fs = HeadFileSystem::new(&[("data/file.vortex", 1024)]);
150+
let fs_dyn: &dyn FileSystem = &fs;
151+
let results: Vec<FileListing> = fs_dyn.glob("data/file.vortex")?.try_collect().await?;
119152
assert_eq!(results.len(), 1);
120153
assert_eq!(results[0].path, "data/file.vortex");
121-
assert_eq!(results[0].size, None);
154+
assert_eq!(
155+
results[0].size,
156+
Some(1024),
157+
"exact-path glob should propagate the size reported by head"
158+
);
159+
Ok(())
160+
}
161+
162+
#[tokio::test]
163+
async fn test_glob_exact_path_missing_returns_empty_stream() -> VortexResult<()> {
164+
let fs = HeadFileSystem::new(&[]);
165+
let fs_dyn: &dyn FileSystem = &fs;
166+
let results: Vec<FileListing> = fs_dyn.glob("data/missing.vortex")?.try_collect().await?;
167+
assert!(
168+
results.is_empty(),
169+
"missing exact path should yield an empty stream"
170+
);
171+
Ok(())
172+
}
173+
174+
#[tokio::test]
175+
async fn test_glob_exact_path_ignores_prefix_siblings() -> VortexResult<()> {
176+
// A real object store lists by prefix and would surface `foo.vortex.backup` when asked to
177+
// list `foo.vortex`. Resolving the exact path via head sidesteps that: only the requested
178+
// key is returned, and the panicking `list` proves the branch never enumerated.
179+
let fs = HeadFileSystem::new(&[("foo.vortex", 10), ("foo.vortex.backup", 20)]);
180+
let fs_dyn: &dyn FileSystem = &fs;
181+
let results: Vec<FileListing> = fs_dyn.glob("foo.vortex")?.try_collect().await?;
182+
assert_eq!(results.len(), 1);
183+
assert_eq!(results[0].path, "foo.vortex");
184+
assert_eq!(results[0].size, Some(10));
122185
Ok(())
123186
}
124187

vortex-io/src/filesystem/mod.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ pub type FileSystemRef = Arc<dyn FileSystem>;
3636
/// Implementations handle the details of a particular storage backend (local disk, S3, GCS, etc.)
3737
/// while consumers work through this uniform interface.
3838
///
39+
/// # Paths
40+
///
41+
/// Path strings are *literal* object keys / file paths: the characters are used verbatim, with no
42+
/// shell-style `~` expansion (`~` is a literal tilde, not the home directory) and no
43+
/// percent-encoding or -decoding applied by this layer (`%20` is the three characters `%`, `2`,
44+
/// `0`, not a space). A path produced by [`list`](FileSystem::list) or [`head`](FileSystem::head)
45+
/// is the object's actual key, so it can be passed straight back to
46+
/// [`open_read`](FileSystem::open_read) — including when it contains characters such as `~`, `%`,
47+
/// `[`, `]`, or `#`.
48+
///
3949
/// # Future Work
4050
///
4151
/// An `open_write` method will be added once [`VortexWrite`](crate::VortexWrite) is
@@ -51,6 +61,17 @@ pub trait FileSystem: Debug + Send + Sync {
5161
/// callers should sort if deterministic ordering is required.
5262
fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult<FileListing>>;
5363

64+
/// Fetch metadata for the file at the exact `path`, if it exists.
65+
///
66+
/// Unlike [`list`](FileSystem::list), which enumerates files *under* a prefix on a
67+
/// path-segment basis and never yields the prefix itself, `head` looks up the object at
68+
/// exactly `path`. It is the correct primitive for confirming that a single known file
69+
/// exists and reading its size.
70+
///
71+
/// Returns `Ok(Some(_))` with the file's [`FileListing`] when it exists, `Ok(None)` when no
72+
/// file exists at `path`, and `Err(_)` for any other failure (I/O or permission errors, etc.).
73+
async fn head(&self, path: &str) -> VortexResult<Option<FileListing>>;
74+
5475
/// Open a file for reading at the given path.
5576
async fn open_read(&self, path: &str) -> VortexResult<Arc<dyn VortexReadAt>>;
5677

vortex-io/src/filesystem/prefix.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,18 @@ impl FileSystem for PrefixFileSystem {
5252
.boxed()
5353
}
5454

55+
async fn head(&self, path: &str) -> VortexResult<Option<FileListing>> {
56+
let full_path = format!("{}{}", self.prefix, path.trim_start_matches('/'));
57+
Ok(self.inner.head(&full_path).await?.map(|mut listing| {
58+
listing.path = listing
59+
.path
60+
.strip_prefix(&self.prefix)
61+
.unwrap_or(&listing.path)
62+
.to_string();
63+
listing
64+
}))
65+
}
66+
5567
async fn open_read(&self, path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
5668
self.inner
5769
.open_read(&format!("{}{}", self.prefix, path.trim_start_matches('/')))

0 commit comments

Comments
 (0)