@@ -10,125 +10,83 @@ use std::fmt;
1010use stdext:: result:: ResultExt ;
1111use url:: Url ;
1212
13- /// Canonicalised file URL identity.
13+ /// Lexically normalised file URL identity.
1414///
15- /// # The multi-source URI reconciliation problem
15+ /// Internal identity key for files received from any source (LSP, DAP,
16+ /// scanner, R runtime). Constructed via the same lexical normalisation
17+ /// at every entry point so that two paths the editor considers "the
18+ /// same file" produce the same [`UrlId`].
1619///
17- /// File URIs for the same file can arrive from independent sources, each
18- /// with its own representation:
20+ /// What we normalise: drive-letter casing on Windows; percent-encoding
21+ /// of `:` (decoded via `Url -> PathBuf -> Url` round-trip). No I/O:
22+ /// no `std::fs::canonicalize()`, no symlink resolution. The same input URI
23+ /// produces the same [`UrlId`] whether or not the file exists on disk.
1924///
20- /// - **DAP (SetBreakpoints)**: Receives raw file paths from the frontend,
21- /// converted to URIs via `UrlId::from_file_path`. These are stored as
22- /// HashMap keys for breakpoint lookup.
25+ /// # Bridging across symlinks
2326///
24- /// - **LSP (didChange, etc.)**: Receives URIs directly from the editor
25- /// client, which may use non-canonical forms (e.g. percent-encoded
26- /// colons on Windows, or symlinked paths on macOS).
27+ /// R's `normalizePath()` resolves symlinks on its own. A srcref URI
28+ /// from the R runtime may name `/private/tmp/foo.R` while the editor
29+ /// sent us `/tmp/foo.R`. The two don't compare equal, so a `HashMap`
30+ /// keyed on `UrlId` treats them as separate files. Code that needs to
31+ /// match a srcref URI back to an open document or a breakpoint should
32+ /// maintain a secondary index of `fs::canonicalize`d paths and fall
33+ /// back to it on a primary miss.
34+ /// [`crate::dap::dap_state::BreakpointMap`] in `ark` does this for
35+ /// breakpoints.
2736///
28- /// - **Execute requests**: A frontend may attach a `code_location` URI
29- /// that comes straight from the editor's document model, again
30- /// potentially non-canonical.
37+ /// # Important: don't leak normalised URIs back out
3138///
32- /// - **R runtime**: When R evaluates `source()` or annotates code, it
33- /// passes URIs that went through R's `normalizePath()`, which resolves
34- /// symlinks to their canonical target (e.g. `/tmp` resolves to `/private/tmp`
35- /// on macOS), producing a path the editor never sent. More generally,
36- /// arbitrary R code can create source references that we may end up
37- /// consuming for breakpoint or debug purposes, and the paths in those
38- /// references may or may not be canonical.
39- ///
40- /// All four sources must agree on file identity. For instance breakpoints set
41- /// via DAP are looked up in a HashMap keyed by URI when code is executed or
42- /// sourced, and invalidated when documents change via LSP.
43- ///
44- /// # Design decision
45- ///
46- /// We solve this by canonicalizing URIs into [`UrlId`] at every entry
47- /// point, rather than interning paths into opaque IDs (as rust-analyzer
48- /// does with its VFS `FileId` approach). Interning would be a larger
49- /// architectural change and is not warranted here since we only need
50- /// canonical keys at a handful of call sites.
51- ///
52- /// Canonicalization uses `std::fs::canonicalize()` to resolve symlinks
53- /// (e.g. `/tmp` to `/private/tmp` on macOS), round-trips through the
54- /// filesystem path to normalize encoding variants (e.g. `%3A` to `:` on
55- /// Windows), and uppercases drive letters on Windows. When the file does
56- /// not exist on disk, we fall back to the original URI.
57- ///
58- /// # Important: canonical URIs must not leak
59- ///
60- /// [`UrlId`] is strictly for internal identity. When a URI flows back
61- /// to R (e.g. in `#line` directives or injected breakpoint calls) or to
62- /// the frontend (e.g. in DAP stack frames), always use the original raw
63- /// URI. The frontend (and possibly R code) expects their own URI
64- /// representation, and a canonical URI (e.g. `/private/tmp/...` instead of
65- /// `/tmp/...`) could be treated as a different file (e.g. open a new editor in
66- /// the frontend instead of an existing one).
67- ///
68- /// A canonicalized file URI for use as a stable identity key.
69- ///
70- /// Wraps a [`Url`] that has been canonicalized to resolve symlinks,
71- /// normalize encoding variants, and uppercase drive letters on Windows.
72- /// Use this type in HashMaps and anywhere file identity matters.
73- ///
74- /// Construct via [`UrlId::from_url`], [`UrlId::from_file_path`], or
75- /// [`UrlId::parse`].
76- ///
77- /// On Windows, `std::fs::canonicalize()` returns extended-length paths
78- /// prefixed with `\\?\` (e.g. `\\?\C:\Users\...`). Projects like Ruff
79- /// use the `dunce` crate to strip this prefix, but we don't need it
80- /// because `Url::from_file_path` already handles
81- /// `Prefix::VerbatimDisk` and produces a clean `file:///C:/...` URI.
39+ /// Even though [`UrlId`] no longer fs-canonicalises, it still
40+ /// uppercases the Windows drive letter and decodes the percent-encoded
41+ /// colon. When sending URIs back to the editor or to R, prefer the
42+ /// original bytes the frontend sent. The frontend treats a URI as the
43+ /// editor's identity for the file; a normalised form may look like a
44+ /// different file to it (e.g. open a new editor pane).
8245#[ derive( Debug , Clone , PartialEq , Eq , Hash ) ]
8346pub struct UrlId ( Url ) ;
8447
8548impl UrlId {
86- /// Canonicalize a [`Url`] into a [`UrlId`].
49+ /// Lexically normalise a [`Url`] into a [`UrlId`].
8750 ///
88- /// Resolves symlinks via `std::fs::canonicalize()` and normalizes
89- /// encoding variants (e.g. `%3A` to `:` on Windows). On Windows, also
90- /// uppercases the drive letter. Falls back to the original URI for
91- /// non-file schemes or when the path can't be resolved .
51+ /// Decodes encoding variants (e.g. `%3A` to `:` on Windows) and
52+ /// uppercases the Windows drive letter. Does no filesystem I/O.
53+ /// Non-`file:` URLs (`ark://`, `untitled:`, ...) pass through
54+ /// untouched .
9255 pub fn from_url ( uri : Url ) -> Self {
9356 if uri. scheme ( ) != "file" {
9457 return Self ( uri) ;
9558 }
9659
97- let Some ( path) = uri. to_file_path ( ) . warn_on_err ( ) else {
98- return Self ( uri) ;
60+ // Round-trip through `PathBuf` so the URI form matches what
61+ // `Url::from_file_path` produces (decoded `%3A`, etc.). On error,
62+ // skip the round-trip and let pathological URIs flow through unchanged.
63+ let uri = match uri. to_file_path ( ) . warn_on_err ( ) {
64+ Some ( path) => Url :: from_file_path ( & path)
65+ . map_err ( |( ) | anyhow:: anyhow!( "Failed to convert path to URI: {path:?}" ) )
66+ . warn_on_err ( )
67+ . unwrap_or ( uri) ,
68+ None => uri,
9969 } ;
10070
101- let path = std:: fs:: canonicalize ( & path) . trace_on_err ( ) . unwrap_or ( path) ;
102- let uri = Url :: from_file_path ( & path)
103- . map_err ( |( ) | anyhow:: anyhow!( "Failed to convert path to URI: {path:?}" ) )
104- . warn_on_err ( )
105- . unwrap_or ( uri) ;
106-
10771 #[ cfg( windows) ]
10872 let uri = uppercase_windows_drive_in_uri ( uri) ;
10973
11074 Self ( uri)
11175 }
11276
113- /// Wrap a [`Url`] that the caller asserts is already canonical.
114- pub fn from_canonical ( uri : Url ) -> Self {
115- Self ( uri)
116- }
117-
118- /// Convert a file path to a canonical [`UrlId`].
77+ /// Build a [`UrlId`] from a filesystem path.
11978 ///
120- /// Canonicalizes the path to resolve symlinks (e.g. `/var/folders` to
121- /// `/private/var/folders` on macOS) so the URI matches what R's
122- /// `normalizePath()` produces. Falls back to the original path if
123- /// canonicalization fails.
79+ /// Same lexical normalisation as [`Self::from_url`], no filesystem
80+ /// I/O. Errors only if `path` can't be expressed as a URL (e.g.
81+ /// not absolute on platforms that require it).
12482 pub fn from_file_path ( path : impl AsRef < std:: path:: Path > ) -> anyhow:: Result < Self > {
12583 let path = path. as_ref ( ) ;
12684 let url = Url :: from_file_path ( path)
12785 . map_err ( |( ) | anyhow:: anyhow!( "Failed to convert path to URL: {}" , path. display( ) ) ) ?;
12886 Ok ( Self :: from_url ( url) )
12987 }
13088
131- /// Parse a URI string into a canonical [`UrlId`].
89+ /// Parse a URI string into a [`UrlId`].
13290 pub fn parse ( s : & str ) -> Result < Self , url:: ParseError > {
13391 let url = Url :: parse ( s) ?;
13492 Ok ( Self :: from_url ( url) )
@@ -223,28 +181,33 @@ mod tests {
223181
224182 #[ test]
225183 #[ cfg( not( windows) ) ]
226- fn test_fallback_for_nonexistent_path ( ) {
227- // For paths that don't exist, canonicalization falls back to the
228- // original path so the URI is unchanged.
184+ fn test_nonexistent_path_unchanged ( ) {
185+ // Construction is lexical-only, so nonexistent paths flow
186+ // through unchanged.
229187 let uri = Url :: parse ( "file:///nonexistent/path/test.R" ) . unwrap ( ) ;
230188 let id = UrlId :: from_url ( uri. clone ( ) ) ;
231189 assert_eq ! ( * id. as_url( ) , uri) ;
232190 }
233191
234192 #[ test]
235193 #[ cfg( target_os = "macos" ) ]
236- fn test_resolves_tmp_symlink ( ) {
237- // On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` should
238- // resolve it so that URIs from different sources match.
194+ fn test_does_not_resolve_symlinks ( ) {
195+ // On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` does
196+ // *not* resolve it; same input bytes produce the same output
197+ // regardless of the symlink graph on disk. Bridging across
198+ // symlinked names is the job of secondary canonical indexes at
199+ // specific seams (e.g. the DAP breakpoint store), not of
200+ // construction.
239201 let dir = tempfile:: tempdir_in ( "/tmp" ) . unwrap ( ) ;
240202 let file = dir. path ( ) . join ( "test.R" ) ;
241203 std:: fs:: write ( & file, "" ) . unwrap ( ) ;
242204
243- let non_canonical = Url :: from_file_path ( & file) . unwrap ( ) ;
244- assert ! ( non_canonical . path( ) . starts_with( "/tmp/" ) ) ;
205+ let original = Url :: from_file_path ( & file) . unwrap ( ) ;
206+ assert ! ( original . path( ) . starts_with( "/tmp/" ) ) ;
245207
246- let id = UrlId :: from_url ( non_canonical) ;
247- assert ! ( id. as_url( ) . path( ) . starts_with( "/private/tmp/" ) ) ;
208+ let id = UrlId :: from_url ( original. clone ( ) ) ;
209+ assert_eq ! ( * id. as_url( ) , original) ;
210+ assert ! ( id. as_url( ) . path( ) . starts_with( "/tmp/" ) ) ;
248211 }
249212
250213 #[ test]
0 commit comments