Skip to content

Commit bb1e1a8

Browse files
authored
Add oak_scan crate with initial library scanning (#1243)
Branched from #1226. Progress towards #1212. Introduces `oak_scan` with a library scanner. The LSP layer will call the scanner (in the next PR) on startup with `.libPaths()`. Each libpath is set as a library root in `oak_db`. `oak_scan` is the only writer of Salsa inputs in `oak_db`, and is in charge of maintaining invariants, such as making sure backpointers are up-to-date. Library roots are static for now. Watching them for changes (package removed or installed) would require our own filesystem watcher. Since we don't need one for the workspace (we'll use LSP file events), the machinery would be a bit too heavy for what we would gain from it at this stage. For now the LSP can be restarted to register changes to libpaths.
2 parents 269fee1 + 76a82b0 commit bb1e1a8

29 files changed

Lines changed: 1647 additions & 50 deletions

AGENTS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ just test <test_name>
6767
just test -p ark
6868
```
6969

70+
### Placing Integration Tests
71+
72+
Put integration tests under `crates/<crate>/tests/integration/`, with a `main.rs` that declares each test file as a module (`mod library;`). Don't add loose `*.rs` files directly under `tests/`. Each top-level file there compiles as its own test binary, so consolidating them into one `integration` binary keeps build and link times down and saves on disk space.
73+
7074
### Running Clippy
7175

7276
```sh

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ oak_ide = { path = "crates/oak_ide" }
7979
oak_index_vec = { path = "crates/oak_index_vec" }
8080
oak_package_metadata = { path = "crates/oak_package_metadata" }
8181
oak_r_process = { path = "crates/oak_r_process" }
82+
oak_scan = { path = "crates/oak_scan" }
8283
oak_semantic = { path = "crates/oak_semantic" }
8384
oak_sources = { path = "crates/oak_sources" }
8485
once_cell = "1.21.4"

crates/oak_core/src/file.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
use std::path::Path;
22
use std::path::PathBuf;
33

4+
/// Does this path's name look like an R file (`.R` / `.r` extension)?
5+
///
6+
/// Pure name test, no I/O. It doesn't touch the filesystem, so it says
7+
/// nothing about whether the path exists or is a regular file. A
8+
/// directory named `foo.R` passes, and so does a path that isn't on disk
9+
/// at all. Callers that walk a real directory and want to skip such cases
10+
/// must check `path.is_file()` themselves.
411
pub fn is_r_file(path: &Path) -> bool {
512
path.extension()
613
.and_then(|e| e.to_str())

crates/oak_core/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ pub mod file;
33
pub mod identifier;
44
pub mod range;
55
pub mod syntax_ext;
6+
7+
pub use file::is_r_file;

crates/oak_db/src/db.rs

Lines changed: 99 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@ use rustc_hash::FxHashMap;
33

44
use crate::File;
55
use crate::LibraryRoots;
6+
use crate::LiveRoot;
67
use crate::OrphanRoot;
78
use crate::Package;
89
use crate::Root;
10+
use crate::StaleRoot;
911
use crate::WorkspaceRoots;
1012

1113
/// Concrete-input surface of the salsa database. Each impl
@@ -26,6 +28,10 @@ pub trait DbInputs: salsa::Database {
2628

2729
/// Files not yet anchored to any workspace or library root.
2830
fn orphan_root(&self) -> OrphanRoot;
31+
32+
/// Files and packages from roots that have been removed. Holding
33+
/// pen for entity reuse on re-add (see [`StaleRoot`]).
34+
fn stale_root(&self) -> StaleRoot;
2935
}
3036

3137
/// Salsa database trait used throughout `oak_db`. Tracked queries take `&dyn
@@ -53,6 +59,47 @@ pub trait Db: DbInputs {
5359
/// - Installed packages in an earlier root shadow later ones
5460
/// (mirroring `.libPaths()`).
5561
fn package_by_name(&self, name: &str) -> Option<Package>;
62+
63+
/// Resolve the live `Root` that contains `pkg`, if any.
64+
///
65+
/// Returns `None` when the package is only in [`StaleRoot`] (its live
66+
/// container was previously evicted).
67+
///
68+
/// **Nested roots.** Two roots can claim the same package when one is
69+
/// nested inside the other, e.g. the frontend opens both `/proj` and
70+
/// `/proj/sub-pkg` as workspace folders and both scans walk into
71+
/// `sub-pkg/DESCRIPTION`. Both scans hand the same `Package` entity to
72+
/// their respective root's `packages` vec; the longest-path root wins
73+
/// the ownership query here. The shorter root's vec still transiently
74+
/// lists the package, but it self-heals on its next scan since
75+
/// `set_packages` replaces the vec wholesale.
76+
fn root_by_package(&self, pkg: Package) -> Option<Root>;
77+
78+
/// All live roots in lookup-precedence order: workspace folders first, then
79+
/// library paths (mirroring R's `.libPaths()`), then the orphan bucket.
80+
/// Stale roots are not included. Salsa-cached and invalidated when one of
81+
/// `workspace_roots` / `library_roots` / `orphan_root` changes.
82+
fn live_roots(&self) -> &[LiveRoot];
83+
}
84+
85+
#[salsa::tracked(returns(ref))]
86+
pub(crate) fn live_roots_query(db: &dyn Db) -> Vec<LiveRoot> {
87+
let mut roots: Vec<LiveRoot> = db
88+
.workspace_roots()
89+
.roots(db)
90+
.iter()
91+
.map(|&r| LiveRoot::Workspace(r))
92+
.collect();
93+
94+
roots.extend(
95+
db.library_roots()
96+
.roots(db)
97+
.iter()
98+
.map(|&r| LiveRoot::Library(r)),
99+
);
100+
101+
roots.push(LiveRoot::Orphan(db.orphan_root()));
102+
roots
56103
}
57104

58105
/// Implementation of [`Db::file_by_url`]. Walks the per-root indices.
@@ -61,34 +108,67 @@ pub trait Db: DbInputs {
61108
/// entity), but every step is: each [`root_url_index`] call returns a
62109
/// cached map, so adding a file to one root invalidates only that
63110
/// root's index.
64-
pub fn file_by_url_query(db: &dyn Db, url: &UrlId) -> Option<File> {
65-
for root in db.workspace_roots().roots(db) {
66-
if let Some(&file) = root_url_index(db, *root).get(url) {
67-
return Some(file);
111+
pub(crate) fn file_by_url_query(db: &dyn Db, url: &UrlId) -> Option<File> {
112+
for &root in db.live_roots() {
113+
let hit = match root {
114+
LiveRoot::Workspace(r) | LiveRoot::Library(r) => {
115+
root_url_index(db, r).get(url).copied()
116+
},
117+
LiveRoot::Orphan(_) => orphan_url_index(db).get(url).copied(),
118+
};
119+
if hit.is_some() {
120+
return hit;
68121
}
69122
}
70-
for root in db.library_roots().roots(db) {
71-
if let Some(&file) = root_url_index(db, *root).get(url) {
72-
return Some(file);
73-
}
74-
}
75-
orphan_url_index(db).get(url).copied()
123+
None
76124
}
77125

78126
/// Implementation of [`Db::package_by_name`]. Same shape as
79-
/// [`file_by_url_query`].
80-
pub fn package_by_name_query(db: &dyn Db, name: &str) -> Option<Package> {
81-
for root in db.workspace_roots().roots(db) {
82-
if let Some(&pkg) = root_package_index(db, *root).get(name) {
83-
return Some(pkg);
127+
/// [`file_by_url_query`]; orphan has no packages, so it contributes
128+
/// nothing to the walk.
129+
pub(crate) fn package_by_name_query(db: &dyn Db, name: &str) -> Option<Package> {
130+
for &root in db.live_roots() {
131+
if let LiveRoot::Workspace(r) | LiveRoot::Library(r) = root {
132+
if let Some(&pkg) = root_package_index(db, r).get(name) {
133+
return Some(pkg);
134+
}
84135
}
85136
}
86-
for root in db.library_roots().roots(db) {
87-
if let Some(&pkg) = root_package_index(db, *root).get(name) {
88-
return Some(pkg);
137+
None
138+
}
139+
140+
/// Implementation of [`Db::root_by_package`]. Walks all live roots looking for
141+
/// `pkg` in their `packages` vec; if several hold it, the deepest root wins.
142+
pub(crate) fn root_by_package_query(db: &dyn Db, pkg: Package) -> Option<Root> {
143+
let mut best: Option<(Root, usize)> = None;
144+
for &root in db.live_roots() {
145+
let (LiveRoot::Workspace(r) | LiveRoot::Library(r)) = root else {
146+
continue;
147+
};
148+
if r.packages(db).contains(&pkg) {
149+
let depth = root_depth(db, r);
150+
if best.is_none_or(|(_, d)| depth > d) {
151+
best = Some((r, depth));
152+
}
89153
}
90154
}
91-
None
155+
best.map(|(root, _)| root)
156+
}
157+
158+
/// Number of path segments in a root's URL. Used as the tiebreaker by
159+
/// [`root_by_package_query`] when nested roots both claim the same package.
160+
///
161+
/// Counts URL segments directly rather than going through `to_file_path()`.
162+
/// `to_file_path()` errors on Windows for non-OS-style URLs (no drive
163+
/// letter), which would silently collapse all depths to zero and degrade
164+
/// the tiebreaker into "first found wins". Depth is a structural property
165+
/// of the URL hierarchy, so the URL itself is the right source.
166+
fn root_depth(db: &dyn Db, root: Root) -> usize {
167+
root.path(db)
168+
.as_url()
169+
.path_segments()
170+
.map(|s| s.filter(|seg| !seg.is_empty()).count())
171+
.unwrap_or(0)
92172
}
93173

94174
/// Per-root URL -> File index. Salsa caches one map per `Root`;

crates/oak_db/src/file.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ pub struct File {
5050
pub url: UrlId,
5151
#[returns(ref)]
5252
pub contents: String,
53+
/// **Placement invariant.** Call this setter only through
54+
/// `oak_scan`'s helpers; see the type-level doc above.
5355
pub package: Option<Package>,
5456
}
5557

@@ -144,17 +146,22 @@ impl File {
144146

145147
/// The root containing this file, if any.
146148
///
147-
/// If the file has a registered [`Package`], dispatches through
148-
/// `Package.root`. Otherwise falls back to a URL-prefix lookup
149-
/// against [`WorkspaceRoots`] (orphan files live under a workspace
150-
/// root or nowhere; library files always have a package).
149+
/// If the file has a registered [`Package`], asks the db which live
150+
/// root holds it via [`Db::root_by_package`]. Otherwise falls back to a
151+
/// URL-prefix lookup against [`WorkspaceRoots`] (orphan files live
152+
/// under a workspace root or nowhere). Library files normally have
153+
/// a package; the `root_by_package` branch covers them too.
154+
///
155+
/// Returns `None` if the file's package was evicted to
156+
/// [`StaleRoot`] (no live root contains it), or if the file is in
157+
/// orphan and the URL falls outside every workspace folder.
151158
///
152159
/// Callers that need to distinguish workspace from library roots
153160
/// inspect `root.kind(db)`.
154161
#[salsa::tracked]
155162
pub fn root(self, db: &dyn Db) -> Option<Root> {
156163
if let Some(pkg) = self.package(db) {
157-
return Some(pkg.root(db));
164+
return db.root_by_package(pkg);
158165
}
159166
root_by_url(db, self.url(db))
160167
}

crates/oak_db/src/inputs.rs

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ pub struct Root {
2525
/// Top-level R scripts directly under this root. Each entry is a
2626
/// `File` with `package(db) == None`. Always empty for `Library`
2727
/// roots.
28+
///
29+
/// **Placement invariant.** A file present here must have
30+
/// `package(db) == None`, and a file with `package == None` must
31+
/// live here, in another `Root.scripts`, or in
32+
/// `OrphanRoot.files`. Call this setter only through `oak_scan`'s
33+
/// helpers, which keep the back-pointer and the container in sync.
2834
#[returns(ref)]
2935
pub scripts: Vec<File>,
3036
/// Packages discovered under this root (workspace packages for
@@ -39,6 +45,31 @@ pub enum RootKind {
3945
Library,
4046
}
4147

48+
/// A live root container that participates in analysis lookups.
49+
///
50+
/// Bundles the three salsa inputs that hold files / packages the user is
51+
/// actively working with: workspace [`Root`]s, library [`Root`]s, and the
52+
/// [`OrphanRoot`] that catches unanchored buffers. Stale entities in
53+
/// [`StaleRoot`] aren't included -- they have separate access patterns
54+
/// (scanner upsert only, never analysis), so they stay as their own input.
55+
///
56+
/// `Db::live_roots()` yields these in lookup precedence (workspace first, then
57+
/// library, then orphan).
58+
///
59+
/// TODO(salsa): this enum carries the workspace-vs-library distinction in its
60+
/// variant tag, which makes the `Root.kind` field redundant. Drop the field
61+
/// once callers route through `LiveRoot` everywhere instead of reading
62+
/// `root.kind(db)` directly. Further out, splitting `Root` into separate
63+
/// `WorkspaceRoot` and `LibraryRoot` salsa inputs (each with the fields that
64+
/// actually apply to its kind: scripts only on workspace, etc.) frees up
65+
/// the name `Root` to be this enum.
66+
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
67+
pub enum LiveRoot {
68+
Workspace(Root),
69+
Library(Root),
70+
Orphan(OrphanRoot),
71+
}
72+
4273
/// The set of workspace folders the user has open.
4374
///
4475
/// Populated by the LSP layer from `initialize.workspaceFolders` and
@@ -88,8 +119,11 @@ impl LibraryRoots {
88119
/// Singleton: there is one `OrphanRoot` per concrete database, lazily
89120
/// initialised by the implementation. The `files` field is what
90121
/// [`crate::Db::file_by_url`] consults to find unanchored files.
91-
#[salsa::input]
122+
#[salsa::input(debug)]
92123
pub struct OrphanRoot {
124+
/// **Placement invariant.** Files here must have `package(db) ==
125+
/// None`. Call this setter only through `oak_scan`'s helpers,
126+
/// which keep the back-pointer and the container in sync.
93127
#[returns(ref)]
94128
pub files: Vec<File>,
95129
}
@@ -100,12 +134,63 @@ impl OrphanRoot {
100134
}
101135
}
102136

137+
/// Files and packages from workspace or library roots that were removed
138+
/// during a `set_*_paths` call.
139+
///
140+
/// Salsa doesn't garbage-collect entities, so dropping a `Root` doesn't
141+
/// free its `File` and `Package` entities. They'd just leak. Instead we
142+
/// move them here and consult this bucket on the next `set_*_paths`,
143+
/// reusing entities by URL when their paths come back. This matters for
144+
/// agent / multi-repo workflows where the same workspace folder gets
145+
/// added and removed repeatedly across a session.
146+
///
147+
/// **Not consulted by analysis.** `Db::file_by_url` and
148+
/// `Db::package_by_name` walk workspace / library roots and (for files)
149+
/// `OrphanRoot` only. Entities in `StaleRoot` are invisible to
150+
/// completions, goto-def, etc. — they correspond to folders the user
151+
/// has explicitly removed.
152+
///
153+
/// **Consulted by scanners.** The scanner's package-by-URL lookup walks
154+
/// live roots then falls back to stale. Scanner upsert helpers do the same
155+
/// for files. On reuse, the entity is moved out of stale back into a live
156+
/// container.
157+
///
158+
/// Singleton like `OrphanRoot`. The `files` and `packages` fields are
159+
/// independent: a stale file's `package` may still point at a stale
160+
/// package, and that's fine. Both are invisible to analysis until one
161+
/// of them gets pulled back into a live container.
162+
#[salsa::input]
163+
pub struct StaleRoot {
164+
#[returns(ref)]
165+
pub files: Vec<File>,
166+
#[returns(ref)]
167+
pub packages: Vec<Package>,
168+
}
169+
170+
impl StaleRoot {
171+
pub fn empty(db: &dyn Db) -> Self {
172+
Self::new(db, vec![], vec![])
173+
}
174+
}
175+
103176
#[salsa::input(debug)]
104177
pub struct Package {
105-
/// The `Root` this package belongs to. Workspace packages live under
106-
/// a [`RootKind::Workspace`] root, installed packages live under a
107-
/// [`RootKind::Library`] root. Read `root.kind(db)` to distinguish.
108-
pub root: Root,
178+
/// URL of the package's `DESCRIPTION` file. Stable identity across
179+
/// rescans and workspace / library churn: scanners look up an
180+
/// existing `Package` by this URL before creating a new one. Two
181+
/// packages with the same `Package:` name can coexist on disk and the
182+
/// URL distinguishes them.
183+
///
184+
/// The package's owning [`Root`] is not stored as a field. It is
185+
/// derived from live-graph containment via [`Db::root_by_package`]: a
186+
/// package belongs to whichever `Root.packages` currently holds it.
187+
/// Workspace-vs-library is then `root.kind(db)`.
188+
#[returns(ref)]
189+
pub description_url: UrlId,
190+
// TODO(salsa): Expose a tracked `name_interned(db) -> Name<'db>`
191+
// method so `db.package_by_name()` and other lookups key on the
192+
// interned id rather than the string. Can't store `Name<'db>` on
193+
// `Package` directly because salsa inputs are lifetime-free.
109194
#[returns(ref)]
110195
pub name: String,
111196
/// Installed-package version (from `DESCRIPTION`). `None` for
@@ -118,6 +203,11 @@ pub struct Package {
118203
/// Per-package granularity: adding or removing a file in one
119204
/// package doesn't invalidate tracked queries reading another
120205
/// package's files.
206+
///
207+
/// **Placement invariant.** A file present here must have
208+
/// `package(db) == Some(self)`. Call this setter only through
209+
/// `oak_scan`'s helpers, which keep the back-pointer and the
210+
/// container in sync.
121211
#[returns(ref)]
122212
pub files: Vec<File>,
123213
/// The basename ordering from `DESCRIPTION`'s `Collate` field, if

0 commit comments

Comments
 (0)