Skip to content

Commit e6309d9

Browse files
authored
(2) Extract out oak_source for CRAN downloads of R packages and R versions (#1299)
Part of #1234. Branched from #1298. This is similar to the old cache model for CRAN sources, so there is not a ton to review. The main structural differences are: for R packages, we have `get_cran()` and `insert_cran()`; for R versions, we have `get_r()` and `insert_r()`. Both of these now download and keep _the whole thing_ rather than trying to extract out only `R/`, which I think is going to pay off in the long run.
2 parents 38abcd7 + 16fbf36 commit e6309d9

10 files changed

Lines changed: 413 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ oak_package_metadata = { path = "crates/oak_package_metadata" }
8585
oak_r_process = { path = "crates/oak_r_process" }
8686
oak_scan = { path = "crates/oak_scan" }
8787
oak_semantic = { path = "crates/oak_semantic" }
88+
oak_source = { path = "crates/oak_source" }
8889
oak_sources = { path = "crates/oak_sources" }
8990
once_cell = "1.21.4"
9091
parking_lot = "0.12.5"

crates/oak_fs/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
pub mod file_lock;
2+
pub mod permissions;

crates/oak_fs/src/permissions.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
use std::path::Path;
2+
3+
/// Mark the file at `path` as read only
///
/// Reads the file's current permissions, switches the read-only flag on, and writes
/// the permissions back. Any I/O error from either step is returned to the caller.
pub fn set_readonly(path: &Path) -> std::io::Result<()> {
    let metadata = std::fs::metadata(path)?;

    let mut readonly = metadata.permissions();
    readonly.set_readonly(true);

    std::fs::set_permissions(path, readonly)
}

crates/oak_source/Cargo.toml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[package]
2+
name = "oak_source"
3+
version = "0.1.0"
4+
authors.workspace = true
5+
edition.workspace = true
6+
rust-version.workspace = true
7+
license.workspace = true
8+
9+
[dependencies]
10+
anyhow.workspace = true
11+
flate2.workspace = true
12+
log.workspace = true
13+
oak_cache.workspace = true
14+
oak_fs.workspace = true
15+
tar.workspace = true
16+
ureq.workspace = true
17+
18+
[dev-dependencies]
19+
tempfile.workspace = true
20+
21+
[lints]
22+
workspace = true

crates/oak_source/src/cran.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
use std::path::Path;
2+
3+
use crate::download::download_with_mirrors;
4+
use crate::download::Outcome;
5+
use crate::download::MIRRORS;
6+
use crate::extract;
7+
8+
/// Download an R package's source tarball from CRAN and unpack it into `dir`
9+
///
10+
/// The tarball's top-level `{name}/` directory is stripped, so the package's files land
11+
/// directly under `dir` (e.g. `dir/R/`, `dir/DESCRIPTION`).
12+
///
13+
/// Returns `Ok(false)` if the package isn't on CRAN, which we treat as "source
14+
/// unavailable" rather than an error.
15+
pub(crate) fn populate(name: &str, version: &str, dir: &Path) -> anyhow::Result<bool> {
16+
match download(name, version)? {
17+
Outcome::Success(response) => {
18+
extract::extract(response.into_body().into_reader(), dir)?;
19+
Ok(true)
20+
},
21+
Outcome::NotFound => Ok(false),
22+
}
23+
}
24+
25+
fn download(name: &str, version: &str) -> anyhow::Result<Outcome> {
26+
// Try released version
27+
let outcome = download_with_mirrors(&format!("src/contrib/{name}_{version}.tar.gz"), MIRRORS)?;
28+
29+
if matches!(outcome, Outcome::Success(_)) {
30+
return Ok(outcome);
31+
}
32+
33+
// Try archive
34+
download_with_mirrors(
35+
&format!("src/contrib/Archive/{name}/{name}_{version}.tar.gz"),
36+
MIRRORS,
37+
)
38+
}

crates/oak_source/src/download.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
use std::time::Duration;
2+
3+
const HTTP_NOT_FOUND: u16 = 404;
4+
const HTTP_SERVICE_UNAVAILABLE: u16 = 503;
5+
6+
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
7+
const GLOBAL_TIMEOUT: Duration = Duration::from_secs(40);
8+
9+
/// CRAN mirrors to try, in order
10+
pub(crate) const MIRRORS: &[&str] = &["https://cran.r-project.org", "https://cran.rstudio.com"];
11+
12+
/// Outcome of a CRAN mirror HTTP request
///
/// Only the two outcomes callers branch on are represented here. Every other failure
/// is reported through the `Err` side of the `anyhow::Result` wrapping this type.
pub(crate) enum Outcome {
    /// The request succeeded. Carries the response, whose body has not been read yet.
    Success(ureq::http::Response<ureq::Body>),
    /// The mirror answered with HTTP 404 for the requested file
    NotFound,
}
17+
18+
pub(crate) fn download_with_mirrors(suffix: &str, mirrors: &[&str]) -> anyhow::Result<Outcome> {
19+
if mirrors.is_empty() {
20+
panic!("`mirrors` can't be empty.");
21+
}
22+
23+
let mut last_error = None;
24+
25+
for mirror in mirrors {
26+
let url = format!("{mirror}/{suffix}");
27+
28+
let request = ureq::get(&url)
29+
.config()
30+
.timeout_connect(Some(CONNECT_TIMEOUT))
31+
.timeout_global(Some(GLOBAL_TIMEOUT))
32+
.build();
33+
34+
match request.call() {
35+
Ok(response) => return Ok(Outcome::Success(response)),
36+
37+
// Known to be not there, don't try any other mirrors
38+
Err(ureq::Error::StatusCode(HTTP_NOT_FOUND)) => return Ok(Outcome::NotFound),
39+
40+
// Try next mirror, this one is temporarily unavailable
41+
Err(ureq::Error::StatusCode(HTTP_SERVICE_UNAVAILABLE)) => {
42+
last_error = Some(Err(ureq::Error::StatusCode(HTTP_SERVICE_UNAVAILABLE).into()));
43+
continue;
44+
},
45+
46+
// Try next mirror, this one timed out
47+
Err(ureq::Error::Timeout(timeout)) => {
48+
last_error = Some(Err(ureq::Error::Timeout(timeout).into()));
49+
continue;
50+
},
51+
52+
// Some unhandled error occurred, bail
53+
Err(err) => return Err(err.into()),
54+
};
55+
}
56+
57+
// Every mirror returned `HTTP_SERVICE_UNAVAILABLE` or timed out
58+
last_error.expect("`mirrors` was non-empty and we always set `last_error`")
59+
}

crates/oak_source/src/extract.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use std::collections::HashSet;
2+
use std::io::Read;
3+
use std::path::Component;
4+
use std::path::Path;
5+
use std::path::PathBuf;
6+
7+
use flate2::read::GzDecoder;
8+
9+
/// Unpack a gzipped tarball into `dir`, dropping its top-level directory and marking
10+
/// every file read only
11+
///
12+
/// CRAN package tarballs and R source tarballs both wrap their content in a single
13+
/// top-level directory (`{package}/` or `R-{version}/`). We strip it so the content lands
14+
/// directly under `dir`.
15+
///
16+
/// Files are marked as read only to discourage accidental edits.
17+
pub(crate) fn extract(reader: impl Read, dir: &Path) -> anyhow::Result<()> {
18+
let gz = GzDecoder::new(reader);
19+
let mut archive = tar::Archive::new(gz);
20+
21+
// Parent directories we've already created
22+
let mut created: HashSet<PathBuf> = HashSet::new();
23+
24+
for entry in archive.entries()? {
25+
let mut entry = entry?;
26+
let is_file = entry.header().entry_type().is_file();
27+
28+
let path = entry.path()?.into_owned();
29+
let Some(relative) = strip_top_level(&path) else {
30+
// The top-level directory entry itself, or an unsafe path, nothing to unpack
31+
continue;
32+
};
33+
34+
let destination = dir.join(relative);
35+
36+
// We must create parent directories before unpacking into them. We remember ones
37+
// we've already created to avoid thousands of redundant `create_dir_all()` calls.
38+
if let Some(parent) = destination.parent() {
39+
if !created.contains(parent) {
40+
std::fs::create_dir_all(parent)?;
41+
created.insert(parent.to_path_buf());
42+
}
43+
}
44+
45+
entry.unpack(&destination)?;
46+
47+
if is_file {
48+
oak_fs::permissions::set_readonly(&destination)?;
49+
}
50+
}
51+
52+
Ok(())
53+
}
54+
55+
/// Strip the single top-level directory from a tarball entry path
///
/// A leading `./` is tolerated and ignored, so `./pkg/R/a.R` and `pkg/R/a.R` both yield
/// `R/a.R`.
///
/// Returns `None` for the top-level directory entry itself, or for any unsafe path
/// (absolute, or containing `..`) that could escape the destination. This includes paths
/// whose first component is not a plain name, such as `/pkg/R/a.R` or `../pkg/R/a.R`.
fn strip_top_level(path: &Path) -> Option<&Path> {
    let mut components = path.components();

    // Consume the top-level directory. It must be a plain name, otherwise we would be
    // stripping a root, a prefix, or a `..` rather than the wrapping directory.
    loop {
        match components.next()? {
            // `Path::components()` only reports `.` at the very start of a path
            Component::CurDir => continue,
            Component::Normal(_) => break,
            _ => return None,
        }
    }

    let rest = components.as_path();

    if rest.as_os_str().is_empty() {
        // The top-level directory entry itself
        return None;
    }

    if !rest.components().all(|c| matches!(c, Component::Normal(_))) {
        // Something would be strange here!
        return None;
    }

    Some(rest)
}

0 commit comments

Comments
 (0)