Skip to content

Commit f0ae220

Browse files
authored
Shuffle project files to prepare for crates.io release (#405)
The way the project is laid out today makes it unsuitable for publishing to crates.io, mainly because the main crate has git dependencies. This PR just shuffles files and modules around so that we can have a clean main crate that can be published to crates.io.
1 parent 86e5a38 commit f0ae220

24 files changed

Lines changed: 71 additions & 99 deletions

Cargo.lock

Lines changed: 7 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,31 +45,21 @@ crossbeam-queue = "0.3"
4545
sysinfo = { version = "0.30", optional = true }
4646
sketches-ddsketch = { version = "0.3", features = ["use_serde"] }
4747
bincode = "1"
48+
tonic-prost = "0.14.2"
4849

4950
# integration_tests deps
5051
insta = { version = "1.46.0", features = ["filters"], optional = true }
51-
tpchgen = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "438e9c2dbc25b2fff82c0efc08b3f13b5707874f", optional = true }
52-
tpchgen-arrow = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "438e9c2dbc25b2fff82c0efc08b3f13b5707874f", optional = true }
5352
parquet = { version = "58", optional = true }
5453
arrow = { version = "58", optional = true, features = ["test_utils"] }
5554
hyper-util = { version = "0.1.16", optional = true }
56-
pretty_assertions = { version = "1.4", optional = true }
57-
reqwest = { version = "0.12", optional = true }
58-
zip = { version = "6.0", optional = true }
59-
tonic-prost = "0.14.2"
6055

6156
[features]
6257
avro = ["datafusion/avro"]
6358
integration = [
6459
"insta",
65-
"tpchgen",
66-
"tpchgen-arrow",
6760
"parquet",
6861
"arrow",
6962
"hyper-util",
70-
"pretty_assertions",
71-
"reqwest",
72-
"zip",
7363
]
7464

7565
system-metrics = ["sysinfo"]
@@ -80,15 +70,12 @@ clickbench = ["integration"]
8070
sysinfo = ["dep:sysinfo"]
8171

8272
[dev-dependencies]
73+
datafusion-distributed-benchmarks = { path = "benchmarks" }
8374
structopt = "0.3"
8475
insta = { version = "1.46.0", features = ["filters"] }
85-
tpchgen = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "438e9c2dbc25b2fff82c0efc08b3f13b5707874f" }
86-
tpchgen-arrow = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "438e9c2dbc25b2fff82c0efc08b3f13b5707874f" }
8776
parquet = "58"
8877
arrow = { version = "58", features = ["test_utils"] }
8978
tokio-stream = { version = "0.1.17", features = ["sync"] }
9079
hyper-util = "0.1.16"
9180
pretty_assertions = "1.4"
92-
reqwest = "0.12"
93-
zip = "6.0"
9481
test-case = "3.3.1"

benchmarks/Cargo.toml

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ default-run = "dfbench"
66

77
[dependencies]
88
datafusion = { workspace = true }
9-
datafusion-proto = { workspace = true }
109
datafusion-distributed = { path = "..", features = [
1110
"integration",
1211
"system-metrics",
@@ -19,19 +18,19 @@ serde = "1.0.219"
1918
serde_json = "1.0.141"
2019
env_logger = "0.11.8"
2120
async-trait = "0.1.89"
22-
chrono = "0.4.42"
2321
futures = "0.3.31"
24-
dashmap = "6.0.1"
25-
prost = "0.14.1"
2622
url = "2.5.7"
27-
arrow-flight = "58"
23+
arrow = { version = "58", features = ["test_utils"] }
2824
tonic = { version = "0.14.1", features = ["transport"] }
25+
tpchgen = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "438e9c2dbc25b2fff82c0efc08b3f13b5707874f" }
26+
tpchgen-arrow = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "438e9c2dbc25b2fff82c0efc08b3f13b5707874f" }
27+
reqwest = "0.12"
28+
zip = "6.0"
2929
axum = "0.7"
3030
object_store = { version = "0.13", features = ["aws"] }
3131
aws-config = "1"
3232
aws-sdk-ec2 = "1"
33-
openssl = { version = "0.10", features = ["vendored"] }
34-
clap = "4.5"
33+
openssl = { version = "0.10", features = ["vendored"] } # Keep this. Necessary for the remote benchmarks worker.
3534
mimalloc = "0.1"
3635

3736
[dev-dependencies]

benchmarks/cdk/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
!jest.config.js
33
*.d.ts
44
node_modules
5+
.cdk-outputs.json
56

67
# CDK asset staging directory
78
.cdk.staging
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::test_utils::benchmarks_common;
1+
use super::common;
22
use datafusion::common::DataFusionError;
33
use std::fs;
44
use std::io::Write;
@@ -10,11 +10,11 @@ const URL: &str =
1010
"https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet";
1111

1212
pub fn get_queries() -> Vec<String> {
13-
benchmarks_common::get_queries("testdata/clickbench/queries")
13+
common::get_queries("testdata/clickbench/queries")
1414
}
1515

1616
pub fn get_query(id: &str) -> Result<String, DataFusionError> {
17-
benchmarks_common::get_query("testdata/clickbench/queries", id)
17+
common::get_query("testdata/clickbench/queries", id)
1818
}
1919

2020
/// Downloads the datafusion-benchmarks repository as a zip file
Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,15 @@ use datafusion::prelude::{ParquetReadOptions, SessionContext};
33
use std::fs;
44
use std::path::Path;
55

6-
pub(crate) fn get_queries(path: &str) -> Vec<String> {
7-
let queries_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join(path);
6+
/// Returns the workspace root directory (parent of the benchmarks crate).
7+
fn workspace_root() -> &'static Path {
8+
Path::new(env!("CARGO_MANIFEST_DIR"))
9+
.parent()
10+
.expect("benchmarks crate should be inside a workspace")
11+
}
12+
13+
pub fn get_queries(path: &str) -> Vec<String> {
14+
let queries_dir = workspace_root().join(path);
815
let mut result = vec![];
916
for file in queries_dir.read_dir().unwrap() {
1017
let file = file.unwrap();
@@ -36,8 +43,8 @@ pub(crate) fn get_queries(path: &str) -> Vec<String> {
3643
result
3744
}
3845

39-
pub(crate) fn get_query(path: &str, id: &str) -> Result<String, DataFusionError> {
40-
let queries_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join(path);
46+
pub fn get_query(path: &str, id: &str) -> Result<String, DataFusionError> {
47+
let queries_dir = workspace_root().join(path);
4148

4249
if !queries_dir.exists() {
4350
return internal_err!(

benchmarks/src/datasets/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pub mod clickbench;
2+
mod common;
3+
pub mod tpcds;
4+
pub mod tpch;
5+
6+
pub use common::register_tables;
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::test_utils::benchmarks_common;
1+
use super::common;
22
use arrow::datatypes::{DataType, Field};
33
use datafusion::common::internal_err;
44
use datafusion::error::DataFusionError;
@@ -18,11 +18,11 @@ use std::sync::Arc;
1818
const URL: &str = "https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip";
1919

2020
pub fn get_queries() -> Vec<String> {
21-
benchmarks_common::get_queries("testdata/tpcds/queries")
21+
common::get_queries("testdata/tpcds/queries")
2222
}
2323

2424
pub fn get_query(id: &str) -> Result<String, DataFusionError> {
25-
benchmarks_common::get_query("testdata/tpcds/queries", id)
25+
common::get_query("testdata/tpcds/queries", id)
2626
}
2727

2828
/// Downloads the datafusion-benchmarks repository as a zip file
Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
use crate::test_utils::benchmarks_common;
1+
use super::common;
22
use arrow::record_batch::RecordBatch;
33
use datafusion::error::DataFusionError;
44
use parquet::{arrow::arrow_writer::ArrowWriter, file::properties::WriterProperties};
55
use std::fs;
6+
use std::path::Path;
67
use tpchgen::generators::{
78
CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, PartGenerator,
89
PartSuppGenerator, RegionGenerator, SupplierGenerator,
@@ -13,19 +14,17 @@ use tpchgen_arrow::{
1314
};
1415

1516
pub fn get_queries() -> Vec<String> {
16-
benchmarks_common::get_queries("testdata/tpch/queries")
17+
common::get_queries("testdata/tpch/queries")
1718
}
1819

1920
pub fn get_query(id: &str) -> Result<String, DataFusionError> {
20-
benchmarks_common::get_query("testdata/tpch/queries", id)
21+
common::get_query("testdata/tpch/queries", id)
2122
}
2223

23-
// generate_table creates a parquet file in the data directory from an arrow RecordBatch row
24-
// source.
2524
fn generate_table<A>(
2625
mut data_source: A,
2726
table_name: &str,
28-
data_dir: &std::path::Path,
27+
data_dir: &Path,
2928
) -> Result<(), Box<dyn std::error::Error>>
3029
where
3130
A: Iterator<Item = RecordBatch>,
@@ -49,18 +48,16 @@ where
4948
Ok(())
5049
}
5150

52-
// generate_tpch_data generates all TPC-H tables in the specified data directory.
53-
pub fn generate_tpch_data(data_dir: &std::path::Path, sf: f64, parts: i32) {
51+
/// Generates all TPC-H tables as parquet files in the specified data directory.
52+
pub fn generate_tpch_data(data_dir: &Path, sf: f64, parts: i32) {
5453
fs::create_dir_all(data_dir).expect("Failed to create data directory");
5554

5655
macro_rules! must_generate_tpch_table {
5756
($generator:ident, $arrow:ident, $name:literal) => {
5857
let data_dir = data_dir.join($name);
5958
fs::create_dir_all(data_dir.clone()).expect("Failed to create data directory");
60-
// create three partitions for the table
6159
(1..=parts).for_each(|part| {
6260
generate_table(
63-
// TODO: Consider adjusting the partitions and batch sizes.
6461
$arrow::new($generator::new(sf, part, parts)).with_batch_size(1000),
6562
&format!("{part}"),
6663
&data_dir,

benchmarks/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pub mod datasets;

0 commit comments

Comments
 (0)